27#elif defined(__SSE4_1__)
29#elif defined(__SSE2__)
39#ifdef HELIOS_CUDA_AVAILABLE
40#include <cuda_runtime.h>
43using namespace helios;
45#ifdef HELIOS_CUDA_AVAILABLE
56void launchBVHTraversal(
void *h_nodes,
int node_count,
unsigned int *h_primitive_indices,
int primitive_count,
float *h_primitive_aabb_min,
float *h_primitive_aabb_max,
float *h_query_aabb_min,
float *h_query_aabb_max,
int num_queries,
57 unsigned int *h_results,
unsigned int *h_result_counts,
int max_results_per_query);
58bool launchVoxelRayPathLengths(
int num_rays,
float *h_ray_origins,
float *h_ray_directions,
float grid_center_x,
float grid_center_y,
float grid_center_z,
float grid_size_x,
float grid_size_y,
float grid_size_z,
int grid_divisions_x,
59 int grid_divisions_y,
int grid_divisions_z,
int primitive_count,
int *h_voxel_ray_counts,
float *h_voxel_path_lengths,
int *h_voxel_transmitted,
int *h_voxel_hit_before,
int *h_voxel_hit_after,
int *h_voxel_hit_inside);
61void launchWarpEfficientBVH(
void *h_bvh_soa_gpu,
unsigned int *h_primitive_indices,
int primitive_count,
float *h_primitive_aabb_min,
float *h_primitive_aabb_max,
float *h_ray_origins,
float *h_ray_directions,
float *h_ray_max_distances,
62 int num_rays,
unsigned int *h_results,
unsigned int *h_result_counts,
int max_results_per_ray);
64void launchRayPrimitiveIntersection(
void *h_bvh_nodes,
int node_count,
unsigned int *h_primitive_indices,
int primitive_count,
int *h_primitive_types, float3 *h_primitive_vertices,
unsigned int *h_vertex_offsets,
int total_vertex_count,
65 float *h_ray_origins,
float *h_ray_directions,
float *h_ray_max_distances,
int num_rays,
float *h_hit_distances,
unsigned int *h_hit_primitive_ids,
unsigned int *h_hit_counts,
bool find_closest_hit);
70 return make_float3(v.
x, v.
y, v.
z);
84 vec3 ray_direction = direction;
88 ray_direction = ray_direction / ray_direction.
magnitude();
95 if (!bvh_nodes.empty()) {
97 if (primitive_cache.empty()) {
98 buildPrimitiveCache();
102 RayQuery query(origin, ray_direction, max_distance, target_UUIDs);
103 return castRayBVHTraversal(query);
108 std::vector<uint> search_primitives;
109 if (target_UUIDs.empty()) {
111 if (!primitive_cache.empty()) {
113 search_primitives.reserve(primitive_cache.size());
114 for (
const auto &cached_pair: primitive_cache) {
115 search_primitives.push_back(cached_pair.first);
122 search_primitives = target_UUIDs;
126 float nearest_distance = std::numeric_limits<float>::max();
127 if (max_distance > 0) {
128 nearest_distance = max_distance;
131 uint hit_primitive = 0;
132 bool found_intersection =
false;
135 for (
uint candidate_uuid: search_primitives) {
136 float intersection_distance;
140 if (!primitive_cache.empty()) {
141 HitResult primitive_result = intersectPrimitiveThreadSafe(origin, ray_direction, candidate_uuid, max_distance);
142 if (primitive_result.
hit) {
144 intersection_distance = primitive_result.
distance;
151 hit = rayPrimitiveIntersection(origin, ray_direction, candidate_uuid, intersection_distance);
156 if (intersection_distance > 1e-6f &&
157 intersection_distance < nearest_distance) {
159 nearest_distance = intersection_distance;
160 hit_primitive = candidate_uuid;
161 found_intersection =
true;
167 if (found_intersection) {
180 vec3 v0 = vertices[0];
181 vec3 v1 = vertices[1];
182 vec3 v2 = vertices[2];
183 vec3 edge1 = v1 - v0;
184 vec3 edge2 = v2 - v0;
189 vec3 v0 = vertices[0];
190 vec3 v1 = vertices[1];
191 vec3 v2 = vertices[2];
192 vec3 edge1 = v1 - v0;
193 vec3 edge2 = v2 - v0;
200 }
catch (
const std::exception &e) {
211 std::vector<HitResult> results;
212 results.reserve(ray_queries.size());
221 const size_t GPU_BATCH_THRESHOLD = 1000000;
222 const size_t MIN_PRIMITIVES_FOR_GPU = 500;
224#ifdef HELIOS_CUDA_AVAILABLE
225 bool use_gpu = gpu_acceleration_enabled && ray_queries.size() >= GPU_BATCH_THRESHOLD && d_bvh_nodes !=
nullptr && !primitive_indices.empty() && primitive_indices.size() >= MIN_PRIMITIVES_FOR_GPU;
229 castRaysGPU(ray_queries, results, local_stats);
232 castRaysCPU(ray_queries, results, local_stats);
236 castRaysCPU(ray_queries, results, local_stats);
240 if (stats !=
nullptr) {
241 *stats = local_stats;
247void CollisionDetection::castRaysCPU(
const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
252#ifdef HELIOS_CUDA_AVAILABLE
253void CollisionDetection::castRaysGPU(
const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
255 results = castRaysGPU(ray_queries, stats);
260 const std::vector<RayQuery> &ray_queries) {
263 std::vector<std::vector<std::vector<std::vector<HitResult>>>> grid_results;
264 grid_results.resize(grid_divisions.
x);
265 for (
int i = 0; i < grid_divisions.
x; i++) {
266 grid_results[i].resize(grid_divisions.
y);
267 for (
int j = 0; j < grid_divisions.
y; j++) {
268 grid_results[i][j].resize(grid_divisions.
z);
273 vec3 voxel_size =
make_vec3(grid_size.
x /
float(grid_divisions.
x), grid_size.
y /
float(grid_divisions.
y), grid_size.
z /
float(grid_divisions.
z));
276 for (
const auto &query: ray_queries) {
279 if (hit_result.
hit) {
283 int voxel_i = int(relative_pos.
x / voxel_size.
x);
284 int voxel_j = int(relative_pos.
y / voxel_size.
y);
285 int voxel_k = int(relative_pos.
z / voxel_size.
z);
288 if (voxel_i >= 0 && voxel_i < grid_divisions.x && voxel_j >= 0 && voxel_j < grid_divisions.y && voxel_k >= 0 && voxel_k < grid_divisions.
z) {
290 grid_results[voxel_i][voxel_j][voxel_k].push_back(hit_result);
300 if (ray_directions.empty()) {
302 std::cout <<
"WARNING (CollisionDetection::calculateVoxelPathLengths): No rays provided" << std::endl;
304 return std::vector<std::vector<HitResult>>();
307 if (voxel_centers.size() != voxel_sizes.size()) {
308 helios_runtime_error(
"ERROR (CollisionDetection::calculateVoxelPathLengths): voxel_centers and voxel_sizes vectors must have same size");
311 if (voxel_centers.empty()) {
313 std::cout <<
"WARNING (CollisionDetection::calculateVoxelPathLengths): No voxels provided" << std::endl;
315 return std::vector<std::vector<HitResult>>();
318 const size_t num_rays = ray_directions.size();
319 const size_t num_voxels = voxel_centers.size();
322 std::cout <<
"Calculating voxel path lengths for " << num_rays <<
" rays through " << num_voxels <<
" voxels..." << std::endl;
326 std::vector<std::vector<HitResult>> result(num_voxels);
329#pragma omp parallel for schedule(dynamic, 1000)
330 for (
int ray_idx = 0; ray_idx < static_cast<int>(num_rays); ++ray_idx) {
331 const vec3 &ray_direction = ray_directions[ray_idx];
334 for (
size_t voxel_idx = 0; voxel_idx < num_voxels; ++voxel_idx) {
335 const vec3 &voxel_center = voxel_centers[voxel_idx];
336 const vec3 &voxel_size = voxel_sizes[voxel_idx];
339 const vec3 half_size = voxel_size * 0.5f;
340 const vec3 voxel_min = voxel_center - half_size;
341 const vec3 voxel_max = voxel_center + half_size;
345 if (rayAABBIntersect(scan_origin, ray_direction, voxel_min, voxel_max, t_min, t_max)) {
347 const float path_length = t_max - std::max(0.0f, t_min);
349 if (path_length > 1e-6f) {
352 hit_result.
hit =
false;
362 result[voxel_idx].push_back(hit_result);
370 size_t total_intersections = 0;
371 for (
size_t i = 0; i < num_voxels; ++i) {
372 total_intersections += result[i].size();
374 std::cout <<
"Completed voxel path length calculations. Total ray-voxel intersections: " << total_intersections << std::endl;
381 std::vector<HitResult> &hit_results) {
384 hit_results.reserve(ray_origins.size());
386 if (ray_origins.size() != ray_directions.size()) {
387 helios_runtime_error(
"ERROR (CollisionDetection::calculateRayPathLengthsDetailed): ray_origins and ray_directions must have the same size");
395 for (
size_t i = 0; i < ray_origins.size(); i++) {
396 RayQuery query(ray_origins[i], ray_directions[i]);
398 hit_results.push_back(result);
407 if (mode == bvh_optimization_mode) {
412 bvh_optimization_mode = mode;
415 if (old_mode != mode && !bvh_nodes.empty()) {
417 std::cout <<
"CollisionDetection: Converting BVH from mode " <<
static_cast<int>(old_mode) <<
" to mode " <<
static_cast<int>(mode) << std::endl;
421 ensureOptimizedBVH();
424 auto memory_stats = getBVHMemoryUsage();
425 std::cout <<
"CollisionDetection: Memory usage - SoA: " << memory_stats.soa_memory_bytes <<
" bytes, Quantized: " << memory_stats.quantized_memory_bytes <<
" bytes (" << memory_stats.quantized_reduction_percent <<
"% reduction)"
432 return bvh_optimization_mode;
435void CollisionDetection::convertBVHLayout(BVHOptimizationMode from_mode, BVHOptimizationMode to_mode) {
442 if (ray_queries.empty()) {
448 ensureOptimizedBVH();
451 if (primitive_cache.empty()) {
452 buildPrimitiveCache();
456 std::vector<HitResult> results;
457 results.reserve(ray_queries.size());
459 auto start_time = std::chrono::high_resolution_clock::now();
462 switch (bvh_optimization_mode) {
464 results = castRaysSoA(ray_queries, local_stats);
468 auto end_time = std::chrono::high_resolution_clock::now();
469 auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
472 *stats = local_stats;
479 if (ray_stream.
packets.empty()) {
486 for (
auto &packet: ray_stream.
packets) {
488 auto queries = packet.toRayQueries();
494 if (results.size() != queries.size()) {
500 packet.results = std::move(results);
510 *stats = combined_stats;
513 if (printmessages && success) {
514 std::cout <<
"CollisionDetection: Processed " << ray_stream.
packets.size() <<
" ray packets (" << ray_stream.
total_rays <<
" total rays)" << std::endl;
524 MemoryUsageStats stats;
527 stats.soa_memory_bytes = bvh_nodes_soa.getMemoryUsage();
530 stats.quantized_memory_bytes = 0;
531 stats.quantized_reduction_percent = 0.0f;
536std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysSoA(
const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
537 std::vector<HitResult> results;
538 results.reserve(ray_queries.size());
540 if (bvh_nodes_soa.node_count == 0) {
542 results.resize(ray_queries.size());
543 stats.total_rays_cast = ray_queries.size();
544 stats.total_hits = 0;
545 stats.bvh_nodes_visited = 0;
546 stats.average_ray_distance = 0.0f;
550 stats.total_rays_cast = ray_queries.size();
551 stats.total_hits = 0;
552 stats.average_ray_distance = 0.0;
555 results.resize(ray_queries.size());
560 RayTracingStats local_stats = {};
562#pragma omp for schedule(guided, 32)
563 for (
int i = 0; i < static_cast<int>(ray_queries.size()); ++i) {
564 HitResult result = castRaySoATraversal(ray_queries[i], local_stats);
568 local_stats.total_hits++;
569 local_stats.average_ray_distance += result.distance;
575 stats.total_hits += local_stats.total_hits;
578 stats.average_ray_distance += local_stats.average_ray_distance;
581 stats.bvh_nodes_visited += local_stats.bvh_nodes_visited;
584 if (stats.total_hits > 0) {
585 stats.average_ray_distance /= stats.total_hits;
595using namespace helios;
600 if (bvh_nodes_soa.node_count == 0 || bvh_nodes_soa.aabb_mins.empty()) {
605 std::stack<size_t> node_stack;
608 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::
max();
611 const size_t MAX_TRAVERSAL_STEPS = 10000000;
612 size_t traversal_steps = 0;
614 while (!node_stack.empty()) {
616 if (++traversal_steps > MAX_TRAVERSAL_STEPS) {
618 std::cout <<
"WARNING: BVH traversal exceeded maximum steps, terminating ray" << std::endl;
623 size_t node_idx = node_stack.top();
625 stats.bvh_nodes_visited++;
628 if (node_idx >= bvh_nodes_soa.node_count) {
630 std::cout <<
"WARNING: Invalid node index " << node_idx <<
" >= " << bvh_nodes_soa.node_count << std::endl;
636 if (node_idx + 1 < bvh_nodes_soa.node_count) {
638 __builtin_prefetch(&bvh_nodes_soa.aabb_mins[node_idx + 1], 0, 1);
639 __builtin_prefetch(&bvh_nodes_soa.aabb_maxs[node_idx + 1], 0, 1);
640#elif defined(_MSC_VER)
641 _mm_prefetch(
reinterpret_cast<const char *
>(&bvh_nodes_soa.aabb_mins[node_idx + 1]), _MM_HINT_T1);
642 _mm_prefetch(
reinterpret_cast<const char *
>(&bvh_nodes_soa.aabb_maxs[node_idx + 1]), _MM_HINT_T1);
647 if (!aabbIntersectSoA(query.origin, query.direction, closest_distance, node_idx)) {
652 if (bvh_nodes_soa.is_leaf_flags[node_idx]) {
654 uint32_t primitive_start = bvh_nodes_soa.primitive_starts[node_idx];
655 uint32_t primitive_count = bvh_nodes_soa.primitive_counts[node_idx];
657 for (uint32_t i = 0; i < primitive_count; ++i) {
658 uint primitive_id = primitive_indices[primitive_start + i];
661 if (!query.target_UUIDs.empty()) {
663 for (
uint target: query.target_UUIDs) {
664 if (primitive_id == target) {
674 HitResult primitive_result = intersectPrimitiveThreadSafe(query.origin, query.direction, primitive_id, closest_distance);
675 if (primitive_result.hit && primitive_result.distance < closest_distance) {
676 result = primitive_result;
677 closest_distance = primitive_result.distance;
682 uint32_t left_child = bvh_nodes_soa.left_children[node_idx];
683 uint32_t right_child = bvh_nodes_soa.right_children[node_idx];
685 if (left_child != 0xFFFFFFFF && left_child < bvh_nodes_soa.node_count) {
686 node_stack.push(left_child);
688 if (right_child != 0xFFFFFFFF && right_child < bvh_nodes_soa.node_count) {
689 node_stack.push(right_child);
698bool CollisionDetection::aabbIntersectSoA(
const helios::vec3 &ray_origin,
const helios::vec3 &ray_direction,
float max_distance,
size_t node_index)
const {
700 const vec3 &aabb_min = bvh_nodes_soa.aabb_mins[node_index];
701 const vec3 &aabb_max = bvh_nodes_soa.aabb_maxs[node_index];
705 __m128 ray_orig = _mm_set_ps(0.0f, ray_origin.
z, ray_origin.
y, ray_origin.
x);
706 __m128 ray_dir = _mm_set_ps(0.0f, ray_direction.
z, ray_direction.
y, ray_direction.
x);
707 __m128 aabb_min_vec = _mm_set_ps(0.0f, aabb_min.
z, aabb_min.
y, aabb_min.
x);
708 __m128 aabb_max_vec = _mm_set_ps(0.0f, aabb_max.
z, aabb_max.
y, aabb_max.
x);
711 __m128 inv_dir = _mm_div_ps(_mm_set1_ps(1.0f), ray_dir);
714 __m128 t1 = _mm_mul_ps(_mm_sub_ps(aabb_min_vec, ray_orig), inv_dir);
715 __m128 t2 = _mm_mul_ps(_mm_sub_ps(aabb_max_vec, ray_orig), inv_dir);
718 __m128 tmin = _mm_min_ps(t1, t2);
719 __m128 tmax = _mm_max_ps(t1, t2);
722 float tmin_vals[4], tmax_vals[4];
723 _mm_store_ps(tmin_vals, tmin);
724 _mm_store_ps(tmax_vals, tmax);
726 float t_near = std::max({tmin_vals[0], tmin_vals[1], tmin_vals[2], 0.0f});
727 float t_far = std::min({tmax_vals[0], tmax_vals[1], tmax_vals[2], max_distance});
729 return t_near <= t_far;
732 float inv_dir_x = 1.0f / ray_direction.
x;
733 float inv_dir_y = 1.0f / ray_direction.
y;
734 float inv_dir_z = 1.0f / ray_direction.
z;
736 float t1_x = (aabb_min.
x - ray_origin.
x) * inv_dir_x;
737 float t2_x = (aabb_max.
x - ray_origin.
x) * inv_dir_x;
738 float t1_y = (aabb_min.
y - ray_origin.
y) * inv_dir_y;
739 float t2_y = (aabb_max.
y - ray_origin.
y) * inv_dir_y;
740 float t1_z = (aabb_min.
z - ray_origin.
z) * inv_dir_z;
741 float t2_z = (aabb_max.
z - ray_origin.
z) * inv_dir_z;
743 float tmin_x = std::min(t1_x, t2_x);
744 float tmax_x = std::max(t1_x, t2_x);
745 float tmin_y = std::min(t1_y, t2_y);
746 float tmax_y = std::max(t1_y, t2_y);
747 float tmin_z = std::min(t1_z, t2_z);
748 float tmax_z = std::max(t1_z, t2_z);
750 float t_near = std::max({tmin_x, tmin_y, tmin_z, 0.0f});
751 float t_far = std::min({tmax_x, tmax_y, tmax_z, max_distance});
753 return t_near <= t_far;
762 if (bvh_nodes.empty()) {
767 std::stack<size_t> node_stack;
770 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::
max();
772 while (!node_stack.empty()) {
773 size_t node_idx = node_stack.top();
776 if (node_idx >= bvh_nodes.size()) {
778 std::cout <<
"ERROR: Invalid BVH node index " << node_idx <<
" >= " << bvh_nodes.size() <<
" nodes" << std::endl;
784 const BVHNode &node = bvh_nodes[node_idx];
787 if (!rayAABBIntersect(query.origin, query.direction, node.aabb_min, node.aabb_max)) {
793 for (uint32_t i = 0; i < node.primitive_count; ++i) {
794 if (node.primitive_start + i >= primitive_indices.size()) {
796 std::cout <<
"ERROR: Invalid BVH primitive index " << (node.primitive_start + i) <<
" >= " << primitive_indices.size() <<
" primitives" << std::endl;
802 uint primitive_id = primitive_indices[node.primitive_start + i];
805 if (!query.target_UUIDs.empty()) {
807 for (
uint target: query.target_UUIDs) {
808 if (primitive_id == target) {
818 HitResult primitive_hit = intersectPrimitive(query, primitive_id);
819 if (primitive_hit.hit && primitive_hit.distance < closest_distance) {
820 result = primitive_hit;
821 closest_distance = primitive_hit.distance;
826 if (node.left_child != 0xFFFFFFFF && node.left_child < bvh_nodes.size()) {
827 node_stack.push(node.left_child);
829 if (node.right_child != 0xFFFFFFFF && node.right_child < bvh_nodes.size()) {
830 node_stack.push(node.right_child);
839bool CollisionDetection::rayAABBIntersect(
const vec3 &ray_origin,
const vec3 &ray_direction,
const vec3 &aabb_min,
const vec3 &aabb_max)
const {
841 const float EPSILON = 1e-8f;
844 float tmax = std::numeric_limits<float>::max();
847 if (std::abs(ray_direction.
x) > EPSILON) {
848 float inv_dir_x = 1.0f / ray_direction.
x;
849 float t1 = (aabb_min.
x - ray_origin.
x) * inv_dir_x;
850 float t2 = (aabb_max.
x - ray_origin.
x) * inv_dir_x;
852 float slab_tmin = std::min(t1, t2);
853 float slab_tmax = std::max(t1, t2);
855 tmin = std::max(tmin, slab_tmin);
856 tmax = std::min(tmax, slab_tmax);
862 if (ray_origin.
x < aabb_min.
x || ray_origin.
x > aabb_max.
x) {
868 if (std::abs(ray_direction.
y) > EPSILON) {
869 float inv_dir_y = 1.0f / ray_direction.
y;
870 float t1 = (aabb_min.
y - ray_origin.
y) * inv_dir_y;
871 float t2 = (aabb_max.
y - ray_origin.
y) * inv_dir_y;
873 float slab_tmin = std::min(t1, t2);
874 float slab_tmax = std::max(t1, t2);
876 tmin = std::max(tmin, slab_tmin);
877 tmax = std::min(tmax, slab_tmax);
883 if (ray_origin.
y < aabb_min.
y || ray_origin.
y > aabb_max.
y) {
889 if (std::abs(ray_direction.
z) > EPSILON) {
890 float inv_dir_z = 1.0f / ray_direction.
z;
891 float t1 = (aabb_min.
z - ray_origin.
z) * inv_dir_z;
892 float t2 = (aabb_max.
z - ray_origin.
z) * inv_dir_z;
894 float slab_tmin = std::min(t1, t2);
895 float slab_tmax = std::max(t1, t2);
897 tmin = std::max(tmin, slab_tmin);
898 tmax = std::min(tmax, slab_tmax);
904 if (ray_origin.
z < aabb_min.
z || ray_origin.
z > aabb_max.
z) {
919 if (rayPrimitiveIntersection(query.origin, query.direction, primitive_id, distance)) {
921 result.distance = distance;
922 result.primitive_UUID = primitive_id;
923 result.intersection_point = query.origin + query.direction * distance;
929 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
930 vec3 edge1 = vertices[1] - vertices[0];
931 vec3 edge2 = vertices[2] - vertices[0];
932 result.normal =
cross(edge1, edge2);
933 if (result.normal.magnitude() > 1e-8f) {
934 result.normal = result.normal / result.normal.magnitude();
936 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
938 }
else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 3) {
939 vec3 edge1 = vertices[1] - vertices[0];
940 vec3 edge2 = vertices[2] - vertices[0];
941 result.normal =
cross(edge1, edge2);
942 if (result.normal.magnitude() > 1e-8f) {
943 result.normal = result.normal / result.normal.magnitude();
945 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
949 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
960 auto it = primitive_cache.find(primitive_id);
961 if (it == primitive_cache.end()) {
967 if (rayPrimitiveIntersection(origin, direction, primitive_id, distance)) {
969 result.distance = distance;
970 result.primitive_UUID = primitive_id;
971 result.intersection_point = origin + direction * distance;
977 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
978 vec3 edge1 = vertices[1] - vertices[0];
979 vec3 edge2 = vertices[2] - vertices[0];
980 result.normal =
cross(edge1, edge2);
981 if (result.normal.magnitude() > 1e-8f) {
982 result.normal = result.normal / result.normal.magnitude();
984 vec3 to_origin = origin - result.intersection_point;
985 if (result.normal * to_origin < 0) {
986 result.normal = result.normal * -1.0f;
989 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
990 result.normal = result.normal / result.normal.magnitude();
992 }
else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
993 vec3 edge1 = vertices[1] - vertices[0];
994 vec3 edge2 = vertices[2] - vertices[0];
995 result.normal =
cross(edge1, edge2);
996 if (result.normal.magnitude() > 1e-8f) {
997 result.normal = result.normal / result.normal.magnitude();
999 vec3 to_origin = origin - result.intersection_point;
1000 if (result.normal * to_origin < 0) {
1001 result.normal = result.normal * -1.0f;
1004 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1005 result.normal = result.normal / result.normal.magnitude();
1008 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1009 result.normal = result.normal / result.normal.magnitude();
1015 const CachedPrimitive &cached = it->second;
1018 if (cached.type == PRIMITIVE_TYPE_TRIANGLE && cached.vertices.size() >= 3) {
1020 if (triangleIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], distance)) {
1021 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1023 result.distance = distance;
1024 result.primitive_UUID = primitive_id;
1025 result.intersection_point = origin + direction * distance;
1028 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1029 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1030 result.normal =
cross(edge1, edge2);
1031 if (result.normal.magnitude() > 1e-8f) {
1032 result.normal = result.normal / result.normal.magnitude();
1034 vec3 to_origin = origin - result.intersection_point;
1035 if (result.normal * to_origin < 0) {
1036 result.normal = result.normal * -1.0f;
1039 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1040 result.normal = result.normal / result.normal.magnitude();
1044 }
else if (cached.type == PRIMITIVE_TYPE_PATCH && cached.vertices.size() >= 4) {
1046 if (patchIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], cached.vertices[3], distance)) {
1047 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1049 result.distance = distance;
1050 result.primitive_UUID = primitive_id;
1051 result.intersection_point = origin + direction * distance;
1054 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1055 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1056 result.normal =
cross(edge1, edge2);
1057 if (result.normal.magnitude() > 1e-8f) {
1058 result.normal = result.normal / result.normal.magnitude();
1060 vec3 to_origin = origin - result.intersection_point;
1061 if (result.normal * to_origin < 0) {
1062 result.normal = result.normal * -1.0f;
1065 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1066 result.normal = result.normal / result.normal.magnitude();
1070 }
else if (cached.type == PRIMITIVE_TYPE_VOXEL && cached.vertices.size() == 8) {
1073 vec3 aabb_min = cached.vertices[0];
1074 vec3 aabb_max = cached.vertices[0];
1076 for (
int i = 1; i < 8; i++) {
1077 aabb_min.
x = std::min(aabb_min.
x, cached.vertices[i].x);
1078 aabb_min.
y = std::min(aabb_min.
y, cached.vertices[i].y);
1079 aabb_min.
z = std::min(aabb_min.
z, cached.vertices[i].z);
1080 aabb_max.
x = std::max(aabb_max.
x, cached.vertices[i].x);
1081 aabb_max.
y = std::max(aabb_max.
y, cached.vertices[i].y);
1082 aabb_max.
z = std::max(aabb_max.
z, cached.vertices[i].z);
1086 float t_near = -std::numeric_limits<float>::max();
1087 float t_far = std::numeric_limits<float>::max();
1090 for (
int axis = 0; axis < 3; axis++) {
1091 float ray_dir_component = (axis == 0) ? direction.
x : (axis == 1) ? direction.y : direction.z;
1092 float ray_orig_component = (axis == 0) ? origin.
x : (axis == 1) ? origin.y : origin.z;
1093 float aabb_min_component = (axis == 0) ? aabb_min.
x : (axis == 1) ? aabb_min.y : aabb_min.z;
1094 float aabb_max_component = (axis == 0) ? aabb_max.
x : (axis == 1) ? aabb_max.y : aabb_max.z;
1096 if (std::abs(ray_dir_component) < 1e-8f) {
1098 if (ray_orig_component < aabb_min_component || ray_orig_component > aabb_max_component) {
1103 float t1 = (aabb_min_component - ray_orig_component) / ray_dir_component;
1104 float t2 = (aabb_max_component - ray_orig_component) / ray_dir_component;
1112 t_near = std::max(t_near, t1);
1113 t_far = std::min(t_far, t2);
1116 if (t_near > t_far) {
1123 if (t_far >= 0.0f) {
1125 float intersection_distance = (t_near >= 1e-6f) ? t_near : t_far;
1126 if (intersection_distance >= 1e-6f && (max_distance <= 0 || intersection_distance < max_distance)) {
1128 result.distance = intersection_distance;
1129 result.primitive_UUID = primitive_id;
1130 result.intersection_point = origin + direction * intersection_distance;
1134 vec3 hit_point = result.intersection_point;
1135 vec3 box_center = (aabb_min + aabb_max) * 0.5f;
1136 vec3 box_extent = (aabb_max - aabb_min) * 0.5f;
1139 vec3 local_hit = hit_point - box_center;
1140 vec3 abs_local_hit =
make_vec3(std::abs(local_hit.
x), std::abs(local_hit.
y), std::abs(local_hit.
z));
1143 float rel_x = abs_local_hit.
x / box_extent.
x;
1144 float rel_y = abs_local_hit.
y / box_extent.
y;
1145 float rel_z = abs_local_hit.
z / box_extent.
z;
1147 if (rel_x >= rel_y && rel_x >= rel_z) {
1149 result.normal =
make_vec3((local_hit.
x > 0) ? 1.0f : -1.0f, 0.0f, 0.0f);
1150 }
else if (rel_y >= rel_z) {
1152 result.normal =
make_vec3(0.0f, (local_hit.
y > 0) ? 1.0f : -1.0f, 0.0f);
1155 result.normal =
make_vec3(0.0f, 0.0f, (local_hit.
z > 0) ? 1.0f : -1.0f);
1165void CollisionDetection::buildPrimitiveCache() {
1166 primitive_cache.clear();
1169 std::vector<uint> all_primitives = context->
getAllUUIDs();
1172 for (
uint primitive_id: all_primitives) {
1178 primitive_cache[primitive_id] = CachedPrimitive(type, vertices);
1179 }
catch (
const std::exception &e) {
1182 if (printmessages) {
1183 std::cout <<
"Warning: Skipping primitive " << primitive_id <<
" in cache build (not accessible: " << e.what() <<
")" << std::endl;
1190 if (printmessages) {
1194bool CollisionDetection::triangleIntersect(
const vec3 &origin,
const vec3 &direction,
const vec3 &v0,
const vec3 &v1,
const vec3 &v2,
float &distance) {
1197 const float EPSILON = 1e-5f;
1200 float edge1_x = v1.
x - v0.
x, edge1_y = v1.
y - v0.
y, edge1_z = v1.
z - v0.
z;
1201 float edge2_x = v2.
x - v0.
x, edge2_y = v2.
y - v0.
y, edge2_z = v2.
z - v0.
z;
1204 float h_x = direction.
y * edge2_z - direction.
z * edge2_y;
1205 float h_y = direction.
z * edge2_x - direction.
x * edge2_z;
1206 float h_z = direction.
x * edge2_y - direction.
y * edge2_x;
1209 float a = edge1_x * h_x + edge1_y * h_y + edge1_z * h_z;
1211 if (a > -EPSILON && a < EPSILON) {
1218 float s_x = origin.
x - v0.
x, s_y = origin.
y - v0.
y, s_z = origin.
z - v0.
z;
1221 float u = f * (s_x * h_x + s_y * h_y + s_z * h_z);
1223 if (u < -EPSILON || u > 1.0f + EPSILON) {
1228 float q_x = s_y * edge1_z - s_z * edge1_y;
1229 float q_y = s_z * edge1_x - s_x * edge1_z;
1230 float q_z = s_x * edge1_y - s_y * edge1_x;
1233 float v = f * (direction.
x * q_x + direction.
y * q_y + direction.
z * q_z);
1235 if (v < -EPSILON || u + v > 1.0f + EPSILON) {
1240 float t = f * (edge2_x * q_x + edge2_y * q_y + edge2_z * q_z);
1250bool CollisionDetection::patchIntersect(
const vec3 &origin,
const vec3 &direction,
const vec3 &v0,
const vec3 &v1,
const vec3 &v2,
const vec3 &v3,
float &distance) {
1252 const float EPSILON = 1e-8f;
1263 float denom = direction * normal;
1264 if (std::abs(denom) > EPSILON) {
1265 float t = (anchor - origin) * normal / denom;
1267 if (t > EPSILON && t < 1e8f) {
1269 vec3 p = origin + direction * t;
1270 vec3 d = p - anchor;
1273 float ddota = d * a;
1274 float ddotb = d * b;
1277 if (ddota >= -EPSILON && ddota <= (a * a) + EPSILON && ddotb >= -EPSILON && ddotb <= (b * b) + EPSILON) {
1287#ifdef HELIOS_CUDA_AVAILABLE
1288#include <vector_types.h>
1296std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysGPU(
const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
1297 std::vector<HitResult> results;
1298 results.resize(ray_queries.size());
1300 if (ray_queries.empty() || !gpu_acceleration_enabled) {
1301 return castRaysSoA(ray_queries, stats);
1304 auto start = std::chrono::high_resolution_clock::now();
1307 if (bvh_nodes.empty()) {
1308 if (printmessages) {
1311 if (bvh_nodes.empty()) {
1312 helios_runtime_error(
"ERROR: BVH construction failed - no geometry available for ray tracing. Ensure primitives are properly added to the collision detection system.");
1317 std::vector<float> ray_origins(ray_queries.size() * 3);
1318 std::vector<float> ray_directions(ray_queries.size() * 3);
1319 std::vector<float> ray_max_distances(ray_queries.size());
1321 for (
size_t i = 0; i < ray_queries.size(); i++) {
1322 vec3 normalized_dir = ray_queries[i].direction;
1323 if (normalized_dir.
magnitude() > 1e-8f) {
1324 normalized_dir = normalized_dir / normalized_dir.
magnitude();
1327 ray_origins[i * 3] = ray_queries[i].origin.x;
1328 ray_origins[i * 3 + 1] = ray_queries[i].origin.y;
1329 ray_origins[i * 3 + 2] = ray_queries[i].origin.z;
1331 ray_directions[i * 3] = normalized_dir.
x;
1332 ray_directions[i * 3 + 1] = normalized_dir.
y;
1333 ray_directions[i * 3 + 2] = normalized_dir.
z;
1335 ray_max_distances[i] = ray_queries[i].max_distance;
1341 std::vector<int> primitive_types(primitive_indices.size());
1342 std::vector<float3> primitive_vertices;
1343 std::vector<unsigned int> vertex_offsets(primitive_indices.size());
1345 size_t triangle_count = 0;
1346 size_t patch_count = 0;
1347 size_t voxel_count = 0;
1349 if (printmessages) {
1353 size_t vertex_index = 0;
1354 for (
size_t i = 0; i < primitive_indices.size(); i++) {
1355 vertex_offsets[i] = vertex_index;
1358 primitive_types[i] =
static_cast<int>(ptype);
1362 if (ptype == PRIMITIVE_TYPE_TRIANGLE) {
1364 if (vertices.size() >= 3) {
1366 for (
int v = 0; v < 3; v++) {
1367 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1369 primitive_vertices.push_back(make_float3(0, 0, 0));
1372 if (printmessages) {
1373 std::cout <<
"WARNING: Triangle primitive " << primitive_indices[i] <<
" has " << vertices.size() <<
" vertices" << std::endl;
1376 for (
int v = 0; v < 4; v++) {
1377 primitive_vertices.push_back(make_float3(0, 0, 0));
1381 }
else if (ptype == PRIMITIVE_TYPE_PATCH) {
1383 if (vertices.size() >= 4) {
1385 for (
int v = 0; v < 4; v++) {
1386 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1390 if (printmessages) {
1391 std::cout <<
"WARNING: Patch primitive " << primitive_indices[i] <<
" has " << vertices.size() <<
" vertices" << std::endl;
1394 for (
int v = 0; v < 4; v++) {
1395 primitive_vertices.push_back(make_float3(0, 0, 0));
1399 }
else if (ptype == PRIMITIVE_TYPE_VOXEL) {
1403 vec3 voxel_min = vertices[0];
1404 vec3 voxel_max = vertices[0];
1407 for (
const auto &vertex: vertices) {
1408 voxel_min.
x = std::min(voxel_min.
x, vertex.x);
1409 voxel_min.
y = std::min(voxel_min.
y, vertex.y);
1410 voxel_min.
z = std::min(voxel_min.
z, vertex.z);
1411 voxel_max.
x = std::max(voxel_max.
x, vertex.x);
1412 voxel_max.
y = std::max(voxel_max.
y, vertex.y);
1413 voxel_max.
z = std::max(voxel_max.
z, vertex.z);
1417 primitive_vertices.push_back(make_float3(voxel_min.
x, voxel_min.
y, voxel_min.
z));
1418 primitive_vertices.push_back(make_float3(voxel_max.
x, voxel_max.
y, voxel_max.
z));
1419 primitive_vertices.push_back(make_float3(0, 0, 0));
1420 primitive_vertices.push_back(make_float3(0, 0, 0));
1424 for (
int v = 0; v < 4; v++) {
1425 primitive_vertices.push_back(make_float3(0, 0, 0));
1431 if (printmessages) {
1434 if (primitive_indices.empty()) {
1435 if (printmessages) {
1436 std::cout <<
"No primitives found for GPU ray tracing, falling back to CPU" << std::endl;
1438 return castRaysSoA(ray_queries, stats);
1443 std::vector<float> hit_distances(ray_queries.size());
1444 std::vector<unsigned int> hit_primitive_ids(ray_queries.size());
1445 std::vector<unsigned int> hit_counts(ray_queries.size());
1448 std::vector<GPUBVHNode> gpu_bvh_nodes(bvh_nodes.size());
1449 for (
size_t i = 0; i < bvh_nodes.size(); i++) {
1450 gpu_bvh_nodes[i].aabb_min = make_float3(bvh_nodes[i].aabb_min.
x, bvh_nodes[i].aabb_min.y, bvh_nodes[i].aabb_min.z);
1451 gpu_bvh_nodes[i].aabb_max = make_float3(bvh_nodes[i].aabb_max.
x, bvh_nodes[i].aabb_max.y, bvh_nodes[i].aabb_max.z);
1452 gpu_bvh_nodes[i].left_child = bvh_nodes[i].left_child;
1453 gpu_bvh_nodes[i].right_child = bvh_nodes[i].right_child;
1454 gpu_bvh_nodes[i].primitive_start = bvh_nodes[i].primitive_start;
1455 gpu_bvh_nodes[i].primitive_count = bvh_nodes[i].primitive_count;
1456 gpu_bvh_nodes[i].is_leaf = bvh_nodes[i].is_leaf ? 1 : 0;
1457 gpu_bvh_nodes[i].padding = 0;
1460 if (bvh_nodes[i].is_leaf && printmessages && i < 3) {
1461 std::cout <<
"BVH leaf node " << i <<
": primitive_start=" << bvh_nodes[i].primitive_start <<
", primitive_count=" << bvh_nodes[i].primitive_count <<
", max_index=" << (bvh_nodes[i].primitive_start + bvh_nodes[i].primitive_count - 1)
1468 static_cast<int>(gpu_bvh_nodes.size()),
1469 primitive_indices.data(),
1470 static_cast<int>(primitive_indices.size()),
1471 primitive_types.data(),
1472 primitive_vertices.data(),
1473 vertex_offsets.data(),
1474 static_cast<int>(primitive_vertices.size()),
1476 ray_directions.data(),
1477 ray_max_distances.data(),
1478 static_cast<int>(ray_queries.size()),
1479 hit_distances.data(),
1480 hit_primitive_ids.data(),
1486 size_t hit_count = 0;
1487 size_t filtered_count = 0;
1488 for (
size_t i = 0; i < ray_queries.size(); i++) {
1489 if (hit_counts[i] > 0 && hit_distances[i] <= ray_queries[i].max_distance) {
1490 results[i].hit =
true;
1491 results[i].primitive_UUID = hit_primitive_ids[i];
1492 results[i].distance = hit_distances[i];
1493 results[i].intersection_point = ray_queries[i].origin + ray_queries[i].direction * hit_distances[i];
1497 if (ptype == PRIMITIVE_TYPE_TRIANGLE || ptype == PRIMITIVE_TYPE_PATCH) {
1499 if (vertices.size() >= 3) {
1500 vec3 edge1 = vertices[1] - vertices[0];
1501 vec3 edge2 = vertices[2] - vertices[0];
1502 results[i].normal =
cross(edge1, edge2);
1503 if (results[i].normal.magnitude() > 1e-8f) {
1504 results[i].normal.normalize();
1511 if (hit_counts[i] > 0) {
1514 results[i].hit =
false;
1515 results[i].primitive_UUID = 0;
1516 results[i].distance = std::numeric_limits<float>::max();
1520 auto end = std::chrono::high_resolution_clock::now();
1521 double elapsed = std::chrono::duration<double, std::milli>(end - start).count();
1523 stats.total_rays_cast = ray_queries.size();
1524 stats.total_hits = hit_count;
1525 double rays_per_second = ray_queries.size() / (elapsed / 1000.0);
1535uint32_t CollisionDetection::rayAABBIntersectSIMD(
const vec3 *ray_origins,
const vec3 *ray_directions,
const vec3 *aabb_mins,
const vec3 *aabb_maxs,
float *t_mins,
float *t_maxs,
int count) {
1539 uint32_t hit_mask = 0;
1541 for (
int i = 0; i < 8; i += 8) {
1543 __m256 orig_x = _mm256_set_ps(ray_origins[i + 7].x, ray_origins[i + 6].x, ray_origins[i + 5].x, ray_origins[i + 4].x, ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1544 __m256 orig_y = _mm256_set_ps(ray_origins[i + 7].y, ray_origins[i + 6].y, ray_origins[i + 5].y, ray_origins[i + 4].y, ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1545 __m256 orig_z = _mm256_set_ps(ray_origins[i + 7].z, ray_origins[i + 6].z, ray_origins[i + 5].z, ray_origins[i + 4].z, ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1548 __m256 dir_x = _mm256_set_ps(ray_directions[i + 7].x, ray_directions[i + 6].x, ray_directions[i + 5].x, ray_directions[i + 4].x, ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1549 __m256 dir_y = _mm256_set_ps(ray_directions[i + 7].y, ray_directions[i + 6].y, ray_directions[i + 5].y, ray_directions[i + 4].y, ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1550 __m256 dir_z = _mm256_set_ps(ray_directions[i + 7].z, ray_directions[i + 6].z, ray_directions[i + 5].z, ray_directions[i + 4].z, ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1553 __m256 aabb_min_x = _mm256_set_ps(aabb_mins[i + 7].x, aabb_mins[i + 6].x, aabb_mins[i + 5].x, aabb_mins[i + 4].x, aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1554 __m256 aabb_min_y = _mm256_set_ps(aabb_mins[i + 7].y, aabb_mins[i + 6].y, aabb_mins[i + 5].y, aabb_mins[i + 4].y, aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1555 __m256 aabb_min_z = _mm256_set_ps(aabb_mins[i + 7].z, aabb_mins[i + 6].z, aabb_mins[i + 5].z, aabb_mins[i + 4].z, aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1558 __m256 aabb_max_x = _mm256_set_ps(aabb_maxs[i + 7].x, aabb_maxs[i + 6].x, aabb_maxs[i + 5].x, aabb_maxs[i + 4].x, aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1559 __m256 aabb_max_y = _mm256_set_ps(aabb_maxs[i + 7].y, aabb_maxs[i + 6].y, aabb_maxs[i + 5].y, aabb_maxs[i + 4].y, aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1560 __m256 aabb_max_z = _mm256_set_ps(aabb_maxs[i + 7].z, aabb_maxs[i + 6].z, aabb_maxs[i + 5].z, aabb_maxs[i + 4].z, aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1563 __m256 inv_dir_x = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_x);
1564 __m256 inv_dir_y = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_y);
1565 __m256 inv_dir_z = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_z);
1568 __m256 t1_x = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1569 __m256 t2_x = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1570 __m256 tmin_x = _mm256_min_ps(t1_x, t2_x);
1571 __m256 tmax_x = _mm256_max_ps(t1_x, t2_x);
1574 __m256 t1_y = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1575 __m256 t2_y = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1576 __m256 tmin_y = _mm256_min_ps(t1_y, t2_y);
1577 __m256 tmax_y = _mm256_max_ps(t1_y, t2_y);
1580 __m256 t1_z = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1581 __m256 t2_z = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1582 __m256 tmin_z = _mm256_min_ps(t1_z, t2_z);
1583 __m256 tmax_z = _mm256_max_ps(t1_z, t2_z);
1586 __m256 t_min_final = _mm256_max_ps(_mm256_max_ps(tmin_x, tmin_y), tmin_z);
1587 __m256 t_max_final = _mm256_min_ps(_mm256_min_ps(tmax_x, tmax_y), tmax_z);
1590 _mm256_store_ps(&t_mins[i], t_min_final);
1591 _mm256_store_ps(&t_maxs[i], t_max_final);
1594 __m256 zero = _mm256_set1_ps(0.0f);
1595 __m256 hits = _mm256_and_ps(_mm256_cmp_ps(t_max_final, zero, _CMP_GE_OS), _mm256_cmp_ps(t_min_final, t_max_final, _CMP_LE_OS));
1598 hit_mask |= _mm256_movemask_ps(hits);
1608 uint32_t hit_mask = 0;
1610 for (
int i = 0; i < 4; i += 4) {
1612 __m128 orig_x = _mm_set_ps(ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1613 __m128 orig_y = _mm_set_ps(ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1614 __m128 orig_z = _mm_set_ps(ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1617 __m128 dir_x = _mm_set_ps(ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1618 __m128 dir_y = _mm_set_ps(ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1619 __m128 dir_z = _mm_set_ps(ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1622 __m128 aabb_min_x = _mm_set_ps(aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1623 __m128 aabb_min_y = _mm_set_ps(aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1624 __m128 aabb_min_z = _mm_set_ps(aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1627 __m128 aabb_max_x = _mm_set_ps(aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1628 __m128 aabb_max_y = _mm_set_ps(aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1629 __m128 aabb_max_z = _mm_set_ps(aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1632 __m128 inv_dir_x = _mm_div_ps(_mm_set1_ps(1.0f), dir_x);
1633 __m128 inv_dir_y = _mm_div_ps(_mm_set1_ps(1.0f), dir_y);
1634 __m128 inv_dir_z = _mm_div_ps(_mm_set1_ps(1.0f), dir_z);
1637 __m128 t1_x = _mm_mul_ps(_mm_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1638 __m128 t2_x = _mm_mul_ps(_mm_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1639 __m128 tmin_x = _mm_min_ps(t1_x, t2_x);
1640 __m128 tmax_x = _mm_max_ps(t1_x, t2_x);
1643 __m128 t1_y = _mm_mul_ps(_mm_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1644 __m128 t2_y = _mm_mul_ps(_mm_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1645 __m128 tmin_y = _mm_min_ps(t1_y, t2_y);
1646 __m128 tmax_y = _mm_max_ps(t1_y, t2_y);
1649 __m128 t1_z = _mm_mul_ps(_mm_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1650 __m128 t2_z = _mm_mul_ps(_mm_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1651 __m128 tmin_z = _mm_min_ps(t1_z, t2_z);
1652 __m128 tmax_z = _mm_max_ps(t1_z, t2_z);
1655 __m128 t_min_final = _mm_max_ps(_mm_max_ps(tmin_x, tmin_y), tmin_z);
1656 __m128 t_max_final = _mm_min_ps(_mm_min_ps(tmax_x, tmax_y), tmax_z);
1659 _mm_store_ps(&t_mins[i], t_min_final);
1660 _mm_store_ps(&t_maxs[i], t_max_final);
1663 __m128 zero = _mm_set1_ps(0.0f);
1664 __m128 hits = _mm_and_ps(_mm_cmpge_ps(t_max_final, zero), _mm_cmple_ps(t_min_final, t_max_final));
1667 hit_mask |= _mm_movemask_ps(hits);
1675 uint32_t hit_mask = 0;
1676 for (
int i = 0; i < count; ++i) {
1677 if (rayAABBIntersect(ray_origins[i], ray_directions[i], aabb_mins[i], aabb_maxs[i], t_mins[i], t_maxs[i])) {
1678 hit_mask |= (1 << i);
1684void CollisionDetection::traverseBVHSIMD(
const vec3 *ray_origins,
const vec3 *ray_directions,
int count, HitResult *results) {
1685 if (bvh_nodes.empty()) {
1687 for (
int i = 0; i < count; ++i) {
1688 results[i] = HitResult();
1694 int simd_batch_size = 1;
1696 simd_batch_size = 8;
1697#elif defined(__SSE4_1__)
1698 simd_batch_size = 4;
1702 for (
int batch_start = 0; batch_start < count; batch_start += simd_batch_size) {
1703 int batch_count = std::min(simd_batch_size, count - batch_start);
1706 for (
int i = 0; i < batch_count; ++i) {
1707 results[batch_start + i] = HitResult();
1710 if (batch_count >= simd_batch_size && simd_batch_size > 1) {
1712 traverseBVHSIMDImpl(&ray_origins[batch_start], &ray_directions[batch_start], batch_count, &results[batch_start]);
1715 for (
int i = 0; i < batch_count; ++i) {
1716 int ray_idx = batch_start + i;
1717 results[ray_idx] =
castRay(RayQuery(ray_origins[ray_idx], ray_directions[ray_idx]));
1723void CollisionDetection::traverseBVHSIMDImpl(
const vec3 *ray_origins,
const vec3 *ray_directions,
int count, HitResult *results) {
1724 const size_t MAX_STACK_SIZE = 64;
1727 alignas(32) uint32_t node_stacks[8][MAX_STACK_SIZE];
1728 alignas(32) uint32_t stack_tops[8] = {0};
1729 alignas(32)
float closest_distances[8];
1730 alignas(32)
bool ray_active[8];
1733 for (
int i = 0; i < count; ++i) {
1734 node_stacks[i][0] = 0;
1736 closest_distances[i] = std::numeric_limits<float>::max();
1737 ray_active[i] =
true;
1738 results[i] = HitResult();
1743 bool any_active =
false;
1744 for (
int i = 0; i < count; ++i) {
1745 if (ray_active[i] && stack_tops[i] > 0) {
1754 alignas(32)
vec3 test_aabb_mins[8];
1755 alignas(32)
vec3 test_aabb_maxs[8];
1756 alignas(32) uint32_t test_node_indices[8];
1757 alignas(32)
int test_ray_indices[8];
1760 for (
int i = 0; i < count; ++i) {
1761 if (ray_active[i] && stack_tops[i] > 0) {
1762 uint32_t node_idx = node_stacks[i][--stack_tops[i]];
1763 const BVHNode &node = bvh_nodes[node_idx];
1765 test_aabb_mins[test_count] = node.aabb_min;
1766 test_aabb_maxs[test_count] = node.aabb_max;
1767 test_node_indices[test_count] = node_idx;
1768 test_ray_indices[test_count] = i;
1771 if (test_count == count)
1776 if (test_count == 0)
1780 alignas(32)
vec3 batch_origins[8];
1781 alignas(32)
vec3 batch_directions[8];
1782 alignas(32)
float t_mins[8];
1783 alignas(32)
float t_maxs[8];
1785 for (
int i = 0; i < test_count; ++i) {
1786 int ray_idx = test_ray_indices[i];
1787 batch_origins[i] = ray_origins[ray_idx];
1788 batch_directions[i] = ray_directions[ray_idx];
1792 uint32_t hit_mask = rayAABBIntersectSIMD(batch_origins, batch_directions, test_aabb_mins, test_aabb_maxs, t_mins, t_maxs, test_count);
1795 for (
int i = 0; i < test_count; ++i) {
1796 if (!(hit_mask & (1 << i)))
1799 int ray_idx = test_ray_indices[i];
1800 uint32_t node_idx = test_node_indices[i];
1801 const BVHNode &node = bvh_nodes[node_idx];
1803 if (t_mins[i] > closest_distances[ray_idx])
1808 for (uint32_t prim_idx = node.primitive_start; prim_idx < node.primitive_start + node.primitive_count; ++prim_idx) {
1810 uint32_t primitive_id = primitive_indices[prim_idx];
1813 HitResult prim_result = intersectPrimitiveThreadSafe(batch_origins[i], batch_directions[i], primitive_id, closest_distances[ray_idx]);
1814 if (prim_result.hit && prim_result.distance < closest_distances[ray_idx]) {
1815 closest_distances[ray_idx] = prim_result.distance;
1816 results[ray_idx] = prim_result;
1821 if (stack_tops[ray_idx] < MAX_STACK_SIZE - 2) {
1822 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.left_child;
1823 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.right_child;
1826 ray_active[ray_idx] =
false;
1839 const float EPSILON = 1e-8f;
1842 float t_min_x = (aabb_min.
x - origin.
x) / direction.
x;
1843 float t_max_x = (aabb_max.
x - origin.
x) / direction.
x;
1846 if (direction.
x < 0.0f) {
1847 float temp = t_min_x;
1852 float t_min_y = (aabb_min.
y - origin.
y) / direction.
y;
1853 float t_max_y = (aabb_max.
y - origin.
y) / direction.
y;
1855 if (direction.
y < 0.0f) {
1856 float temp = t_min_y;
1862 float t_min = std::max(t_min_x, t_min_y);
1863 float t_max = std::min(t_max_x, t_max_y);
1865 if (t_min > t_max) {
1869 float t_min_z = (aabb_min.
z - origin.
z) / direction.
z;
1870 float t_max_z = (aabb_max.
z - origin.
z) / direction.
z;
1872 if (direction.
z < 0.0f) {
1873 float temp = t_min_z;
1879 t_min = std::max(t_min, t_min_z);
1880 t_max = std::min(t_max, t_max_z);
1882 if (t_min > t_max || t_max < EPSILON) {
1887 distance = (t_min > EPSILON) ? t_min : t_max;
1889 return distance > EPSILON;