27#elif defined(__SSE4_1__)
29#elif defined(__SSE2__)
39#ifdef HELIOS_CUDA_AVAILABLE
40#include <cuda_runtime.h>
43using namespace helios;
45#ifdef HELIOS_CUDA_AVAILABLE
56void launchBVHTraversal(
void *h_nodes,
int node_count,
unsigned int *h_primitive_indices,
int primitive_count,
float *h_primitive_aabb_min,
float *h_primitive_aabb_max,
float *h_query_aabb_min,
float *h_query_aabb_max,
int num_queries,
57 unsigned int *h_results,
unsigned int *h_result_counts,
int max_results_per_query);
58void launchVoxelRayPathLengths(
int num_rays,
float *h_ray_origins,
float *h_ray_directions,
float grid_center_x,
float grid_center_y,
float grid_center_z,
float grid_size_x,
float grid_size_y,
float grid_size_z,
int grid_divisions_x,
59 int grid_divisions_y,
int grid_divisions_z,
int primitive_count,
int *h_voxel_ray_counts,
float *h_voxel_path_lengths,
int *h_voxel_transmitted,
int *h_voxel_hit_before,
int *h_voxel_hit_after,
int *h_voxel_hit_inside);
61void launchWarpEfficientBVH(
void *h_bvh_soa_gpu,
unsigned int *h_primitive_indices,
int primitive_count,
float *h_primitive_aabb_min,
float *h_primitive_aabb_max,
float *h_ray_origins,
float *h_ray_directions,
float *h_ray_max_distances,
62 int num_rays,
unsigned int *h_results,
unsigned int *h_result_counts,
int max_results_per_ray);
64void launchRayPrimitiveIntersection(
void *h_bvh_nodes,
int node_count,
unsigned int *h_primitive_indices,
int primitive_count,
int *h_primitive_types, float3 *h_primitive_vertices,
unsigned int *h_vertex_offsets,
int total_vertex_count,
65 float *h_ray_origins,
float *h_ray_directions,
float *h_ray_max_distances,
int num_rays,
float *h_hit_distances,
unsigned int *h_hit_primitive_ids,
unsigned int *h_hit_counts,
bool find_closest_hit);
70 return make_float3(v.
x, v.
y, v.
z);
84 vec3 ray_direction = direction;
88 ray_direction = ray_direction / ray_direction.
magnitude();
95 if (!bvh_nodes.empty()) {
97 if (primitive_cache.empty()) {
98 buildPrimitiveCache();
102 RayQuery query(origin, ray_direction, max_distance, target_UUIDs);
103 return castRayBVHTraversal(query);
108 std::vector<uint> search_primitives;
109 if (target_UUIDs.empty()) {
111 if (!primitive_cache.empty()) {
113 search_primitives.reserve(primitive_cache.size());
114 for (
const auto &cached_pair: primitive_cache) {
115 search_primitives.push_back(cached_pair.first);
122 search_primitives = target_UUIDs;
126 float nearest_distance = std::numeric_limits<float>::max();
127 if (max_distance > 0) {
128 nearest_distance = max_distance;
131 uint hit_primitive = 0;
132 bool found_intersection =
false;
135 for (
uint candidate_uuid: search_primitives) {
136 float intersection_distance;
140 if (!primitive_cache.empty()) {
141 HitResult primitive_result = intersectPrimitiveThreadSafe(origin, ray_direction, candidate_uuid, max_distance);
142 if (primitive_result.
hit) {
144 intersection_distance = primitive_result.
distance;
151 hit = rayPrimitiveIntersection(origin, ray_direction, candidate_uuid, intersection_distance);
156 if (intersection_distance > 1e-6f &&
157 intersection_distance < nearest_distance) {
159 nearest_distance = intersection_distance;
160 hit_primitive = candidate_uuid;
161 found_intersection =
true;
167 if (found_intersection) {
180 vec3 v0 = vertices[0];
181 vec3 v1 = vertices[1];
182 vec3 v2 = vertices[2];
183 vec3 edge1 = v1 - v0;
184 vec3 edge2 = v2 - v0;
189 vec3 v0 = vertices[0];
190 vec3 v1 = vertices[1];
191 vec3 v2 = vertices[2];
192 vec3 edge1 = v1 - v0;
193 vec3 edge2 = v2 - v0;
200 }
catch (
const std::exception &e) {
211 std::vector<HitResult> results;
212 results.reserve(ray_queries.size());
222 const size_t GPU_BATCH_THRESHOLD = 1000000;
223 const size_t MIN_PRIMITIVES_FOR_GPU = 500;
225#ifdef HELIOS_CUDA_AVAILABLE
226 bool use_gpu = gpu_acceleration_enabled && ray_queries.size() >= GPU_BATCH_THRESHOLD && d_bvh_nodes !=
nullptr && !primitive_indices.empty() && primitive_indices.size() >= MIN_PRIMITIVES_FOR_GPU;
230 castRaysGPU(ray_queries, results, local_stats);
233 castRaysCPU(ray_queries, results, local_stats);
237 castRaysCPU(ray_queries, results, local_stats);
241 if (stats !=
nullptr) {
242 *stats = local_stats;
248void CollisionDetection::castRaysCPU(
const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
253#ifdef HELIOS_CUDA_AVAILABLE
254void CollisionDetection::castRaysGPU(
const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
256 results = castRaysGPU(ray_queries, stats);
261 const std::vector<RayQuery> &ray_queries) {
264 std::vector<std::vector<std::vector<std::vector<HitResult>>>> grid_results;
265 grid_results.resize(grid_divisions.
x);
266 for (
int i = 0; i < grid_divisions.
x; i++) {
267 grid_results[i].resize(grid_divisions.
y);
268 for (
int j = 0; j < grid_divisions.
y; j++) {
269 grid_results[i][j].resize(grid_divisions.
z);
274 vec3 voxel_size =
make_vec3(grid_size.
x /
float(grid_divisions.
x), grid_size.
y /
float(grid_divisions.
y), grid_size.
z /
float(grid_divisions.
z));
277 for (
const auto &query: ray_queries) {
280 if (hit_result.
hit) {
284 int voxel_i = int(relative_pos.
x / voxel_size.
x);
285 int voxel_j = int(relative_pos.
y / voxel_size.
y);
286 int voxel_k = int(relative_pos.
z / voxel_size.
z);
289 if (voxel_i >= 0 && voxel_i < grid_divisions.x && voxel_j >= 0 && voxel_j < grid_divisions.y && voxel_k >= 0 && voxel_k < grid_divisions.
z) {
291 grid_results[voxel_i][voxel_j][voxel_k].push_back(hit_result);
301 if (ray_directions.empty()) {
303 std::cout <<
"WARNING (CollisionDetection::calculateVoxelPathLengths): No rays provided" << std::endl;
305 return std::vector<std::vector<HitResult>>();
308 if (voxel_centers.size() != voxel_sizes.size()) {
309 helios_runtime_error(
"ERROR (CollisionDetection::calculateVoxelPathLengths): voxel_centers and voxel_sizes vectors must have same size");
312 if (voxel_centers.empty()) {
314 std::cout <<
"WARNING (CollisionDetection::calculateVoxelPathLengths): No voxels provided" << std::endl;
316 return std::vector<std::vector<HitResult>>();
319 const size_t num_rays = ray_directions.size();
320 const size_t num_voxels = voxel_centers.size();
323 std::cout <<
"Calculating voxel path lengths for " << num_rays <<
" rays through " << num_voxels <<
" voxels..." << std::endl;
327 std::vector<std::vector<HitResult>> result(num_voxels);
330#pragma omp parallel for schedule(dynamic, 1000)
331 for (
int ray_idx = 0; ray_idx < static_cast<int>(num_rays); ++ray_idx) {
332 const vec3 &ray_direction = ray_directions[ray_idx];
335 for (
size_t voxel_idx = 0; voxel_idx < num_voxels; ++voxel_idx) {
336 const vec3 &voxel_center = voxel_centers[voxel_idx];
337 const vec3 &voxel_size = voxel_sizes[voxel_idx];
340 const vec3 half_size = voxel_size * 0.5f;
341 const vec3 voxel_min = voxel_center - half_size;
342 const vec3 voxel_max = voxel_center + half_size;
346 if (rayAABBIntersect(scan_origin, ray_direction, voxel_min, voxel_max, t_min, t_max)) {
348 const float path_length = t_max - std::max(0.0f, t_min);
350 if (path_length > 1e-6f) {
353 hit_result.
hit =
false;
363 result[voxel_idx].push_back(hit_result);
371 size_t total_intersections = 0;
372 for (
size_t i = 0; i < num_voxels; ++i) {
373 total_intersections += result[i].size();
375 std::cout <<
"Completed voxel path length calculations. Total ray-voxel intersections: " << total_intersections << std::endl;
382 std::vector<HitResult> &hit_results) {
385 hit_results.reserve(ray_origins.size());
387 if (ray_origins.size() != ray_directions.size()) {
388 helios_runtime_error(
"ERROR (CollisionDetection::calculateRayPathLengthsDetailed): ray_origins and ray_directions must have the same size");
396 for (
size_t i = 0; i < ray_origins.size(); i++) {
397 RayQuery query(ray_origins[i], ray_directions[i]);
399 hit_results.push_back(result);
408 if (mode == bvh_optimization_mode) {
413 bvh_optimization_mode = mode;
416 if (old_mode != mode && !bvh_nodes.empty()) {
418 std::cout <<
"CollisionDetection: Converting BVH from mode " <<
static_cast<int>(old_mode) <<
" to mode " <<
static_cast<int>(mode) << std::endl;
422 ensureOptimizedBVH();
425 auto memory_stats = getBVHMemoryUsage();
426 std::cout <<
"CollisionDetection: Memory usage - SoA: " << memory_stats.soa_memory_bytes <<
" bytes, Quantized: " << memory_stats.quantized_memory_bytes <<
" bytes (" << memory_stats.quantized_reduction_percent <<
"% reduction)"
433 return bvh_optimization_mode;
436void CollisionDetection::convertBVHLayout(BVHOptimizationMode from_mode, BVHOptimizationMode to_mode) {
443 if (ray_queries.empty()) {
449 ensureOptimizedBVH();
452 if (primitive_cache.empty()) {
453 buildPrimitiveCache();
457 std::vector<HitResult> results;
458 results.reserve(ray_queries.size());
460 auto start_time = std::chrono::high_resolution_clock::now();
463 switch (bvh_optimization_mode) {
465 results = castRaysSoA(ray_queries, local_stats);
469 auto end_time = std::chrono::high_resolution_clock::now();
470 auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
473 *stats = local_stats;
480 if (ray_stream.
packets.empty()) {
487 for (
auto &packet: ray_stream.
packets) {
489 auto queries = packet.toRayQueries();
495 if (results.size() != queries.size()) {
501 packet.results = std::move(results);
511 *stats = combined_stats;
514 if (printmessages && success) {
515 std::cout <<
"CollisionDetection: Processed " << ray_stream.
packets.size() <<
" ray packets (" << ray_stream.
total_rays <<
" total rays)" << std::endl;
525 MemoryUsageStats stats;
528 stats.soa_memory_bytes = bvh_nodes_soa.getMemoryUsage();
531 stats.quantized_memory_bytes = 0;
532 stats.quantized_reduction_percent = 0.0f;
537std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysSoA(
const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
538 std::vector<HitResult> results;
539 results.reserve(ray_queries.size());
541 if (bvh_nodes_soa.node_count == 0) {
543 results.resize(ray_queries.size());
544 stats.total_rays_cast = ray_queries.size();
545 stats.total_hits = 0;
546 stats.bvh_nodes_visited = 0;
547 stats.average_ray_distance = 0.0f;
551 stats.total_rays_cast = ray_queries.size();
552 stats.total_hits = 0;
553 stats.average_ray_distance = 0.0;
556 results.resize(ray_queries.size());
561 RayTracingStats local_stats = {};
563#pragma omp for schedule(guided, 32)
564 for (
int i = 0; i < static_cast<int>(ray_queries.size()); ++i) {
565 HitResult result = castRaySoATraversal(ray_queries[i], local_stats);
569 local_stats.total_hits++;
570 local_stats.average_ray_distance += result.distance;
576 stats.total_hits += local_stats.total_hits;
579 stats.average_ray_distance += local_stats.average_ray_distance;
582 stats.bvh_nodes_visited += local_stats.bvh_nodes_visited;
585 if (stats.total_hits > 0) {
586 stats.average_ray_distance /= stats.total_hits;
596using namespace helios;
601 if (bvh_nodes_soa.node_count == 0 || bvh_nodes_soa.aabb_mins.empty()) {
606 std::stack<size_t> node_stack;
609 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::
max();
612 const size_t MAX_TRAVERSAL_STEPS = 10000000;
613 size_t traversal_steps = 0;
615 while (!node_stack.empty()) {
617 if (++traversal_steps > MAX_TRAVERSAL_STEPS) {
619 std::cout <<
"WARNING: BVH traversal exceeded maximum steps, terminating ray" << std::endl;
624 size_t node_idx = node_stack.top();
626 stats.bvh_nodes_visited++;
629 if (node_idx >= bvh_nodes_soa.node_count) {
631 std::cout <<
"WARNING: Invalid node index " << node_idx <<
" >= " << bvh_nodes_soa.node_count << std::endl;
637 if (node_idx + 1 < bvh_nodes_soa.node_count) {
639 __builtin_prefetch(&bvh_nodes_soa.aabb_mins[node_idx + 1], 0, 1);
640 __builtin_prefetch(&bvh_nodes_soa.aabb_maxs[node_idx + 1], 0, 1);
641#elif defined(_MSC_VER)
642 _mm_prefetch(
reinterpret_cast<const char *
>(&bvh_nodes_soa.aabb_mins[node_idx + 1]), _MM_HINT_T1);
643 _mm_prefetch(
reinterpret_cast<const char *
>(&bvh_nodes_soa.aabb_maxs[node_idx + 1]), _MM_HINT_T1);
648 if (!aabbIntersectSoA(query.origin, query.direction, closest_distance, node_idx)) {
653 if (bvh_nodes_soa.is_leaf_flags[node_idx]) {
655 uint32_t primitive_start = bvh_nodes_soa.primitive_starts[node_idx];
656 uint32_t primitive_count = bvh_nodes_soa.primitive_counts[node_idx];
658 for (uint32_t i = 0; i < primitive_count; ++i) {
659 uint primitive_id = primitive_indices[primitive_start + i];
662 if (!query.target_UUIDs.empty()) {
664 for (
uint target: query.target_UUIDs) {
665 if (primitive_id == target) {
675 HitResult primitive_result = intersectPrimitiveThreadSafe(query.origin, query.direction, primitive_id, closest_distance);
676 if (primitive_result.hit && primitive_result.distance < closest_distance) {
677 result = primitive_result;
678 closest_distance = primitive_result.distance;
683 uint32_t left_child = bvh_nodes_soa.left_children[node_idx];
684 uint32_t right_child = bvh_nodes_soa.right_children[node_idx];
686 if (left_child != 0xFFFFFFFF && left_child < bvh_nodes_soa.node_count) {
687 node_stack.push(left_child);
689 if (right_child != 0xFFFFFFFF && right_child < bvh_nodes_soa.node_count) {
690 node_stack.push(right_child);
699bool CollisionDetection::aabbIntersectSoA(
const helios::vec3 &ray_origin,
const helios::vec3 &ray_direction,
float max_distance,
size_t node_index)
const {
701 const vec3 &aabb_min = bvh_nodes_soa.aabb_mins[node_index];
702 const vec3 &aabb_max = bvh_nodes_soa.aabb_maxs[node_index];
706 __m128 ray_orig = _mm_set_ps(0.0f, ray_origin.
z, ray_origin.
y, ray_origin.
x);
707 __m128 ray_dir = _mm_set_ps(0.0f, ray_direction.
z, ray_direction.
y, ray_direction.
x);
708 __m128 aabb_min_vec = _mm_set_ps(0.0f, aabb_min.
z, aabb_min.
y, aabb_min.
x);
709 __m128 aabb_max_vec = _mm_set_ps(0.0f, aabb_max.
z, aabb_max.
y, aabb_max.
x);
712 __m128 inv_dir = _mm_div_ps(_mm_set1_ps(1.0f), ray_dir);
715 __m128 t1 = _mm_mul_ps(_mm_sub_ps(aabb_min_vec, ray_orig), inv_dir);
716 __m128 t2 = _mm_mul_ps(_mm_sub_ps(aabb_max_vec, ray_orig), inv_dir);
719 __m128 tmin = _mm_min_ps(t1, t2);
720 __m128 tmax = _mm_max_ps(t1, t2);
723 float tmin_vals[4], tmax_vals[4];
724 _mm_store_ps(tmin_vals, tmin);
725 _mm_store_ps(tmax_vals, tmax);
727 float t_near = std::max({tmin_vals[0], tmin_vals[1], tmin_vals[2], 0.0f});
728 float t_far = std::min({tmax_vals[0], tmax_vals[1], tmax_vals[2], max_distance});
730 return t_near <= t_far;
733 float inv_dir_x = 1.0f / ray_direction.
x;
734 float inv_dir_y = 1.0f / ray_direction.
y;
735 float inv_dir_z = 1.0f / ray_direction.
z;
737 float t1_x = (aabb_min.
x - ray_origin.
x) * inv_dir_x;
738 float t2_x = (aabb_max.
x - ray_origin.
x) * inv_dir_x;
739 float t1_y = (aabb_min.
y - ray_origin.
y) * inv_dir_y;
740 float t2_y = (aabb_max.
y - ray_origin.
y) * inv_dir_y;
741 float t1_z = (aabb_min.
z - ray_origin.
z) * inv_dir_z;
742 float t2_z = (aabb_max.
z - ray_origin.
z) * inv_dir_z;
744 float tmin_x = std::min(t1_x, t2_x);
745 float tmax_x = std::max(t1_x, t2_x);
746 float tmin_y = std::min(t1_y, t2_y);
747 float tmax_y = std::max(t1_y, t2_y);
748 float tmin_z = std::min(t1_z, t2_z);
749 float tmax_z = std::max(t1_z, t2_z);
751 float t_near = std::max({tmin_x, tmin_y, tmin_z, 0.0f});
752 float t_far = std::min({tmax_x, tmax_y, tmax_z, max_distance});
754 return t_near <= t_far;
763 if (bvh_nodes.empty()) {
768 std::stack<size_t> node_stack;
771 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::
max();
773 while (!node_stack.empty()) {
774 size_t node_idx = node_stack.top();
777 if (node_idx >= bvh_nodes.size()) {
779 std::cout <<
"ERROR: Invalid BVH node index " << node_idx <<
" >= " << bvh_nodes.size() <<
" nodes" << std::endl;
785 const BVHNode &node = bvh_nodes[node_idx];
788 if (!rayAABBIntersect(query.origin, query.direction, node.aabb_min, node.aabb_max)) {
794 for (uint32_t i = 0; i < node.primitive_count; ++i) {
795 if (node.primitive_start + i >= primitive_indices.size()) {
797 std::cout <<
"ERROR: Invalid BVH primitive index " << (node.primitive_start + i) <<
" >= " << primitive_indices.size() <<
" primitives" << std::endl;
803 uint primitive_id = primitive_indices[node.primitive_start + i];
806 if (!query.target_UUIDs.empty()) {
808 for (
uint target: query.target_UUIDs) {
809 if (primitive_id == target) {
819 HitResult primitive_hit = intersectPrimitive(query, primitive_id);
820 if (primitive_hit.hit && primitive_hit.distance < closest_distance) {
821 result = primitive_hit;
822 closest_distance = primitive_hit.distance;
827 if (node.left_child != 0xFFFFFFFF && node.left_child < bvh_nodes.size()) {
828 node_stack.push(node.left_child);
830 if (node.right_child != 0xFFFFFFFF && node.right_child < bvh_nodes.size()) {
831 node_stack.push(node.right_child);
840bool CollisionDetection::rayAABBIntersect(
const vec3 &ray_origin,
const vec3 &ray_direction,
const vec3 &aabb_min,
const vec3 &aabb_max)
const {
842 const float EPSILON = 1e-8f;
845 float tmax = std::numeric_limits<float>::max();
848 if (std::abs(ray_direction.
x) > EPSILON) {
849 float inv_dir_x = 1.0f / ray_direction.
x;
850 float t1 = (aabb_min.
x - ray_origin.
x) * inv_dir_x;
851 float t2 = (aabb_max.
x - ray_origin.
x) * inv_dir_x;
853 float slab_tmin = std::min(t1, t2);
854 float slab_tmax = std::max(t1, t2);
856 tmin = std::max(tmin, slab_tmin);
857 tmax = std::min(tmax, slab_tmax);
863 if (ray_origin.
x < aabb_min.
x || ray_origin.
x > aabb_max.
x) {
869 if (std::abs(ray_direction.
y) > EPSILON) {
870 float inv_dir_y = 1.0f / ray_direction.
y;
871 float t1 = (aabb_min.
y - ray_origin.
y) * inv_dir_y;
872 float t2 = (aabb_max.
y - ray_origin.
y) * inv_dir_y;
874 float slab_tmin = std::min(t1, t2);
875 float slab_tmax = std::max(t1, t2);
877 tmin = std::max(tmin, slab_tmin);
878 tmax = std::min(tmax, slab_tmax);
884 if (ray_origin.
y < aabb_min.
y || ray_origin.
y > aabb_max.
y) {
890 if (std::abs(ray_direction.
z) > EPSILON) {
891 float inv_dir_z = 1.0f / ray_direction.
z;
892 float t1 = (aabb_min.
z - ray_origin.
z) * inv_dir_z;
893 float t2 = (aabb_max.
z - ray_origin.
z) * inv_dir_z;
895 float slab_tmin = std::min(t1, t2);
896 float slab_tmax = std::max(t1, t2);
898 tmin = std::max(tmin, slab_tmin);
899 tmax = std::min(tmax, slab_tmax);
905 if (ray_origin.
z < aabb_min.
z || ray_origin.
z > aabb_max.
z) {
920 if (rayPrimitiveIntersection(query.origin, query.direction, primitive_id, distance)) {
922 result.distance = distance;
923 result.primitive_UUID = primitive_id;
924 result.intersection_point = query.origin + query.direction * distance;
930 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
931 vec3 edge1 = vertices[1] - vertices[0];
932 vec3 edge2 = vertices[2] - vertices[0];
933 result.normal =
cross(edge1, edge2);
934 if (result.normal.magnitude() > 1e-8f) {
935 result.normal = result.normal / result.normal.magnitude();
937 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
939 }
else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 3) {
940 vec3 edge1 = vertices[1] - vertices[0];
941 vec3 edge2 = vertices[2] - vertices[0];
942 result.normal =
cross(edge1, edge2);
943 if (result.normal.magnitude() > 1e-8f) {
944 result.normal = result.normal / result.normal.magnitude();
946 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
950 result.normal =
make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
961 auto it = primitive_cache.find(primitive_id);
962 if (it == primitive_cache.end()) {
968 if (rayPrimitiveIntersection(origin, direction, primitive_id, distance)) {
970 result.distance = distance;
971 result.primitive_UUID = primitive_id;
972 result.intersection_point = origin + direction * distance;
978 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
979 vec3 edge1 = vertices[1] - vertices[0];
980 vec3 edge2 = vertices[2] - vertices[0];
981 result.normal =
cross(edge1, edge2);
982 if (result.normal.magnitude() > 1e-8f) {
983 result.normal = result.normal / result.normal.magnitude();
985 vec3 to_origin = origin - result.intersection_point;
986 if (result.normal * to_origin < 0) {
987 result.normal = result.normal * -1.0f;
990 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
991 result.normal = result.normal / result.normal.magnitude();
993 }
else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
994 vec3 edge1 = vertices[1] - vertices[0];
995 vec3 edge2 = vertices[2] - vertices[0];
996 result.normal =
cross(edge1, edge2);
997 if (result.normal.magnitude() > 1e-8f) {
998 result.normal = result.normal / result.normal.magnitude();
1000 vec3 to_origin = origin - result.intersection_point;
1001 if (result.normal * to_origin < 0) {
1002 result.normal = result.normal * -1.0f;
1005 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1006 result.normal = result.normal / result.normal.magnitude();
1009 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1010 result.normal = result.normal / result.normal.magnitude();
1016 const CachedPrimitive &cached = it->second;
1019 if (cached.type == PRIMITIVE_TYPE_TRIANGLE && cached.vertices.size() >= 3) {
1021 if (triangleIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], distance)) {
1022 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1024 result.distance = distance;
1025 result.primitive_UUID = primitive_id;
1026 result.intersection_point = origin + direction * distance;
1029 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1030 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1031 result.normal =
cross(edge1, edge2);
1032 if (result.normal.magnitude() > 1e-8f) {
1033 result.normal = result.normal / result.normal.magnitude();
1035 vec3 to_origin = origin - result.intersection_point;
1036 if (result.normal * to_origin < 0) {
1037 result.normal = result.normal * -1.0f;
1040 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1041 result.normal = result.normal / result.normal.magnitude();
1045 }
else if (cached.type == PRIMITIVE_TYPE_PATCH && cached.vertices.size() >= 4) {
1047 if (patchIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], cached.vertices[3], distance)) {
1048 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1050 result.distance = distance;
1051 result.primitive_UUID = primitive_id;
1052 result.intersection_point = origin + direction * distance;
1055 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1056 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1057 result.normal =
cross(edge1, edge2);
1058 if (result.normal.magnitude() > 1e-8f) {
1059 result.normal = result.normal / result.normal.magnitude();
1061 vec3 to_origin = origin - result.intersection_point;
1062 if (result.normal * to_origin < 0) {
1063 result.normal = result.normal * -1.0f;
1066 result.normal =
make_vec3(-direction.
x, -direction.
y, -direction.
z);
1067 result.normal = result.normal / result.normal.magnitude();
1071 }
else if (cached.type == PRIMITIVE_TYPE_VOXEL && cached.vertices.size() == 8) {
1074 vec3 aabb_min = cached.vertices[0];
1075 vec3 aabb_max = cached.vertices[0];
1077 for (
int i = 1; i < 8; i++) {
1078 aabb_min.
x = std::min(aabb_min.
x, cached.vertices[i].x);
1079 aabb_min.
y = std::min(aabb_min.
y, cached.vertices[i].y);
1080 aabb_min.
z = std::min(aabb_min.
z, cached.vertices[i].z);
1081 aabb_max.
x = std::max(aabb_max.
x, cached.vertices[i].x);
1082 aabb_max.
y = std::max(aabb_max.
y, cached.vertices[i].y);
1083 aabb_max.
z = std::max(aabb_max.
z, cached.vertices[i].z);
1087 float t_near = -std::numeric_limits<float>::max();
1088 float t_far = std::numeric_limits<float>::max();
1091 for (
int axis = 0; axis < 3; axis++) {
1092 float ray_dir_component = (axis == 0) ? direction.
x : (axis == 1) ? direction.y : direction.z;
1093 float ray_orig_component = (axis == 0) ? origin.
x : (axis == 1) ? origin.y : origin.z;
1094 float aabb_min_component = (axis == 0) ? aabb_min.
x : (axis == 1) ? aabb_min.y : aabb_min.z;
1095 float aabb_max_component = (axis == 0) ? aabb_max.
x : (axis == 1) ? aabb_max.y : aabb_max.z;
1097 if (std::abs(ray_dir_component) < 1e-8f) {
1099 if (ray_orig_component < aabb_min_component || ray_orig_component > aabb_max_component) {
1104 float t1 = (aabb_min_component - ray_orig_component) / ray_dir_component;
1105 float t2 = (aabb_max_component - ray_orig_component) / ray_dir_component;
1113 t_near = std::max(t_near, t1);
1114 t_far = std::min(t_far, t2);
1117 if (t_near > t_far) {
1124 if (t_far >= 0.0f) {
1126 float intersection_distance = (t_near >= 1e-6f) ? t_near : t_far;
1127 if (intersection_distance >= 1e-6f && (max_distance <= 0 || intersection_distance < max_distance)) {
1129 result.distance = intersection_distance;
1130 result.primitive_UUID = primitive_id;
1131 result.intersection_point = origin + direction * intersection_distance;
1135 vec3 hit_point = result.intersection_point;
1136 vec3 box_center = (aabb_min + aabb_max) * 0.5f;
1137 vec3 box_extent = (aabb_max - aabb_min) * 0.5f;
1140 vec3 local_hit = hit_point - box_center;
1141 vec3 abs_local_hit =
make_vec3(std::abs(local_hit.
x), std::abs(local_hit.
y), std::abs(local_hit.
z));
1144 float rel_x = abs_local_hit.
x / box_extent.
x;
1145 float rel_y = abs_local_hit.
y / box_extent.
y;
1146 float rel_z = abs_local_hit.
z / box_extent.
z;
1148 if (rel_x >= rel_y && rel_x >= rel_z) {
1150 result.normal =
make_vec3((local_hit.
x > 0) ? 1.0f : -1.0f, 0.0f, 0.0f);
1151 }
else if (rel_y >= rel_z) {
1153 result.normal =
make_vec3(0.0f, (local_hit.
y > 0) ? 1.0f : -1.0f, 0.0f);
1156 result.normal =
make_vec3(0.0f, 0.0f, (local_hit.
z > 0) ? 1.0f : -1.0f);
1166void CollisionDetection::buildPrimitiveCache() {
1167 primitive_cache.clear();
1170 std::vector<uint> all_primitives = context->
getAllUUIDs();
1173 for (
uint primitive_id: all_primitives) {
1179 primitive_cache[primitive_id] = CachedPrimitive(type, vertices);
1180 }
catch (
const std::exception &e) {
1183 if (printmessages) {
1184 std::cout <<
"Warning: Skipping primitive " << primitive_id <<
" in cache build (not accessible: " << e.what() <<
")" << std::endl;
1191 if (printmessages) {
1195bool CollisionDetection::triangleIntersect(
const vec3 &origin,
const vec3 &direction,
const vec3 &v0,
const vec3 &v1,
const vec3 &v2,
float &distance) {
1197 const float EPSILON = 1e-8f;
1200 float edge1_x = v1.
x - v0.
x, edge1_y = v1.
y - v0.
y, edge1_z = v1.
z - v0.
z;
1201 float edge2_x = v2.
x - v0.
x, edge2_y = v2.
y - v0.
y, edge2_z = v2.
z - v0.
z;
1204 float h_x = direction.
y * edge2_z - direction.
z * edge2_y;
1205 float h_y = direction.
z * edge2_x - direction.
x * edge2_z;
1206 float h_z = direction.
x * edge2_y - direction.
y * edge2_x;
1209 float a = edge1_x * h_x + edge1_y * h_y + edge1_z * h_z;
1211 if (a > -EPSILON && a < EPSILON) {
1218 float s_x = origin.
x - v0.
x, s_y = origin.
y - v0.
y, s_z = origin.
z - v0.
z;
1221 float u = f * (s_x * h_x + s_y * h_y + s_z * h_z);
1223 if (u < -EPSILON || u > 1.0f + EPSILON) {
1228 float q_x = s_y * edge1_z - s_z * edge1_y;
1229 float q_y = s_z * edge1_x - s_x * edge1_z;
1230 float q_z = s_x * edge1_y - s_y * edge1_x;
1233 float v = f * (direction.
x * q_x + direction.
y * q_y + direction.
z * q_z);
1235 if (v < -EPSILON || u + v > 1.0f + EPSILON) {
1240 float t = f * (edge2_x * q_x + edge2_y * q_y + edge2_z * q_z);
1250bool CollisionDetection::patchIntersect(
const vec3 &origin,
const vec3 &direction,
const vec3 &v0,
const vec3 &v1,
const vec3 &v2,
const vec3 &v3,
float &distance) {
1252 const float EPSILON = 1e-8f;
1263 float denom = direction * normal;
1264 if (std::abs(denom) > EPSILON) {
1265 float t = (anchor - origin) * normal / denom;
1267 if (t > EPSILON && t < 1e8f) {
1269 vec3 p = origin + direction * t;
1270 vec3 d = p - anchor;
1273 float ddota = d * a;
1274 float ddotb = d * b;
1277 if (ddota >= -EPSILON && ddota <= (a * a) + EPSILON && ddotb >= -EPSILON && ddotb <= (b * b) + EPSILON) {
1287#ifdef HELIOS_CUDA_AVAILABLE
1288#include <vector_types.h>
1296std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysGPU(
const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
1297 std::vector<HitResult> results;
1298 results.resize(ray_queries.size());
1300 if (ray_queries.empty() || !gpu_acceleration_enabled) {
1301 return castRaysSoA(ray_queries, stats);
1304 auto start = std::chrono::high_resolution_clock::now();
1307 if (bvh_nodes.empty()) {
1308 if (printmessages) {
1311 if (bvh_nodes.empty()) {
1312 helios_runtime_error(
"ERROR: BVH construction failed - no geometry available for ray tracing. Ensure primitives are properly added to the collision detection system.");
1317 std::vector<float> ray_origins(ray_queries.size() * 3);
1318 std::vector<float> ray_directions(ray_queries.size() * 3);
1319 std::vector<float> ray_max_distances(ray_queries.size());
1321 for (
size_t i = 0; i < ray_queries.size(); i++) {
1322 vec3 normalized_dir = ray_queries[i].direction;
1323 if (normalized_dir.
magnitude() > 1e-8f) {
1324 normalized_dir = normalized_dir / normalized_dir.
magnitude();
1327 ray_origins[i * 3] = ray_queries[i].origin.x;
1328 ray_origins[i * 3 + 1] = ray_queries[i].origin.y;
1329 ray_origins[i * 3 + 2] = ray_queries[i].origin.z;
1331 ray_directions[i * 3] = normalized_dir.
x;
1332 ray_directions[i * 3 + 1] = normalized_dir.
y;
1333 ray_directions[i * 3 + 2] = normalized_dir.
z;
1335 ray_max_distances[i] = ray_queries[i].max_distance;
1339 std::vector<uint> all_primitives = context->
getAllUUIDs();
1340 std::vector<unsigned int> primitive_indices(all_primitives.size());
1341 std::vector<int> primitive_types(all_primitives.size());
1342 std::vector<float3> primitive_vertices;
1343 std::vector<unsigned int> vertex_offsets(all_primitives.size());
1345 size_t triangle_count = 0;
1346 size_t patch_count = 0;
1347 size_t voxel_count = 0;
1349 if (printmessages) {
1353 size_t vertex_index = 0;
1354 for (
size_t i = 0; i < all_primitives.size(); i++) {
1355 primitive_indices[i] = all_primitives[i];
1356 vertex_offsets[i] = vertex_index;
1359 primitive_types[i] =
static_cast<int>(ptype);
1363 if (ptype == PRIMITIVE_TYPE_TRIANGLE) {
1365 if (vertices.size() >= 3) {
1367 for (
int v = 0; v < 3; v++) {
1368 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1370 primitive_vertices.push_back(make_float3(0, 0, 0));
1373 if (printmessages) {
1374 std::cout <<
"WARNING: Triangle primitive " << all_primitives[i] <<
" has " << vertices.size() <<
" vertices" << std::endl;
1377 for (
int v = 0; v < 4; v++) {
1378 primitive_vertices.push_back(make_float3(0, 0, 0));
1382 }
else if (ptype == PRIMITIVE_TYPE_PATCH) {
1384 if (vertices.size() >= 4) {
1386 for (
int v = 0; v < 4; v++) {
1387 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1391 if (printmessages) {
1392 std::cout <<
"WARNING: Patch primitive " << all_primitives[i] <<
" has " << vertices.size() <<
" vertices" << std::endl;
1395 for (
int v = 0; v < 4; v++) {
1396 primitive_vertices.push_back(make_float3(0, 0, 0));
1400 }
else if (ptype == PRIMITIVE_TYPE_VOXEL) {
1404 vec3 voxel_min = vertices[0];
1405 vec3 voxel_max = vertices[0];
1408 for (
const auto &vertex: vertices) {
1409 voxel_min.
x = std::min(voxel_min.
x, vertex.x);
1410 voxel_min.
y = std::min(voxel_min.
y, vertex.y);
1411 voxel_min.
z = std::min(voxel_min.
z, vertex.z);
1412 voxel_max.
x = std::max(voxel_max.
x, vertex.x);
1413 voxel_max.
y = std::max(voxel_max.
y, vertex.y);
1414 voxel_max.
z = std::max(voxel_max.
z, vertex.z);
1418 primitive_vertices.push_back(make_float3(voxel_min.
x, voxel_min.
y, voxel_min.
z));
1419 primitive_vertices.push_back(make_float3(voxel_max.
x, voxel_max.
y, voxel_max.
z));
1420 primitive_vertices.push_back(make_float3(0, 0, 0));
1421 primitive_vertices.push_back(make_float3(0, 0, 0));
1425 for (
int v = 0; v < 4; v++) {
1426 primitive_vertices.push_back(make_float3(0, 0, 0));
1432 if (printmessages) {
1435 if (all_primitives.empty()) {
1436 if (printmessages) {
1437 std::cout <<
"No primitives found for GPU ray tracing, falling back to CPU" << std::endl;
1439 return castRaysSoA(ray_queries, stats);
1444 std::vector<float> hit_distances(ray_queries.size());
1445 std::vector<unsigned int> hit_primitive_ids(ray_queries.size());
1446 std::vector<unsigned int> hit_counts(ray_queries.size());
1449 std::vector<GPUBVHNode> gpu_bvh_nodes(bvh_nodes.size());
1450 for (
size_t i = 0; i < bvh_nodes.size(); i++) {
1451 gpu_bvh_nodes[i].aabb_min = make_float3(bvh_nodes[i].aabb_min.
x, bvh_nodes[i].aabb_min.y, bvh_nodes[i].aabb_min.z);
1452 gpu_bvh_nodes[i].aabb_max = make_float3(bvh_nodes[i].aabb_max.
x, bvh_nodes[i].aabb_max.y, bvh_nodes[i].aabb_max.z);
1453 gpu_bvh_nodes[i].left_child = bvh_nodes[i].left_child;
1454 gpu_bvh_nodes[i].right_child = bvh_nodes[i].right_child;
1455 gpu_bvh_nodes[i].primitive_start = bvh_nodes[i].primitive_start;
1456 gpu_bvh_nodes[i].primitive_count = bvh_nodes[i].primitive_count;
1457 gpu_bvh_nodes[i].is_leaf = bvh_nodes[i].is_leaf ? 1 : 0;
1458 gpu_bvh_nodes[i].padding = 0;
1461 if (bvh_nodes[i].is_leaf && printmessages && i < 3) {
1462 std::cout <<
"BVH leaf node " << i <<
": primitive_start=" << bvh_nodes[i].primitive_start <<
", primitive_count=" << bvh_nodes[i].primitive_count <<
", max_index=" << (bvh_nodes[i].primitive_start + bvh_nodes[i].primitive_count - 1)
1469 static_cast<int>(gpu_bvh_nodes.size()),
1470 primitive_indices.data(),
1471 static_cast<int>(all_primitives.size()),
1472 primitive_types.data(),
1473 primitive_vertices.data(),
1474 vertex_offsets.data(),
1475 static_cast<int>(primitive_vertices.size()),
1477 ray_directions.data(),
1478 ray_max_distances.data(),
1479 static_cast<int>(ray_queries.size()),
1480 hit_distances.data(),
1481 hit_primitive_ids.data(),
1487 size_t hit_count = 0;
1488 for (
size_t i = 0; i < ray_queries.size(); i++) {
1489 if (hit_counts[i] > 0 && hit_distances[i] <= ray_queries[i].max_distance) {
1490 results[i].hit =
true;
1491 results[i].primitive_UUID = hit_primitive_ids[i];
1492 results[i].distance = hit_distances[i];
1493 results[i].intersection_point = ray_queries[i].origin + ray_queries[i].direction * hit_distances[i];
1496 results[i].hit =
false;
1497 results[i].primitive_UUID = 0;
1498 results[i].distance = std::numeric_limits<float>::max();
1502 auto end = std::chrono::high_resolution_clock::now();
1503 double elapsed = std::chrono::duration<double, std::milli>(end - start).count();
1505 stats.total_rays_cast = ray_queries.size();
1506 stats.total_hits = hit_count;
1507 double rays_per_second = ray_queries.size() / (elapsed / 1000.0);
1517uint32_t CollisionDetection::rayAABBIntersectSIMD(
const vec3 *ray_origins,
const vec3 *ray_directions,
const vec3 *aabb_mins,
const vec3 *aabb_maxs,
float *t_mins,
float *t_maxs,
int count) {
1521 uint32_t hit_mask = 0;
1523 for (
int i = 0; i < 8; i += 8) {
1525 __m256 orig_x = _mm256_set_ps(ray_origins[i + 7].x, ray_origins[i + 6].x, ray_origins[i + 5].x, ray_origins[i + 4].x, ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1526 __m256 orig_y = _mm256_set_ps(ray_origins[i + 7].y, ray_origins[i + 6].y, ray_origins[i + 5].y, ray_origins[i + 4].y, ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1527 __m256 orig_z = _mm256_set_ps(ray_origins[i + 7].z, ray_origins[i + 6].z, ray_origins[i + 5].z, ray_origins[i + 4].z, ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1530 __m256 dir_x = _mm256_set_ps(ray_directions[i + 7].x, ray_directions[i + 6].x, ray_directions[i + 5].x, ray_directions[i + 4].x, ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1531 __m256 dir_y = _mm256_set_ps(ray_directions[i + 7].y, ray_directions[i + 6].y, ray_directions[i + 5].y, ray_directions[i + 4].y, ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1532 __m256 dir_z = _mm256_set_ps(ray_directions[i + 7].z, ray_directions[i + 6].z, ray_directions[i + 5].z, ray_directions[i + 4].z, ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1535 __m256 aabb_min_x = _mm256_set_ps(aabb_mins[i + 7].x, aabb_mins[i + 6].x, aabb_mins[i + 5].x, aabb_mins[i + 4].x, aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1536 __m256 aabb_min_y = _mm256_set_ps(aabb_mins[i + 7].y, aabb_mins[i + 6].y, aabb_mins[i + 5].y, aabb_mins[i + 4].y, aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1537 __m256 aabb_min_z = _mm256_set_ps(aabb_mins[i + 7].z, aabb_mins[i + 6].z, aabb_mins[i + 5].z, aabb_mins[i + 4].z, aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1540 __m256 aabb_max_x = _mm256_set_ps(aabb_maxs[i + 7].x, aabb_maxs[i + 6].x, aabb_maxs[i + 5].x, aabb_maxs[i + 4].x, aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1541 __m256 aabb_max_y = _mm256_set_ps(aabb_maxs[i + 7].y, aabb_maxs[i + 6].y, aabb_maxs[i + 5].y, aabb_maxs[i + 4].y, aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1542 __m256 aabb_max_z = _mm256_set_ps(aabb_maxs[i + 7].z, aabb_maxs[i + 6].z, aabb_maxs[i + 5].z, aabb_maxs[i + 4].z, aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1545 __m256 inv_dir_x = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_x);
1546 __m256 inv_dir_y = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_y);
1547 __m256 inv_dir_z = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_z);
1550 __m256 t1_x = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1551 __m256 t2_x = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1552 __m256 tmin_x = _mm256_min_ps(t1_x, t2_x);
1553 __m256 tmax_x = _mm256_max_ps(t1_x, t2_x);
1556 __m256 t1_y = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1557 __m256 t2_y = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1558 __m256 tmin_y = _mm256_min_ps(t1_y, t2_y);
1559 __m256 tmax_y = _mm256_max_ps(t1_y, t2_y);
1562 __m256 t1_z = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1563 __m256 t2_z = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1564 __m256 tmin_z = _mm256_min_ps(t1_z, t2_z);
1565 __m256 tmax_z = _mm256_max_ps(t1_z, t2_z);
1568 __m256 t_min_final = _mm256_max_ps(_mm256_max_ps(tmin_x, tmin_y), tmin_z);
1569 __m256 t_max_final = _mm256_min_ps(_mm256_min_ps(tmax_x, tmax_y), tmax_z);
1572 _mm256_store_ps(&t_mins[i], t_min_final);
1573 _mm256_store_ps(&t_maxs[i], t_max_final);
1576 __m256 zero = _mm256_set1_ps(0.0f);
1577 __m256 hits = _mm256_and_ps(_mm256_cmp_ps(t_max_final, zero, _CMP_GE_OS), _mm256_cmp_ps(t_min_final, t_max_final, _CMP_LE_OS));
1580 hit_mask |= _mm256_movemask_ps(hits);
1590 uint32_t hit_mask = 0;
1592 for (
int i = 0; i < 4; i += 4) {
1594 __m128 orig_x = _mm_set_ps(ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1595 __m128 orig_y = _mm_set_ps(ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1596 __m128 orig_z = _mm_set_ps(ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1599 __m128 dir_x = _mm_set_ps(ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1600 __m128 dir_y = _mm_set_ps(ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1601 __m128 dir_z = _mm_set_ps(ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1604 __m128 aabb_min_x = _mm_set_ps(aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1605 __m128 aabb_min_y = _mm_set_ps(aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1606 __m128 aabb_min_z = _mm_set_ps(aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1609 __m128 aabb_max_x = _mm_set_ps(aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1610 __m128 aabb_max_y = _mm_set_ps(aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1611 __m128 aabb_max_z = _mm_set_ps(aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1614 __m128 inv_dir_x = _mm_div_ps(_mm_set1_ps(1.0f), dir_x);
1615 __m128 inv_dir_y = _mm_div_ps(_mm_set1_ps(1.0f), dir_y);
1616 __m128 inv_dir_z = _mm_div_ps(_mm_set1_ps(1.0f), dir_z);
1619 __m128 t1_x = _mm_mul_ps(_mm_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1620 __m128 t2_x = _mm_mul_ps(_mm_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1621 __m128 tmin_x = _mm_min_ps(t1_x, t2_x);
1622 __m128 tmax_x = _mm_max_ps(t1_x, t2_x);
1625 __m128 t1_y = _mm_mul_ps(_mm_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1626 __m128 t2_y = _mm_mul_ps(_mm_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1627 __m128 tmin_y = _mm_min_ps(t1_y, t2_y);
1628 __m128 tmax_y = _mm_max_ps(t1_y, t2_y);
1631 __m128 t1_z = _mm_mul_ps(_mm_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1632 __m128 t2_z = _mm_mul_ps(_mm_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1633 __m128 tmin_z = _mm_min_ps(t1_z, t2_z);
1634 __m128 tmax_z = _mm_max_ps(t1_z, t2_z);
1637 __m128 t_min_final = _mm_max_ps(_mm_max_ps(tmin_x, tmin_y), tmin_z);
1638 __m128 t_max_final = _mm_min_ps(_mm_min_ps(tmax_x, tmax_y), tmax_z);
1641 _mm_store_ps(&t_mins[i], t_min_final);
1642 _mm_store_ps(&t_maxs[i], t_max_final);
1645 __m128 zero = _mm_set1_ps(0.0f);
1646 __m128 hits = _mm_and_ps(_mm_cmpge_ps(t_max_final, zero), _mm_cmple_ps(t_min_final, t_max_final));
1649 hit_mask |= _mm_movemask_ps(hits);
1657 uint32_t hit_mask = 0;
1658 for (
int i = 0; i < count; ++i) {
1659 if (rayAABBIntersect(ray_origins[i], ray_directions[i], aabb_mins[i], aabb_maxs[i], t_mins[i], t_maxs[i])) {
1660 hit_mask |= (1 << i);
1666void CollisionDetection::traverseBVHSIMD(
const vec3 *ray_origins,
const vec3 *ray_directions,
int count, HitResult *results) {
1667 if (bvh_nodes.empty()) {
1669 for (
int i = 0; i < count; ++i) {
1670 results[i] = HitResult();
1676 int simd_batch_size = 1;
1678 simd_batch_size = 8;
1679#elif defined(__SSE4_1__)
1680 simd_batch_size = 4;
1684 for (
int batch_start = 0; batch_start < count; batch_start += simd_batch_size) {
1685 int batch_count = std::min(simd_batch_size, count - batch_start);
1688 for (
int i = 0; i < batch_count; ++i) {
1689 results[batch_start + i] = HitResult();
1692 if (batch_count >= simd_batch_size && simd_batch_size > 1) {
1694 traverseBVHSIMDImpl(&ray_origins[batch_start], &ray_directions[batch_start], batch_count, &results[batch_start]);
1697 for (
int i = 0; i < batch_count; ++i) {
1698 int ray_idx = batch_start + i;
1699 results[ray_idx] =
castRay(RayQuery(ray_origins[ray_idx], ray_directions[ray_idx]));
1705void CollisionDetection::traverseBVHSIMDImpl(
const vec3 *ray_origins,
const vec3 *ray_directions,
int count, HitResult *results) {
1706 const size_t MAX_STACK_SIZE = 64;
1709 alignas(32) uint32_t node_stacks[8][MAX_STACK_SIZE];
1710 alignas(32) uint32_t stack_tops[8] = {0};
1711 alignas(32)
float closest_distances[8];
1712 alignas(32)
bool ray_active[8];
1715 for (
int i = 0; i < count; ++i) {
1716 node_stacks[i][0] = 0;
1718 closest_distances[i] = std::numeric_limits<float>::max();
1719 ray_active[i] =
true;
1720 results[i] = HitResult();
1725 bool any_active =
false;
1726 for (
int i = 0; i < count; ++i) {
1727 if (ray_active[i] && stack_tops[i] > 0) {
1736 alignas(32)
vec3 test_aabb_mins[8];
1737 alignas(32)
vec3 test_aabb_maxs[8];
1738 alignas(32) uint32_t test_node_indices[8];
1739 alignas(32)
int test_ray_indices[8];
1742 for (
int i = 0; i < count; ++i) {
1743 if (ray_active[i] && stack_tops[i] > 0) {
1744 uint32_t node_idx = node_stacks[i][--stack_tops[i]];
1745 const BVHNode &node = bvh_nodes[node_idx];
1747 test_aabb_mins[test_count] = node.aabb_min;
1748 test_aabb_maxs[test_count] = node.aabb_max;
1749 test_node_indices[test_count] = node_idx;
1750 test_ray_indices[test_count] = i;
1753 if (test_count == count)
1758 if (test_count == 0)
1762 alignas(32)
vec3 batch_origins[8];
1763 alignas(32)
vec3 batch_directions[8];
1764 alignas(32)
float t_mins[8];
1765 alignas(32)
float t_maxs[8];
1767 for (
int i = 0; i < test_count; ++i) {
1768 int ray_idx = test_ray_indices[i];
1769 batch_origins[i] = ray_origins[ray_idx];
1770 batch_directions[i] = ray_directions[ray_idx];
1774 uint32_t hit_mask = rayAABBIntersectSIMD(batch_origins, batch_directions, test_aabb_mins, test_aabb_maxs, t_mins, t_maxs, test_count);
1777 for (
int i = 0; i < test_count; ++i) {
1778 if (!(hit_mask & (1 << i)))
1781 int ray_idx = test_ray_indices[i];
1782 uint32_t node_idx = test_node_indices[i];
1783 const BVHNode &node = bvh_nodes[node_idx];
1785 if (t_mins[i] > closest_distances[ray_idx])
1790 for (uint32_t prim_idx = node.primitive_start; prim_idx < node.primitive_start + node.primitive_count; ++prim_idx) {
1792 uint32_t primitive_id = primitive_indices[prim_idx];
1795 HitResult prim_result = intersectPrimitiveThreadSafe(batch_origins[i], batch_directions[i], primitive_id, closest_distances[ray_idx]);
1796 if (prim_result.hit && prim_result.distance < closest_distances[ray_idx]) {
1797 closest_distances[ray_idx] = prim_result.distance;
1798 results[ray_idx] = prim_result;
1803 if (stack_tops[ray_idx] < MAX_STACK_SIZE - 2) {
1804 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.left_child;
1805 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.right_child;
1808 ray_active[ray_idx] =
false;
1821 const float EPSILON = 1e-8f;
1824 float t_min_x = (aabb_min.
x - origin.
x) / direction.
x;
1825 float t_max_x = (aabb_max.
x - origin.
x) / direction.
x;
1828 if (direction.
x < 0.0f) {
1829 float temp = t_min_x;
1834 float t_min_y = (aabb_min.
y - origin.
y) / direction.
y;
1835 float t_max_y = (aabb_max.
y - origin.
y) / direction.
y;
1837 if (direction.
y < 0.0f) {
1838 float temp = t_min_y;
1844 float t_min = std::max(t_min_x, t_min_y);
1845 float t_max = std::min(t_max_x, t_max_y);
1847 if (t_min > t_max) {
1851 float t_min_z = (aabb_min.
z - origin.
z) / direction.
z;
1852 float t_max_z = (aabb_max.
z - origin.
z) / direction.
z;
1854 if (direction.
z < 0.0f) {
1855 float temp = t_min_z;
1861 t_min = std::max(t_min, t_min_z);
1862 t_max = std::min(t_max, t_max_z);
1864 if (t_min > t_max || t_max < EPSILON) {
1869 distance = (t_min > EPSILON) ? t_min : t_max;
1871 return distance > EPSILON;