1.3.49
 
Loading...
Searching...
No Matches
CollisionDetection_RayTracing.cpp
Go to the documentation of this file.
1
16#include <chrono>
17#include <functional>
18#include <limits>
19#include <queue>
20#include <stack>
21#include <thread>
22#include "CollisionDetection.h"
23
24// SIMD headers
25#ifdef __AVX2__
26#include <immintrin.h>
27#elif defined(__SSE4_1__)
28#include <smmintrin.h>
29#elif defined(__SSE2__)
30#include <emmintrin.h>
31#endif
32
33// MSVC prefetch support
34#ifdef _MSC_VER
35#include <intrin.h>
36#endif
37#include <algorithm>
38
39#ifdef HELIOS_CUDA_AVAILABLE
40#include <cuda_runtime.h>
41#endif
42
43using namespace helios;
44
45#ifdef HELIOS_CUDA_AVAILABLE
46// GPU BVH node structure (must match the one in .cu file)
47struct GPUBVHNode {
48 float3 aabb_min, aabb_max;
49 unsigned int left_child, right_child;
50 unsigned int primitive_start, primitive_count;
51 unsigned int is_leaf, padding;
52};
53
54// External CUDA functions
55extern "C" {
56void launchBVHTraversal(void *h_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, float *h_primitive_aabb_min, float *h_primitive_aabb_max, float *h_query_aabb_min, float *h_query_aabb_max, int num_queries,
57 unsigned int *h_results, unsigned int *h_result_counts, int max_results_per_query);
58void launchVoxelRayPathLengths(int num_rays, float *h_ray_origins, float *h_ray_directions, float grid_center_x, float grid_center_y, float grid_center_z, float grid_size_x, float grid_size_y, float grid_size_z, int grid_divisions_x,
59 int grid_divisions_y, int grid_divisions_z, int primitive_count, int *h_voxel_ray_counts, float *h_voxel_path_lengths, int *h_voxel_transmitted, int *h_voxel_hit_before, int *h_voxel_hit_after, int *h_voxel_hit_inside);
60// Warp-efficient GPU kernels
61void launchWarpEfficientBVH(void *h_bvh_soa_gpu, unsigned int *h_primitive_indices, int primitive_count, float *h_primitive_aabb_min, float *h_primitive_aabb_max, float *h_ray_origins, float *h_ray_directions, float *h_ray_max_distances,
62 int num_rays, unsigned int *h_results, unsigned int *h_result_counts, int max_results_per_ray);
63// High-performance ray-triangle intersection kernel
64void launchRayPrimitiveIntersection(void *h_bvh_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, int *h_primitive_types, float3 *h_primitive_vertices, unsigned int *h_vertex_offsets, int total_vertex_count,
65 float *h_ray_origins, float *h_ray_directions, float *h_ray_max_distances, int num_rays, float *h_hit_distances, unsigned int *h_hit_primitive_ids, unsigned int *h_hit_counts, bool find_closest_hit);
66}
67
68// Helper function to convert helios::vec3 to float3
69inline float3 heliosVecToFloat3(const helios::vec3 &v) {
70 return make_float3(v.x, v.y, v.z);
71}
72#endif
73
74// -------- GENERIC RAY-TRACING IMPLEMENTATIONS --------
75
77 return castRay(ray_query.origin, ray_query.direction, ray_query.max_distance, ray_query.target_UUIDs);
78}
79
80CollisionDetection::HitResult CollisionDetection::castRay(const vec3 &origin, const vec3 &direction, float max_distance, const std::vector<uint> &target_UUIDs) {
81 HitResult result;
82
83 // Normalize direction vector
84 vec3 ray_direction = direction;
85 if (ray_direction.magnitude() < 1e-8f) {
86 return result; // Invalid direction
87 }
88 ray_direction = ray_direction / ray_direction.magnitude();
89
90 // Ensure BVH is current before ray casting (handles automatic rebuilds)
91 const_cast<CollisionDetection *>(this)->ensureBVHCurrent();
92
93 // CRITICAL FIX: Use BVH traversal when available for performance
94 // This fixes the 10x+ performance regression by replacing brute-force primitive testing
95 if (!bvh_nodes.empty()) {
96 // Build primitive cache for thread-safe access if not already built
97 if (primitive_cache.empty()) {
98 buildPrimitiveCache();
99 }
100
101 // Use BVH traversal for both filtered and unfiltered queries - BVH supports UUID filtering
102 RayQuery query(origin, ray_direction, max_distance, target_UUIDs);
103 return castRayBVHTraversal(query);
104 }
105
106 // FALLBACK: Brute-force primitive testing (only when no BVH available or target UUIDs specified)
107 // This is the slow path that was causing the performance regression
108 std::vector<uint> search_primitives;
109 if (target_UUIDs.empty()) {
110 // Use cached primitives if available (thread-safe), otherwise use context (thread-unsafe)
111 if (!primitive_cache.empty()) {
112 // Use cached primitive IDs for thread-safe operation
113 search_primitives.reserve(primitive_cache.size());
114 for (const auto &cached_pair: primitive_cache) {
115 search_primitives.push_back(cached_pair.first);
116 }
117 } else {
118 // Fall back to context call (thread-unsafe)
119 search_primitives = context->getAllUUIDs();
120 }
121 } else {
122 search_primitives = target_UUIDs;
123 }
124
125 // Find nearest intersection
126 float nearest_distance = std::numeric_limits<float>::max();
127 if (max_distance > 0) {
128 nearest_distance = max_distance;
129 }
130
131 uint hit_primitive = 0;
132 bool found_intersection = false;
133
134 // Test each primitive for intersection
135 for (uint candidate_uuid: search_primitives) {
136 float intersection_distance;
137 bool hit = false;
138
139 // Use thread-safe cached intersection if available
140 if (!primitive_cache.empty()) {
141 HitResult primitive_result = intersectPrimitiveThreadSafe(origin, ray_direction, candidate_uuid, max_distance);
142 if (primitive_result.hit) {
143 hit = true;
144 intersection_distance = primitive_result.distance;
145 }
146 } else {
147 // Fall back to thread-unsafe context call
148 if (!context->doesPrimitiveExist(candidate_uuid)) {
149 continue;
150 }
151 hit = rayPrimitiveIntersection(origin, ray_direction, candidate_uuid, intersection_distance);
152 }
153
154 if (hit) {
155 // Check distance constraints
156 if (intersection_distance > 1e-6f && // Avoid self-intersection
157 intersection_distance < nearest_distance) { // Find nearest
158
159 nearest_distance = intersection_distance;
160 hit_primitive = candidate_uuid;
161 found_intersection = true;
162 }
163 }
164 }
165
166 // Fill result
167 if (found_intersection) {
168 result.hit = true;
169 result.distance = nearest_distance;
170 result.primitive_UUID = hit_primitive;
171 result.intersection_point = origin + ray_direction * nearest_distance;
172
173 // Calculate surface normal
174 try {
175 PrimitiveType type = context->getPrimitiveType(hit_primitive);
176 std::vector<vec3> vertices = context->getPrimitiveVertices(hit_primitive);
177
178 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
179 // Calculate triangle normal
180 vec3 v0 = vertices[0];
181 vec3 v1 = vertices[1];
182 vec3 v2 = vertices[2];
183 vec3 edge1 = v1 - v0;
184 vec3 edge2 = v2 - v0;
185 result.normal = cross(edge1, edge2);
186 result.normal = result.normal / result.normal.magnitude();
187 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
188 // Calculate patch normal (assuming quad)
189 vec3 v0 = vertices[0];
190 vec3 v1 = vertices[1];
191 vec3 v2 = vertices[2];
192 vec3 edge1 = v1 - v0;
193 vec3 edge2 = v2 - v0;
194 result.normal = cross(edge1, edge2);
195 result.normal = result.normal / result.normal.magnitude();
196 } else {
197 // Default normal (pointing back along ray)
198 result.normal = make_vec3(-ray_direction.x, -ray_direction.y, -ray_direction.z);
199 }
200 } catch (const std::exception &e) {
201 // If we can't get surface normal, use default
202 result.normal = make_vec3(-ray_direction.x, -ray_direction.y, -ray_direction.z);
203 }
204 }
205
206 return result;
207}
208
209
210std::vector<CollisionDetection::HitResult> CollisionDetection::castRays(const std::vector<RayQuery> &ray_queries, RayTracingStats *stats) {
211 std::vector<HitResult> results;
212 results.reserve(ray_queries.size());
213
214 // Initialize statistics
215 RayTracingStats local_stats;
216 local_stats.total_rays_cast = ray_queries.size();
217
218 // Smart CPU/GPU selection based on ray count and scene complexity
219 // Performance analysis shows CPU is faster for small batches (<1000 rays) due to
220 // GPU launch overhead, while GPU provides significant speedup for large batches
221 // TEMPORARY: Force CPU for LiDAR until GPU normal calculation is implemented
222 const size_t GPU_BATCH_THRESHOLD = 1000000; // Use GPU for batches larger than this (effectively disabled)
223 const size_t MIN_PRIMITIVES_FOR_GPU = 500; // Keep CPU path for stability
224
225#ifdef HELIOS_CUDA_AVAILABLE
226 bool use_gpu = gpu_acceleration_enabled && ray_queries.size() >= GPU_BATCH_THRESHOLD && d_bvh_nodes != nullptr && !primitive_indices.empty() && primitive_indices.size() >= MIN_PRIMITIVES_FOR_GPU;
227
228 if (use_gpu) {
229 // Use GPU acceleration for large batches with sufficient scene complexity
230 castRaysGPU(ray_queries, results, local_stats);
231 } else {
232 // Use CPU for small batches or simple scenes
233 castRaysCPU(ray_queries, results, local_stats);
234 }
235#else
236 // Use CPU implementation when GPU not available
237 castRaysCPU(ray_queries, results, local_stats);
238#endif
239
240 // Copy statistics to output parameter if provided
241 if (stats != nullptr) {
242 *stats = local_stats;
243 }
244
245 return results;
246}
247
248void CollisionDetection::castRaysCPU(const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
249 // Use optimized batch processing - fail explicitly if there are issues
250 results = castRaysOptimized(ray_queries, &stats);
251}
252
253#ifdef HELIOS_CUDA_AVAILABLE
254void CollisionDetection::castRaysGPU(const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
255 // Use the high-performance GPU ray-triangle intersection implementation
256 results = castRaysGPU(ray_queries, stats);
257}
258#endif
259
260std::vector<std::vector<std::vector<std::vector<CollisionDetection::HitResult>>>> CollisionDetection::performGridRayIntersection(const vec3 &grid_center, const vec3 &grid_size, const helios::int3 &grid_divisions,
261 const std::vector<RayQuery> &ray_queries) {
262
263 // Initialize result grid
264 std::vector<std::vector<std::vector<std::vector<HitResult>>>> grid_results;
265 grid_results.resize(grid_divisions.x);
266 for (int i = 0; i < grid_divisions.x; i++) {
267 grid_results[i].resize(grid_divisions.y);
268 for (int j = 0; j < grid_divisions.y; j++) {
269 grid_results[i][j].resize(grid_divisions.z);
270 }
271 }
272
273 // Calculate voxel size
274 vec3 voxel_size = make_vec3(grid_size.x / float(grid_divisions.x), grid_size.y / float(grid_divisions.y), grid_size.z / float(grid_divisions.z));
275
276 // Process each ray
277 for (const auto &query: ray_queries) {
278 HitResult hit_result = castRay(query);
279
280 if (hit_result.hit) {
281 // Determine which voxel the hit point falls into
282 vec3 relative_pos = hit_result.intersection_point - (grid_center - grid_size * 0.5f);
283
284 int voxel_i = int(relative_pos.x / voxel_size.x);
285 int voxel_j = int(relative_pos.y / voxel_size.y);
286 int voxel_k = int(relative_pos.z / voxel_size.z);
287
288 // Check bounds
289 if (voxel_i >= 0 && voxel_i < grid_divisions.x && voxel_j >= 0 && voxel_j < grid_divisions.y && voxel_k >= 0 && voxel_k < grid_divisions.z) {
290
291 grid_results[voxel_i][voxel_j][voxel_k].push_back(hit_result);
292 }
293 }
294 }
295
296 return grid_results;
297}
298
299std::vector<std::vector<CollisionDetection::HitResult>> CollisionDetection::calculateVoxelPathLengths(const vec3 &scan_origin, const std::vector<vec3> &ray_directions, const std::vector<vec3> &voxel_centers, const std::vector<vec3> &voxel_sizes) {
300
301 if (ray_directions.empty()) {
302 if (printmessages) {
303 std::cout << "WARNING (CollisionDetection::calculateVoxelPathLengths): No rays provided" << std::endl;
304 }
305 return std::vector<std::vector<HitResult>>();
306 }
307
308 if (voxel_centers.size() != voxel_sizes.size()) {
309 helios_runtime_error("ERROR (CollisionDetection::calculateVoxelPathLengths): voxel_centers and voxel_sizes vectors must have same size");
310 }
311
312 if (voxel_centers.empty()) {
313 if (printmessages) {
314 std::cout << "WARNING (CollisionDetection::calculateVoxelPathLengths): No voxels provided" << std::endl;
315 }
316 return std::vector<std::vector<HitResult>>();
317 }
318
319 const size_t num_rays = ray_directions.size();
320 const size_t num_voxels = voxel_centers.size();
321
322 if (printmessages) {
323 std::cout << "Calculating voxel path lengths for " << num_rays << " rays through " << num_voxels << " voxels..." << std::endl;
324 }
325
326 // Initialize result structure - one vector of HitResults per voxel
327 std::vector<std::vector<HitResult>> result(num_voxels);
328
329// OpenMP parallel loop over rays for performance
330#pragma omp parallel for schedule(dynamic, 1000)
331 for (int ray_idx = 0; ray_idx < static_cast<int>(num_rays); ++ray_idx) {
332 const vec3 &ray_direction = ray_directions[ray_idx];
333
334 // Process each voxel for this ray
335 for (size_t voxel_idx = 0; voxel_idx < num_voxels; ++voxel_idx) {
336 const vec3 &voxel_center = voxel_centers[voxel_idx];
337 const vec3 &voxel_size = voxel_sizes[voxel_idx];
338
339 // Calculate voxel AABB from center and size
340 const vec3 half_size = voxel_size * 0.5f;
341 const vec3 voxel_min = voxel_center - half_size;
342 const vec3 voxel_max = voxel_center + half_size;
343
344 // Perform ray-AABB intersection test
345 float t_min, t_max;
346 if (rayAABBIntersect(scan_origin, ray_direction, voxel_min, voxel_max, t_min, t_max)) {
347 // Calculate path length: t_max - max(0, t_min)
348 const float path_length = t_max - std::max(0.0f, t_min);
349
350 if (path_length > 1e-6f) { // Only count meaningful intersections
351 // Create HitResult with path length information
352 HitResult hit_result;
353 hit_result.hit = false; // No actual primitive hit, just voxel traversal
354 hit_result.distance = -1.0f; // Not applicable for voxel traversal
355 hit_result.primitive_UUID = 0; // No primitive
356 hit_result.intersection_point = make_vec3(0, 0, 0); // Not applicable
357 hit_result.normal = make_vec3(0, 0, 0); // Not applicable
358 hit_result.path_length = path_length; // This is what we want!
359
360// Thread-safe update of results
361#pragma omp critical
362 {
363 result[voxel_idx].push_back(hit_result);
364 }
365 }
366 }
367 }
368 }
369
370 if (printmessages) {
371 size_t total_intersections = 0;
372 for (size_t i = 0; i < num_voxels; ++i) {
373 total_intersections += result[i].size();
374 }
375 std::cout << "Completed voxel path length calculations. Total ray-voxel intersections: " << total_intersections << std::endl;
376 }
377
378 return result;
379}
380
381void CollisionDetection::calculateRayPathLengthsDetailed(const vec3 &grid_center, const vec3 &grid_size, const helios::int3 &grid_divisions, const std::vector<vec3> &ray_origins, const std::vector<vec3> &ray_directions,
382 std::vector<HitResult> &hit_results) {
383
384 hit_results.clear();
385 hit_results.reserve(ray_origins.size());
386
387 if (ray_origins.size() != ray_directions.size()) {
388 helios_runtime_error("ERROR (CollisionDetection::calculateRayPathLengthsDetailed): ray_origins and ray_directions must have the same size");
389 return;
390 }
391
392 // Also update the existing voxel data structures
393 calculateVoxelRayPathLengths(grid_center, grid_size, grid_divisions, ray_origins, ray_directions);
394
395 // Cast each ray and collect detailed results
396 for (size_t i = 0; i < ray_origins.size(); i++) {
397 RayQuery query(ray_origins[i], ray_directions[i]);
398 HitResult result = castRay(query);
399 hit_results.push_back(result);
400 }
401}
402
403// ================================================================
404// PHASE 2 OPTIMIZATION METHODS: Structure-of-Arrays & Quantization
405// ================================================================
406
408 if (mode == bvh_optimization_mode) {
409 return; // No change needed
410 }
411
412 BVHOptimizationMode old_mode = bvh_optimization_mode;
413 bvh_optimization_mode = mode;
414
415 // Build optimized structures when mode changes
416 if (old_mode != mode && !bvh_nodes.empty()) {
417 if (printmessages) {
418 std::cout << "CollisionDetection: Converting BVH from mode " << static_cast<int>(old_mode) << " to mode " << static_cast<int>(mode) << std::endl;
419 }
420
421 // Build the optimized structures immediately
422 ensureOptimizedBVH();
423
424 if (printmessages) {
425 auto memory_stats = getBVHMemoryUsage();
426 std::cout << "CollisionDetection: Memory usage - SoA: " << memory_stats.soa_memory_bytes << " bytes, Quantized: " << memory_stats.quantized_memory_bytes << " bytes (" << memory_stats.quantized_reduction_percent << "% reduction)"
427 << std::endl;
428 }
429 }
430}
431
435
436void CollisionDetection::convertBVHLayout(BVHOptimizationMode from_mode, BVHOptimizationMode to_mode) {
437 // With only SOA_UNCOMPRESSED mode remaining, no conversion needed
438 return;
439}
440
441
442std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysOptimized(const std::vector<RayQuery> &ray_queries, RayTracingStats *stats) {
443 if (ray_queries.empty()) {
444 return {};
445 }
446
447 // Ensure BVH is current and optimized structures are available
448 ensureBVHCurrent();
449 ensureOptimizedBVH();
450
451 // Build primitive cache for high-performance thread-safe primitive intersection
452 if (primitive_cache.empty()) {
453 buildPrimitiveCache();
454 }
455
456 RayTracingStats local_stats;
457 std::vector<HitResult> results;
458 results.reserve(ray_queries.size());
459
460 auto start_time = std::chrono::high_resolution_clock::now();
461
462 // Dispatch to appropriate optimized method based on current mode
463 switch (bvh_optimization_mode) {
465 results = castRaysSoA(ray_queries, local_stats);
466 break;
467 }
468
469 auto end_time = std::chrono::high_resolution_clock::now();
470 auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
471
472 if (stats) {
473 *stats = local_stats;
474 }
475
476 return results;
477}
478
480 if (ray_stream.packets.empty()) {
481 return true;
482 }
483
484 RayTracingStats combined_stats;
485 bool success = true;
486
487 for (auto &packet: ray_stream.packets) {
488 // Convert packet to ray queries
489 auto queries = packet.toRayQueries();
490
491 // Process the packet using optimized ray casting
492 RayTracingStats packet_stats;
493 auto results = castRaysOptimized(queries, &packet_stats);
494
495 if (results.size() != queries.size()) {
496 success = false;
497 continue;
498 }
499
500 // Store results back in packet
501 packet.results = std::move(results);
502
503 // Accumulate statistics
504 combined_stats.total_rays_cast += packet_stats.total_rays_cast;
505 combined_stats.total_hits += packet_stats.total_hits;
506 combined_stats.bvh_nodes_visited += packet_stats.bvh_nodes_visited;
507 combined_stats.average_ray_distance = (combined_stats.average_ray_distance * (combined_stats.total_rays_cast - packet_stats.total_rays_cast) + packet_stats.average_ray_distance * packet_stats.total_rays_cast) / combined_stats.total_rays_cast;
508 }
509
510 if (stats) {
511 *stats = combined_stats;
512 }
513
514 if (printmessages && success) {
515 std::cout << "CollisionDetection: Processed " << ray_stream.packets.size() << " ray packets (" << ray_stream.total_rays << " total rays)" << std::endl;
516 }
517
518 return success;
519}
520
521CollisionDetection::MemoryUsageStats CollisionDetection::getBVHMemoryUsage() const {
522 // Ensure optimized structures are built before calculating memory usage
523 const_cast<CollisionDetection *>(this)->ensureOptimizedBVH();
524
525 MemoryUsageStats stats;
526
527 // Calculate SoA memory usage
528 stats.soa_memory_bytes = bvh_nodes_soa.getMemoryUsage();
529
530 // With quantized mode removed, set quantized values to 0
531 stats.quantized_memory_bytes = 0;
532 stats.quantized_reduction_percent = 0.0f;
533
534 return stats;
535}
536
537std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysSoA(const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
538 std::vector<HitResult> results;
539 results.reserve(ray_queries.size());
540
541 if (bvh_nodes_soa.node_count == 0) {
542 // Return empty results if no SoA BVH, but still set statistics
543 results.resize(ray_queries.size());
544 stats.total_rays_cast = ray_queries.size();
545 stats.total_hits = 0;
546 stats.bvh_nodes_visited = 0;
547 stats.average_ray_distance = 0.0f;
548 return results;
549 }
550
551 stats.total_rays_cast = ray_queries.size();
552 stats.total_hits = 0;
553 stats.average_ray_distance = 0.0;
554
555 // Resize results vector for parallel access
556 results.resize(ray_queries.size());
557
558// OpenMP parallel ray processing for high-performance SoA traversal
559#pragma omp parallel
560 {
561 RayTracingStats local_stats = {}; // Thread-local statistics
562
563#pragma omp for schedule(guided, 32)
564 for (int i = 0; i < static_cast<int>(ray_queries.size()); ++i) {
565 HitResult result = castRaySoATraversal(ray_queries[i], local_stats);
566 results[i] = result;
567
568 if (result.hit) {
569 local_stats.total_hits++;
570 local_stats.average_ray_distance += result.distance;
571 }
572 }
573
574// Combine thread-local statistics atomically
575#pragma omp atomic
576 stats.total_hits += local_stats.total_hits;
577
578#pragma omp atomic
579 stats.average_ray_distance += local_stats.average_ray_distance;
580
581#pragma omp atomic
582 stats.bvh_nodes_visited += local_stats.bvh_nodes_visited;
583 }
584
585 if (stats.total_hits > 0) {
586 stats.average_ray_distance /= stats.total_hits;
587 }
588
589 return results;
590}
591
592// Optimized BVH traversal methods using Structure-of-Arrays layout
593
594#include "CollisionDetection.h"
595
596using namespace helios;
597
598CollisionDetection::HitResult CollisionDetection::castRaySoATraversal(const RayQuery &query, RayTracingStats &stats) {
599 HitResult result;
600
601 if (bvh_nodes_soa.node_count == 0 || bvh_nodes_soa.aabb_mins.empty()) {
602 return result; // No BVH built
603 }
604
605 // Stack-based traversal (more cache-friendly than recursion)
606 std::stack<size_t> node_stack;
607 node_stack.push(0); // Start from root
608
609 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::max();
610
611 // Safety limit to prevent infinite loops
612 const size_t MAX_TRAVERSAL_STEPS = 10000000; // 10M steps should be more than enough
613 size_t traversal_steps = 0;
614
615 while (!node_stack.empty()) {
616 // Safety check to prevent infinite loops
617 if (++traversal_steps > MAX_TRAVERSAL_STEPS) {
618 if (printmessages) {
619 std::cout << "WARNING: BVH traversal exceeded maximum steps, terminating ray" << std::endl;
620 }
621 break;
622 }
623
624 size_t node_idx = node_stack.top();
625 node_stack.pop();
626 stats.bvh_nodes_visited++;
627
628 // Bounds check for node index
629 if (node_idx >= bvh_nodes_soa.node_count) {
630 if (printmessages) {
631 std::cout << "WARNING: Invalid node index " << node_idx << " >= " << bvh_nodes_soa.node_count << std::endl;
632 }
633 continue;
634 }
635
636 // Prefetch data for better cache performance
637 if (node_idx + 1 < bvh_nodes_soa.node_count) {
638#ifdef __GNUC__
639 __builtin_prefetch(&bvh_nodes_soa.aabb_mins[node_idx + 1], 0, 1);
640 __builtin_prefetch(&bvh_nodes_soa.aabb_maxs[node_idx + 1], 0, 1);
641#elif defined(_MSC_VER)
642 _mm_prefetch(reinterpret_cast<const char *>(&bvh_nodes_soa.aabb_mins[node_idx + 1]), _MM_HINT_T1);
643 _mm_prefetch(reinterpret_cast<const char *>(&bvh_nodes_soa.aabb_maxs[node_idx + 1]), _MM_HINT_T1);
644#endif
645 }
646
647 // AABB intersection test using SoA layout
648 if (!aabbIntersectSoA(query.origin, query.direction, closest_distance, node_idx)) {
649 continue;
650 }
651
652 // Check if leaf node
653 if (bvh_nodes_soa.is_leaf_flags[node_idx]) {
654 // Process primitives in this leaf
655 uint32_t primitive_start = bvh_nodes_soa.primitive_starts[node_idx];
656 uint32_t primitive_count = bvh_nodes_soa.primitive_counts[node_idx];
657
658 for (uint32_t i = 0; i < primitive_count; ++i) {
659 uint primitive_id = primitive_indices[primitive_start + i];
660
661 // Skip if not in target list (if specified)
662 if (!query.target_UUIDs.empty()) {
663 bool found = false;
664 for (uint target: query.target_UUIDs) {
665 if (primitive_id == target) {
666 found = true;
667 break;
668 }
669 }
670 if (!found)
671 continue;
672 }
673
674 // Use high-performance cached primitive intersection
675 HitResult primitive_result = intersectPrimitiveThreadSafe(query.origin, query.direction, primitive_id, closest_distance);
676 if (primitive_result.hit && primitive_result.distance < closest_distance) {
677 result = primitive_result;
678 closest_distance = primitive_result.distance;
679 }
680 }
681 } else {
682 // Internal node - add children to stack
683 uint32_t left_child = bvh_nodes_soa.left_children[node_idx];
684 uint32_t right_child = bvh_nodes_soa.right_children[node_idx];
685
686 if (left_child != 0xFFFFFFFF && left_child < bvh_nodes_soa.node_count) {
687 node_stack.push(left_child);
688 }
689 if (right_child != 0xFFFFFFFF && right_child < bvh_nodes_soa.node_count) {
690 node_stack.push(right_child);
691 }
692 }
693 }
694
695 return result;
696}
697
698
699bool CollisionDetection::aabbIntersectSoA(const helios::vec3 &ray_origin, const helios::vec3 &ray_direction, float max_distance, size_t node_index) const {
700 // Direct access to SoA arrays for optimal memory usage
701 const vec3 &aabb_min = bvh_nodes_soa.aabb_mins[node_index];
702 const vec3 &aabb_max = bvh_nodes_soa.aabb_maxs[node_index];
703
704#ifdef __SSE4_1__
705 // SIMD-optimized ray-AABB intersection for better performance
706 __m128 ray_orig = _mm_set_ps(0.0f, ray_origin.z, ray_origin.y, ray_origin.x);
707 __m128 ray_dir = _mm_set_ps(0.0f, ray_direction.z, ray_direction.y, ray_direction.x);
708 __m128 aabb_min_vec = _mm_set_ps(0.0f, aabb_min.z, aabb_min.y, aabb_min.x);
709 __m128 aabb_max_vec = _mm_set_ps(0.0f, aabb_max.z, aabb_max.y, aabb_max.x);
710
711 // Compute inverse ray direction
712 __m128 inv_dir = _mm_div_ps(_mm_set1_ps(1.0f), ray_dir);
713
714 // Compute t1 and t2 for all axes
715 __m128 t1 = _mm_mul_ps(_mm_sub_ps(aabb_min_vec, ray_orig), inv_dir);
716 __m128 t2 = _mm_mul_ps(_mm_sub_ps(aabb_max_vec, ray_orig), inv_dir);
717
718 // Get min and max for each axis
719 __m128 tmin = _mm_min_ps(t1, t2);
720 __m128 tmax = _mm_max_ps(t1, t2);
721
722 // Extract components
723 float tmin_vals[4], tmax_vals[4];
724 _mm_store_ps(tmin_vals, tmin);
725 _mm_store_ps(tmax_vals, tmax);
726
727 float t_near = std::max({tmin_vals[0], tmin_vals[1], tmin_vals[2], 0.0f});
728 float t_far = std::min({tmax_vals[0], tmax_vals[1], tmax_vals[2], max_distance});
729
730 return t_near <= t_far;
731#else
732 // Fallback scalar implementation
733 float inv_dir_x = 1.0f / ray_direction.x;
734 float inv_dir_y = 1.0f / ray_direction.y;
735 float inv_dir_z = 1.0f / ray_direction.z;
736
737 float t1_x = (aabb_min.x - ray_origin.x) * inv_dir_x;
738 float t2_x = (aabb_max.x - ray_origin.x) * inv_dir_x;
739 float t1_y = (aabb_min.y - ray_origin.y) * inv_dir_y;
740 float t2_y = (aabb_max.y - ray_origin.y) * inv_dir_y;
741 float t1_z = (aabb_min.z - ray_origin.z) * inv_dir_z;
742 float t2_z = (aabb_max.z - ray_origin.z) * inv_dir_z;
743
744 float tmin_x = std::min(t1_x, t2_x);
745 float tmax_x = std::max(t1_x, t2_x);
746 float tmin_y = std::min(t1_y, t2_y);
747 float tmax_y = std::max(t1_y, t2_y);
748 float tmin_z = std::min(t1_z, t2_z);
749 float tmax_z = std::max(t1_z, t2_z);
750
751 float t_near = std::max({tmin_x, tmin_y, tmin_z, 0.0f});
752 float t_far = std::min({tmax_x, tmax_y, tmax_z, max_distance});
753
754 return t_near <= t_far;
755#endif
756}
757
758
759// Basic BVH traversal using standard node structure
760CollisionDetection::HitResult CollisionDetection::castRayBVHTraversal(const RayQuery &query) {
761 HitResult result;
762
763 if (bvh_nodes.empty()) {
764 return result; // No BVH built
765 }
766
767 // Stack-based traversal using standard BVH nodes
768 std::stack<size_t> node_stack;
769 node_stack.push(0); // Start from root
770
771 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::max();
772
773 while (!node_stack.empty()) {
774 size_t node_idx = node_stack.top();
775 node_stack.pop();
776
777 if (node_idx >= bvh_nodes.size()) {
778 if (printmessages) {
779 std::cout << "ERROR: Invalid BVH node index " << node_idx << " >= " << bvh_nodes.size() << " nodes" << std::endl;
780 }
781 result.hit = false;
782 return result;
783 }
784
785 const BVHNode &node = bvh_nodes[node_idx];
786
787 // AABB intersection test - this provides the early miss detection that was missing
788 if (!rayAABBIntersect(query.origin, query.direction, node.aabb_min, node.aabb_max)) {
789 continue; // Ray misses this node's bounding box - skip entire subtree
790 }
791
792 if (node.is_leaf) {
793 // Process primitives in this leaf
794 for (uint32_t i = 0; i < node.primitive_count; ++i) {
795 if (node.primitive_start + i >= primitive_indices.size()) {
796 if (printmessages) {
797 std::cout << "ERROR: Invalid BVH primitive index " << (node.primitive_start + i) << " >= " << primitive_indices.size() << " primitives" << std::endl;
798 }
799 result.hit = false;
800 return result;
801 }
802
803 uint primitive_id = primitive_indices[node.primitive_start + i];
804
805 // Skip if not in target list (if specified)
806 if (!query.target_UUIDs.empty()) {
807 bool found = false;
808 for (uint target: query.target_UUIDs) {
809 if (primitive_id == target) {
810 found = true;
811 break;
812 }
813 }
814 if (!found)
815 continue;
816 }
817
818 // Perform primitive intersection test
819 HitResult primitive_hit = intersectPrimitive(query, primitive_id);
820 if (primitive_hit.hit && primitive_hit.distance < closest_distance) {
821 result = primitive_hit;
822 closest_distance = primitive_hit.distance;
823 }
824 }
825 } else {
826 // Internal node - add children to stack for traversal
827 if (node.left_child != 0xFFFFFFFF && node.left_child < bvh_nodes.size()) {
828 node_stack.push(node.left_child);
829 }
830 if (node.right_child != 0xFFFFFFFF && node.right_child < bvh_nodes.size()) {
831 node_stack.push(node.right_child);
832 }
833 }
834 }
835
836 return result;
837}
838
839// Helper method to test ray-AABB intersection
840bool CollisionDetection::rayAABBIntersect(const vec3 &ray_origin, const vec3 &ray_direction, const vec3 &aabb_min, const vec3 &aabb_max) const {
841 // Robust ray-AABB intersection using slab method with proper axis-aligned ray handling
842 const float EPSILON = 1e-8f;
843
844 float tmin = 0.0f; // Ray starts at origin
845 float tmax = std::numeric_limits<float>::max(); // No maximum distance limit
846
847 // Handle X slab
848 if (std::abs(ray_direction.x) > EPSILON) {
849 float inv_dir_x = 1.0f / ray_direction.x;
850 float t1 = (aabb_min.x - ray_origin.x) * inv_dir_x;
851 float t2 = (aabb_max.x - ray_origin.x) * inv_dir_x;
852
853 float slab_tmin = std::min(t1, t2);
854 float slab_tmax = std::max(t1, t2);
855
856 tmin = std::max(tmin, slab_tmin);
857 tmax = std::min(tmax, slab_tmax);
858
859 if (tmin > tmax)
860 return false; // Early exit if no intersection
861 } else {
862 // Ray is parallel to X slab - check if ray origin is within X bounds
863 if (ray_origin.x < aabb_min.x || ray_origin.x > aabb_max.x) {
864 return false;
865 }
866 }
867
868 // Handle Y slab
869 if (std::abs(ray_direction.y) > EPSILON) {
870 float inv_dir_y = 1.0f / ray_direction.y;
871 float t1 = (aabb_min.y - ray_origin.y) * inv_dir_y;
872 float t2 = (aabb_max.y - ray_origin.y) * inv_dir_y;
873
874 float slab_tmin = std::min(t1, t2);
875 float slab_tmax = std::max(t1, t2);
876
877 tmin = std::max(tmin, slab_tmin);
878 tmax = std::min(tmax, slab_tmax);
879
880 if (tmin > tmax)
881 return false; // Early exit if no intersection
882 } else {
883 // Ray is parallel to Y slab - check if ray origin is within Y bounds
884 if (ray_origin.y < aabb_min.y || ray_origin.y > aabb_max.y) {
885 return false;
886 }
887 }
888
889 // Handle Z slab
890 if (std::abs(ray_direction.z) > EPSILON) {
891 float inv_dir_z = 1.0f / ray_direction.z;
892 float t1 = (aabb_min.z - ray_origin.z) * inv_dir_z;
893 float t2 = (aabb_max.z - ray_origin.z) * inv_dir_z;
894
895 float slab_tmin = std::min(t1, t2);
896 float slab_tmax = std::max(t1, t2);
897
898 tmin = std::max(tmin, slab_tmin);
899 tmax = std::min(tmax, slab_tmax);
900
901 if (tmin > tmax)
902 return false; // Early exit if no intersection
903 } else {
904 // Ray is parallel to Z slab - check if ray origin is within Z bounds
905 if (ray_origin.z < aabb_min.z || ray_origin.z > aabb_max.z) {
906 return false;
907 }
908 }
909
910 return tmin <= tmax;
911}
912
913// Helper method to intersect with individual primitive (reuses existing logic)
914CollisionDetection::HitResult CollisionDetection::intersectPrimitive(const RayQuery &query, uint primitive_id) {
915 // PERFORMANCE FIX: Use direct context call instead of expensive cached primitive data
916 // This avoids the need to build and maintain a primitive cache
917 HitResult result;
918
919 float distance;
920 if (rayPrimitiveIntersection(query.origin, query.direction, primitive_id, distance)) {
921 result.hit = true;
922 result.distance = distance;
923 result.primitive_UUID = primitive_id;
924 result.intersection_point = query.origin + query.direction * distance;
925
926 // Calculate surface normal directly from context (minimal overhead)
927 PrimitiveType type = context->getPrimitiveType(primitive_id);
928 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
929
930 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
931 vec3 edge1 = vertices[1] - vertices[0];
932 vec3 edge2 = vertices[2] - vertices[0];
933 result.normal = cross(edge1, edge2);
934 if (result.normal.magnitude() > 1e-8f) {
935 result.normal = result.normal / result.normal.magnitude();
936 } else {
937 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
938 }
939 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 3) {
940 vec3 edge1 = vertices[1] - vertices[0];
941 vec3 edge2 = vertices[2] - vertices[0];
942 result.normal = cross(edge1, edge2);
943 if (result.normal.magnitude() > 1e-8f) {
944 result.normal = result.normal / result.normal.magnitude();
945 } else {
946 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
947 }
948 } else {
949 // Default normal (opposite to ray direction)
950 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
951 }
952 }
953
954 return result;
955}
956
957CollisionDetection::HitResult CollisionDetection::intersectPrimitiveThreadSafe(const vec3 &origin, const vec3 &direction, uint primitive_id, float max_distance) {
958 HitResult result;
959
960 // Check if we have cached primitive data for this primitive
961 auto it = primitive_cache.find(primitive_id);
962 if (it == primitive_cache.end()) {
963 // Primitive not in cache - this shouldn't happen in optimized paths
964 // This indicates that the context was modified after the cache was built
965 // Fall back to thread-unsafe context call (only safe for sequential code)
966 // For parallel regions, this could cause issues, but it's better than crashing
967 float distance;
968 if (rayPrimitiveIntersection(origin, direction, primitive_id, distance)) {
969 result.hit = true;
970 result.distance = distance;
971 result.primitive_UUID = primitive_id;
972 result.intersection_point = origin + direction * distance;
973
974 // Calculate surface normal for uncached primitive
975 PrimitiveType type = context->getPrimitiveType(primitive_id);
976 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
977
978 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
979 vec3 edge1 = vertices[1] - vertices[0];
980 vec3 edge2 = vertices[2] - vertices[0];
981 result.normal = cross(edge1, edge2);
982 if (result.normal.magnitude() > 1e-8f) {
983 result.normal = result.normal / result.normal.magnitude();
984 // Ensure normal points towards ray origin (for LiDAR compatibility)
985 vec3 to_origin = origin - result.intersection_point;
986 if (result.normal * to_origin < 0) {
987 result.normal = result.normal * -1.0f;
988 }
989 } else {
990 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
991 result.normal = result.normal / result.normal.magnitude();
992 }
993 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
994 vec3 edge1 = vertices[1] - vertices[0];
995 vec3 edge2 = vertices[2] - vertices[0];
996 result.normal = cross(edge1, edge2);
997 if (result.normal.magnitude() > 1e-8f) {
998 result.normal = result.normal / result.normal.magnitude();
999 // Ensure normal points towards ray origin (for LiDAR compatibility)
1000 vec3 to_origin = origin - result.intersection_point;
1001 if (result.normal * to_origin < 0) {
1002 result.normal = result.normal * -1.0f;
1003 }
1004 } else {
1005 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1006 result.normal = result.normal / result.normal.magnitude();
1007 }
1008 } else {
1009 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1010 result.normal = result.normal / result.normal.magnitude();
1011 }
1012 }
1013 return result;
1014 }
1015
1016 const CachedPrimitive &cached = it->second;
1017
1018 // Perform intersection test based on primitive type
1019 if (cached.type == PRIMITIVE_TYPE_TRIANGLE && cached.vertices.size() >= 3) {
1020 float distance;
1021 if (triangleIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], distance)) {
1022 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1023 result.hit = true;
1024 result.distance = distance;
1025 result.primitive_UUID = primitive_id;
1026 result.intersection_point = origin + direction * distance;
1027
1028 // Calculate triangle normal
1029 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1030 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1031 result.normal = cross(edge1, edge2);
1032 if (result.normal.magnitude() > 1e-8f) {
1033 result.normal = result.normal / result.normal.magnitude();
1034 // Ensure normal points towards ray origin (for LiDAR compatibility)
1035 vec3 to_origin = origin - result.intersection_point;
1036 if (result.normal * to_origin < 0) {
1037 result.normal = result.normal * -1.0f;
1038 }
1039 } else {
1040 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1041 result.normal = result.normal / result.normal.magnitude();
1042 }
1043 }
1044 }
1045 } else if (cached.type == PRIMITIVE_TYPE_PATCH && cached.vertices.size() >= 4) {
1046 float distance;
1047 if (patchIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], cached.vertices[3], distance)) {
1048 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1049 result.hit = true;
1050 result.distance = distance;
1051 result.primitive_UUID = primitive_id;
1052 result.intersection_point = origin + direction * distance;
1053
1054 // Calculate patch normal (use v0, v1, v2 like the original code)
1055 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1056 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1057 result.normal = cross(edge1, edge2);
1058 if (result.normal.magnitude() > 1e-8f) {
1059 result.normal = result.normal / result.normal.magnitude();
1060 // Ensure normal points towards ray origin (for LiDAR compatibility)
1061 vec3 to_origin = origin - result.intersection_point;
1062 if (result.normal * to_origin < 0) {
1063 result.normal = result.normal * -1.0f;
1064 }
1065 } else {
1066 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1067 result.normal = result.normal / result.normal.magnitude();
1068 }
1069 }
1070 }
1071 } else if (cached.type == PRIMITIVE_TYPE_VOXEL && cached.vertices.size() == 8) {
1072 // Voxel (AABB) intersection using slab method
1073 // Calculate AABB from 8 vertices
1074 vec3 aabb_min = cached.vertices[0];
1075 vec3 aabb_max = cached.vertices[0];
1076
1077 for (int i = 1; i < 8; i++) {
1078 aabb_min.x = std::min(aabb_min.x, cached.vertices[i].x);
1079 aabb_min.y = std::min(aabb_min.y, cached.vertices[i].y);
1080 aabb_min.z = std::min(aabb_min.z, cached.vertices[i].z);
1081 aabb_max.x = std::max(aabb_max.x, cached.vertices[i].x);
1082 aabb_max.y = std::max(aabb_max.y, cached.vertices[i].y);
1083 aabb_max.z = std::max(aabb_max.z, cached.vertices[i].z);
1084 }
1085
1086 // Ray-AABB intersection using slab method
1087 float t_near = -std::numeric_limits<float>::max();
1088 float t_far = std::numeric_limits<float>::max();
1089
1090 // Check intersection with each slab (X, Y, Z)
1091 for (int axis = 0; axis < 3; axis++) {
1092 float ray_dir_component = (axis == 0) ? direction.x : (axis == 1) ? direction.y : direction.z;
1093 float ray_orig_component = (axis == 0) ? origin.x : (axis == 1) ? origin.y : origin.z;
1094 float aabb_min_component = (axis == 0) ? aabb_min.x : (axis == 1) ? aabb_min.y : aabb_min.z;
1095 float aabb_max_component = (axis == 0) ? aabb_max.x : (axis == 1) ? aabb_max.y : aabb_max.z;
1096
1097 if (std::abs(ray_dir_component) < 1e-8f) {
1098 // Ray is parallel to slab
1099 if (ray_orig_component < aabb_min_component || ray_orig_component > aabb_max_component) {
1100 return result; // Ray is outside slab and parallel - no intersection
1101 }
1102 } else {
1103 // Calculate intersection distances for this slab
1104 float t1 = (aabb_min_component - ray_orig_component) / ray_dir_component;
1105 float t2 = (aabb_max_component - ray_orig_component) / ray_dir_component;
1106
1107 // Ensure t1 <= t2
1108 if (t1 > t2) {
1109 std::swap(t1, t2);
1110 }
1111
1112 // Update near and far intersection distances
1113 t_near = std::max(t_near, t1);
1114 t_far = std::min(t_far, t2);
1115
1116 // Early exit if no intersection possible
1117 if (t_near > t_far) {
1118 return result;
1119 }
1120 }
1121 }
1122
1123 // Check if intersection is in front of ray origin and within max distance
1124 if (t_far >= 0.0f) {
1125 // Use t_near if it's positive (ray starts outside box), otherwise t_far (ray starts inside box)
1126 float intersection_distance = (t_near >= 1e-6f) ? t_near : t_far;
1127 if (intersection_distance >= 1e-6f && (max_distance <= 0 || intersection_distance < max_distance)) {
1128 result.hit = true;
1129 result.distance = intersection_distance;
1130 result.primitive_UUID = primitive_id;
1131 result.intersection_point = origin + direction * intersection_distance;
1132
1133 // Calculate normal based on which face was hit
1134 // Determine which face of the voxel was hit by examining intersection point
1135 vec3 hit_point = result.intersection_point;
1136 vec3 box_center = (aabb_min + aabb_max) * 0.5f;
1137 vec3 box_extent = (aabb_max - aabb_min) * 0.5f;
1138
1139 // Find which face the hit point is closest to
1140 vec3 local_hit = hit_point - box_center;
1141 vec3 abs_local_hit = make_vec3(std::abs(local_hit.x), std::abs(local_hit.y), std::abs(local_hit.z));
1142
1143 // Determine which axis has the largest relative coordinate (closest to face)
1144 float rel_x = abs_local_hit.x / box_extent.x;
1145 float rel_y = abs_local_hit.y / box_extent.y;
1146 float rel_z = abs_local_hit.z / box_extent.z;
1147
1148 if (rel_x >= rel_y && rel_x >= rel_z) {
1149 // Hit X face
1150 result.normal = make_vec3((local_hit.x > 0) ? 1.0f : -1.0f, 0.0f, 0.0f);
1151 } else if (rel_y >= rel_z) {
1152 // Hit Y face
1153 result.normal = make_vec3(0.0f, (local_hit.y > 0) ? 1.0f : -1.0f, 0.0f);
1154 } else {
1155 // Hit Z face
1156 result.normal = make_vec3(0.0f, 0.0f, (local_hit.z > 0) ? 1.0f : -1.0f);
1157 }
1158 }
1159 }
1160 }
1161 // Add other primitive types as needed (DISK, etc.)
1162
1163 return result;
1164}
1165
1166void CollisionDetection::buildPrimitiveCache() {
1167 primitive_cache.clear();
1168
1169 // Get all primitive UUIDs from context
1170 std::vector<uint> all_primitives = context->getAllUUIDs();
1171
1172 // Cache primitive data for thread-safe access
1173 for (uint primitive_id: all_primitives) {
1174 if (context->doesPrimitiveExist(primitive_id)) {
1175 try {
1176 PrimitiveType type = context->getPrimitiveType(primitive_id);
1177 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
1178
1179 primitive_cache[primitive_id] = CachedPrimitive(type, vertices);
1180 } catch (const std::exception &e) {
1181 // Skip this primitive if it no longer exists or can't be accessed
1182 // This can happen when UUIDs from previous contexts persist
1183 if (printmessages) {
1184 std::cout << "Warning: Skipping primitive " << primitive_id << " in cache build (not accessible: " << e.what() << ")" << std::endl;
1185 }
1186 continue;
1187 }
1188 }
1189 }
1190
1191 if (printmessages) {
1192 }
1193}
1194
1195bool CollisionDetection::triangleIntersect(const vec3 &origin, const vec3 &direction, const vec3 &v0, const vec3 &v1, const vec3 &v2, float &distance) {
1196 // Möller-Trumbore triangle intersection algorithm (optimized - no vec3 temporaries)
1197 const float EPSILON = 1e-8f;
1198
1199 // Compute triangle edges directly as components (avoid vec3 constructors)
1200 float edge1_x = v1.x - v0.x, edge1_y = v1.y - v0.y, edge1_z = v1.z - v0.z;
1201 float edge2_x = v2.x - v0.x, edge2_y = v2.y - v0.y, edge2_z = v2.z - v0.z;
1202
1203 // Cross product: h = direction × edge2 (computed directly)
1204 float h_x = direction.y * edge2_z - direction.z * edge2_y;
1205 float h_y = direction.z * edge2_x - direction.x * edge2_z;
1206 float h_z = direction.x * edge2_y - direction.y * edge2_x;
1207
1208 // Dot product: a = edge1 · h
1209 float a = edge1_x * h_x + edge1_y * h_y + edge1_z * h_z;
1210
1211 if (a > -EPSILON && a < EPSILON) {
1212 return false; // Ray is parallel to triangle
1213 }
1214
1215 float f = 1.0f / a;
1216
1217 // Vector s = origin - v0 (computed as components)
1218 float s_x = origin.x - v0.x, s_y = origin.y - v0.y, s_z = origin.z - v0.z;
1219
1220 // u = f * (s · h)
1221 float u = f * (s_x * h_x + s_y * h_y + s_z * h_z);
1222
1223 if (u < -EPSILON || u > 1.0f + EPSILON) {
1224 return false;
1225 }
1226
1227 // Cross product: q = s × edge1 (computed directly)
1228 float q_x = s_y * edge1_z - s_z * edge1_y;
1229 float q_y = s_z * edge1_x - s_x * edge1_z;
1230 float q_z = s_x * edge1_y - s_y * edge1_x;
1231
1232 // v = f * (direction · q)
1233 float v = f * (direction.x * q_x + direction.y * q_y + direction.z * q_z);
1234
1235 if (v < -EPSILON || u + v > 1.0f + EPSILON) {
1236 return false;
1237 }
1238
1239 // t = f * (edge2 · q) - computed directly as dot product
1240 float t = f * (edge2_x * q_x + edge2_y * q_y + edge2_z * q_z);
1241
1242 if (t > EPSILON) {
1243 distance = t;
1244 return true;
1245 }
1246
1247 return false; // Line intersection but not ray intersection
1248}
1249
1250bool CollisionDetection::patchIntersect(const vec3 &origin, const vec3 &direction, const vec3 &v0, const vec3 &v1, const vec3 &v2, const vec3 &v3, float &distance) {
1251 // Patch (quadrilateral) intersection using radiation model algorithm
1252 const float EPSILON = 1e-8f;
1253
1254 // Calculate patch vectors and normal (same as radiation model)
1255 vec3 anchor = v0;
1256 vec3 normal = cross(v1 - v0, v2 - v0);
1257 normal.normalize();
1258
1259 vec3 a = v1 - v0; // First edge vector
1260 vec3 b = v3 - v0; // Second edge vector
1261
1262 // Ray-plane intersection
1263 float denom = direction * normal;
1264 if (std::abs(denom) > EPSILON) { // Not parallel to plane
1265 float t = (anchor - origin) * normal / denom;
1266
1267 if (t > EPSILON && t < 1e8f) { // Valid intersection distance
1268 // Find intersection point
1269 vec3 p = origin + direction * t;
1270 vec3 d = p - anchor;
1271
1272 // Project onto patch coordinate system
1273 float ddota = d * a;
1274 float ddotb = d * b;
1275
1276 // Check if point is within patch bounds (with epsilon tolerance for edge cases)
1277 if (ddota >= -EPSILON && ddota <= (a * a) + EPSILON && ddotb >= -EPSILON && ddotb <= (b * b) + EPSILON) {
1278 distance = t;
1279 return true;
1280 }
1281 }
1282 }
1283
1284 return false;
1285}
1286
1287#ifdef HELIOS_CUDA_AVAILABLE
1288#include <vector_types.h> // For make_float3
1289
1296std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysGPU(const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
1297 std::vector<HitResult> results;
1298 results.resize(ray_queries.size());
1299
1300 if (ray_queries.empty() || !gpu_acceleration_enabled) {
1301 return castRaysSoA(ray_queries, stats); // Use CPU implementation
1302 }
1303
1304 auto start = std::chrono::high_resolution_clock::now();
1305
1306 // Ensure BVH is built for GPU acceleration
1307 if (bvh_nodes.empty()) {
1308 if (printmessages) {
1309 }
1310 buildBVH();
1311 if (bvh_nodes.empty()) {
1312 helios_runtime_error("ERROR: BVH construction failed - no geometry available for ray tracing. Ensure primitives are properly added to the collision detection system.");
1313 }
1314 }
1315
1316 // Prepare ray data for GPU
1317 std::vector<float> ray_origins(ray_queries.size() * 3);
1318 std::vector<float> ray_directions(ray_queries.size() * 3);
1319 std::vector<float> ray_max_distances(ray_queries.size());
1320
1321 for (size_t i = 0; i < ray_queries.size(); i++) {
1322 vec3 normalized_dir = ray_queries[i].direction;
1323 if (normalized_dir.magnitude() > 1e-8f) {
1324 normalized_dir = normalized_dir / normalized_dir.magnitude();
1325 }
1326
1327 ray_origins[i * 3] = ray_queries[i].origin.x;
1328 ray_origins[i * 3 + 1] = ray_queries[i].origin.y;
1329 ray_origins[i * 3 + 2] = ray_queries[i].origin.z;
1330
1331 ray_directions[i * 3] = normalized_dir.x;
1332 ray_directions[i * 3 + 1] = normalized_dir.y;
1333 ray_directions[i * 3 + 2] = normalized_dir.z;
1334
1335 ray_max_distances[i] = ray_queries[i].max_distance;
1336 }
1337
1338 // Get all primitives for GPU ray tracing (handle all primitive types like CPU)
1339 std::vector<uint> all_primitives = context->getAllUUIDs();
1340 std::vector<unsigned int> primitive_indices(all_primitives.size());
1341 std::vector<int> primitive_types(all_primitives.size());
1342 std::vector<float3> primitive_vertices;
1343 std::vector<unsigned int> vertex_offsets(all_primitives.size());
1344
1345 size_t triangle_count = 0;
1346 size_t patch_count = 0;
1347 size_t voxel_count = 0;
1348
1349 if (printmessages) {
1350 }
1351
1352 // Prepare primitive data arrays for GPU
1353 size_t vertex_index = 0;
1354 for (size_t i = 0; i < all_primitives.size(); i++) {
1355 primitive_indices[i] = all_primitives[i];
1356 vertex_offsets[i] = vertex_index;
1357
1358 PrimitiveType ptype = context->getPrimitiveType(all_primitives[i]);
1359 primitive_types[i] = static_cast<int>(ptype);
1360
1361 std::vector<vec3> vertices = context->getPrimitiveVertices(all_primitives[i]);
1362
1363 if (ptype == PRIMITIVE_TYPE_TRIANGLE) {
1364 triangle_count++;
1365 if (vertices.size() >= 3) {
1366 // Add 3 vertices for triangle, pad to 4 for alignment
1367 for (int v = 0; v < 3; v++) {
1368 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1369 }
1370 primitive_vertices.push_back(make_float3(0, 0, 0)); // Padding
1371 vertex_index += 4;
1372 } else {
1373 if (printmessages) {
1374 std::cout << "WARNING: Triangle primitive " << all_primitives[i] << " has " << vertices.size() << " vertices" << std::endl;
1375 }
1376 // Add zeros for invalid triangle
1377 for (int v = 0; v < 4; v++) {
1378 primitive_vertices.push_back(make_float3(0, 0, 0));
1379 }
1380 vertex_index += 4;
1381 }
1382 } else if (ptype == PRIMITIVE_TYPE_PATCH) {
1383 patch_count++;
1384 if (vertices.size() >= 4) {
1385 // Add 4 vertices for patch
1386 for (int v = 0; v < 4; v++) {
1387 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1388 }
1389 vertex_index += 4;
1390 } else {
1391 if (printmessages) {
1392 std::cout << "WARNING: Patch primitive " << all_primitives[i] << " has " << vertices.size() << " vertices" << std::endl;
1393 }
1394 // Add zeros for invalid patch
1395 for (int v = 0; v < 4; v++) {
1396 primitive_vertices.push_back(make_float3(0, 0, 0));
1397 }
1398 vertex_index += 4;
1399 }
1400 } else if (ptype == PRIMITIVE_TYPE_VOXEL) {
1401 voxel_count++;
1402 // For voxels, compute AABB from 8 corner vertices
1403 // vertices was already retrieved above, reuse it
1404 vec3 voxel_min = vertices[0];
1405 vec3 voxel_max = vertices[0];
1406
1407 // Find min/max coordinates from all 8 vertices
1408 for (const auto &vertex: vertices) {
1409 voxel_min.x = std::min(voxel_min.x, vertex.x);
1410 voxel_min.y = std::min(voxel_min.y, vertex.y);
1411 voxel_min.z = std::min(voxel_min.z, vertex.z);
1412 voxel_max.x = std::max(voxel_max.x, vertex.x);
1413 voxel_max.y = std::max(voxel_max.y, vertex.y);
1414 voxel_max.z = std::max(voxel_max.z, vertex.z);
1415 }
1416
1417 // Store as: [min.x, min.y, min.z, max.x, max.y, max.z, 0, 0]
1418 primitive_vertices.push_back(make_float3(voxel_min.x, voxel_min.y, voxel_min.z)); // v0 = min
1419 primitive_vertices.push_back(make_float3(voxel_max.x, voxel_max.y, voxel_max.z)); // v1 = max
1420 primitive_vertices.push_back(make_float3(0, 0, 0)); // v2 = padding
1421 primitive_vertices.push_back(make_float3(0, 0, 0)); // v3 = padding
1422 vertex_index += 4;
1423 } else {
1424 // Unknown primitive type - add padding
1425 for (int v = 0; v < 4; v++) {
1426 primitive_vertices.push_back(make_float3(0, 0, 0));
1427 }
1428 vertex_index += 4;
1429 }
1430 }
1431
1432 if (printmessages) {
1433 }
1434
1435 if (all_primitives.empty()) {
1436 if (printmessages) {
1437 std::cout << "No primitives found for GPU ray tracing, falling back to CPU" << std::endl;
1438 }
1439 return castRaysSoA(ray_queries, stats);
1440 }
1441
1442
1443 // Prepare result arrays
1444 std::vector<float> hit_distances(ray_queries.size());
1445 std::vector<unsigned int> hit_primitive_ids(ray_queries.size());
1446 std::vector<unsigned int> hit_counts(ray_queries.size());
1447
1448 // Convert CPU BVH nodes to GPU format
1449 std::vector<GPUBVHNode> gpu_bvh_nodes(bvh_nodes.size());
1450 for (size_t i = 0; i < bvh_nodes.size(); i++) {
1451 gpu_bvh_nodes[i].aabb_min = make_float3(bvh_nodes[i].aabb_min.x, bvh_nodes[i].aabb_min.y, bvh_nodes[i].aabb_min.z);
1452 gpu_bvh_nodes[i].aabb_max = make_float3(bvh_nodes[i].aabb_max.x, bvh_nodes[i].aabb_max.y, bvh_nodes[i].aabb_max.z);
1453 gpu_bvh_nodes[i].left_child = bvh_nodes[i].left_child;
1454 gpu_bvh_nodes[i].right_child = bvh_nodes[i].right_child;
1455 gpu_bvh_nodes[i].primitive_start = bvh_nodes[i].primitive_start;
1456 gpu_bvh_nodes[i].primitive_count = bvh_nodes[i].primitive_count;
1457 gpu_bvh_nodes[i].is_leaf = bvh_nodes[i].is_leaf ? 1 : 0;
1458 gpu_bvh_nodes[i].padding = 0;
1459
1460 // Debug problematic BVH nodes
1461 if (bvh_nodes[i].is_leaf && printmessages && i < 3) {
1462 std::cout << "BVH leaf node " << i << ": primitive_start=" << bvh_nodes[i].primitive_start << ", primitive_count=" << bvh_nodes[i].primitive_count << ", max_index=" << (bvh_nodes[i].primitive_start + bvh_nodes[i].primitive_count - 1)
1463 << std::endl;
1464 }
1465 }
1466
1467 // Launch high-performance GPU ray intersection kernel
1468 launchRayPrimitiveIntersection(gpu_bvh_nodes.data(), // BVH nodes
1469 static_cast<int>(gpu_bvh_nodes.size()), // Node count
1470 primitive_indices.data(), // Primitive indices
1471 static_cast<int>(all_primitives.size()), // Primitive count
1472 primitive_types.data(), // Primitive types
1473 primitive_vertices.data(), // Primitive vertices (all types)
1474 vertex_offsets.data(), // Vertex offsets
1475 static_cast<int>(primitive_vertices.size()),
1476 ray_origins.data(), // Ray origins
1477 ray_directions.data(), // Ray directions
1478 ray_max_distances.data(), // Ray max distances
1479 static_cast<int>(ray_queries.size()), // Number of rays
1480 hit_distances.data(), // Output: hit distances
1481 hit_primitive_ids.data(), // Output: hit primitive IDs
1482 hit_counts.data(), // Output: hit counts
1483 true // Find closest hit
1484 );
1485
1486 // Convert GPU results back to HitResult format
1487 size_t hit_count = 0;
1488 for (size_t i = 0; i < ray_queries.size(); i++) {
1489 if (hit_counts[i] > 0 && hit_distances[i] <= ray_queries[i].max_distance) {
1490 results[i].hit = true;
1491 results[i].primitive_UUID = hit_primitive_ids[i];
1492 results[i].distance = hit_distances[i];
1493 results[i].intersection_point = ray_queries[i].origin + ray_queries[i].direction * hit_distances[i];
1494 hit_count++;
1495 } else {
1496 results[i].hit = false;
1497 results[i].primitive_UUID = 0;
1498 results[i].distance = std::numeric_limits<float>::max();
1499 }
1500 }
1501
1502 auto end = std::chrono::high_resolution_clock::now();
1503 double elapsed = std::chrono::duration<double, std::milli>(end - start).count();
1504
1505 stats.total_rays_cast = ray_queries.size();
1506 stats.total_hits = hit_count;
1507 double rays_per_second = ray_queries.size() / (elapsed / 1000.0);
1508
1509
1510 return results;
1511}
1512
1513// ================================================================
1514// SIMD-OPTIMIZED RAY TRACING METHODS
1515// ================================================================
1516
1517uint32_t CollisionDetection::rayAABBIntersectSIMD(const vec3 *ray_origins, const vec3 *ray_directions, const vec3 *aabb_mins, const vec3 *aabb_maxs, float *t_mins, float *t_maxs, int count) {
1518#ifdef __AVX2__
1519 if (count == 8) {
1520 // AVX2 implementation for 8 rays at once
1521 uint32_t hit_mask = 0;
1522
1523 for (int i = 0; i < 8; i += 8) {
1524 // Load 8 ray origins
1525 __m256 orig_x = _mm256_set_ps(ray_origins[i + 7].x, ray_origins[i + 6].x, ray_origins[i + 5].x, ray_origins[i + 4].x, ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1526 __m256 orig_y = _mm256_set_ps(ray_origins[i + 7].y, ray_origins[i + 6].y, ray_origins[i + 5].y, ray_origins[i + 4].y, ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1527 __m256 orig_z = _mm256_set_ps(ray_origins[i + 7].z, ray_origins[i + 6].z, ray_origins[i + 5].z, ray_origins[i + 4].z, ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1528
1529 // Load 8 ray directions
1530 __m256 dir_x = _mm256_set_ps(ray_directions[i + 7].x, ray_directions[i + 6].x, ray_directions[i + 5].x, ray_directions[i + 4].x, ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1531 __m256 dir_y = _mm256_set_ps(ray_directions[i + 7].y, ray_directions[i + 6].y, ray_directions[i + 5].y, ray_directions[i + 4].y, ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1532 __m256 dir_z = _mm256_set_ps(ray_directions[i + 7].z, ray_directions[i + 6].z, ray_directions[i + 5].z, ray_directions[i + 4].z, ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1533
1534 // Load 8 AABB mins
1535 __m256 aabb_min_x = _mm256_set_ps(aabb_mins[i + 7].x, aabb_mins[i + 6].x, aabb_mins[i + 5].x, aabb_mins[i + 4].x, aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1536 __m256 aabb_min_y = _mm256_set_ps(aabb_mins[i + 7].y, aabb_mins[i + 6].y, aabb_mins[i + 5].y, aabb_mins[i + 4].y, aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1537 __m256 aabb_min_z = _mm256_set_ps(aabb_mins[i + 7].z, aabb_mins[i + 6].z, aabb_mins[i + 5].z, aabb_mins[i + 4].z, aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1538
1539 // Load 8 AABB maxs
1540 __m256 aabb_max_x = _mm256_set_ps(aabb_maxs[i + 7].x, aabb_maxs[i + 6].x, aabb_maxs[i + 5].x, aabb_maxs[i + 4].x, aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1541 __m256 aabb_max_y = _mm256_set_ps(aabb_maxs[i + 7].y, aabb_maxs[i + 6].y, aabb_maxs[i + 5].y, aabb_maxs[i + 4].y, aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1542 __m256 aabb_max_z = _mm256_set_ps(aabb_maxs[i + 7].z, aabb_maxs[i + 6].z, aabb_maxs[i + 5].z, aabb_maxs[i + 4].z, aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1543
1544 // Calculate inverse directions
1545 __m256 inv_dir_x = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_x);
1546 __m256 inv_dir_y = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_y);
1547 __m256 inv_dir_z = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_z);
1548
1549 // Calculate intersection distances for X axis
1550 __m256 t1_x = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1551 __m256 t2_x = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1552 __m256 tmin_x = _mm256_min_ps(t1_x, t2_x);
1553 __m256 tmax_x = _mm256_max_ps(t1_x, t2_x);
1554
1555 // Calculate intersection distances for Y axis
1556 __m256 t1_y = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1557 __m256 t2_y = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1558 __m256 tmin_y = _mm256_min_ps(t1_y, t2_y);
1559 __m256 tmax_y = _mm256_max_ps(t1_y, t2_y);
1560
1561 // Calculate intersection distances for Z axis
1562 __m256 t1_z = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1563 __m256 t2_z = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1564 __m256 tmin_z = _mm256_min_ps(t1_z, t2_z);
1565 __m256 tmax_z = _mm256_max_ps(t1_z, t2_z);
1566
1567 // Find intersection interval
1568 __m256 t_min_final = _mm256_max_ps(_mm256_max_ps(tmin_x, tmin_y), tmin_z);
1569 __m256 t_max_final = _mm256_min_ps(_mm256_min_ps(tmax_x, tmax_y), tmax_z);
1570
1571 // Store results
1572 _mm256_store_ps(&t_mins[i], t_min_final);
1573 _mm256_store_ps(&t_maxs[i], t_max_final);
1574
1575 // Check for intersection: t_max >= 0 && t_min <= t_max
1576 __m256 zero = _mm256_set1_ps(0.0f);
1577 __m256 hits = _mm256_and_ps(_mm256_cmp_ps(t_max_final, zero, _CMP_GE_OS), _mm256_cmp_ps(t_min_final, t_max_final, _CMP_LE_OS));
1578
1579 // Convert to bitmask
1580 hit_mask |= _mm256_movemask_ps(hits);
1581 }
1582
1583 return hit_mask;
1584 }
1585#endif
1586
1587#ifdef __SSE4_1__
1588 if (count == 4) {
1589 // SSE implementation for 4 rays at once
1590 uint32_t hit_mask = 0;
1591
1592 for (int i = 0; i < 4; i += 4) {
1593 // Load 4 ray origins
1594 __m128 orig_x = _mm_set_ps(ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1595 __m128 orig_y = _mm_set_ps(ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1596 __m128 orig_z = _mm_set_ps(ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1597
1598 // Load 4 ray directions
1599 __m128 dir_x = _mm_set_ps(ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1600 __m128 dir_y = _mm_set_ps(ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1601 __m128 dir_z = _mm_set_ps(ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1602
1603 // Load 4 AABB mins
1604 __m128 aabb_min_x = _mm_set_ps(aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1605 __m128 aabb_min_y = _mm_set_ps(aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1606 __m128 aabb_min_z = _mm_set_ps(aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1607
1608 // Load 4 AABB maxs
1609 __m128 aabb_max_x = _mm_set_ps(aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1610 __m128 aabb_max_y = _mm_set_ps(aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1611 __m128 aabb_max_z = _mm_set_ps(aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1612
1613 // Calculate inverse directions
1614 __m128 inv_dir_x = _mm_div_ps(_mm_set1_ps(1.0f), dir_x);
1615 __m128 inv_dir_y = _mm_div_ps(_mm_set1_ps(1.0f), dir_y);
1616 __m128 inv_dir_z = _mm_div_ps(_mm_set1_ps(1.0f), dir_z);
1617
1618 // Calculate intersection distances for X axis
1619 __m128 t1_x = _mm_mul_ps(_mm_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1620 __m128 t2_x = _mm_mul_ps(_mm_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1621 __m128 tmin_x = _mm_min_ps(t1_x, t2_x);
1622 __m128 tmax_x = _mm_max_ps(t1_x, t2_x);
1623
1624 // Calculate intersection distances for Y axis
1625 __m128 t1_y = _mm_mul_ps(_mm_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1626 __m128 t2_y = _mm_mul_ps(_mm_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1627 __m128 tmin_y = _mm_min_ps(t1_y, t2_y);
1628 __m128 tmax_y = _mm_max_ps(t1_y, t2_y);
1629
1630 // Calculate intersection distances for Z axis
1631 __m128 t1_z = _mm_mul_ps(_mm_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1632 __m128 t2_z = _mm_mul_ps(_mm_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1633 __m128 tmin_z = _mm_min_ps(t1_z, t2_z);
1634 __m128 tmax_z = _mm_max_ps(t1_z, t2_z);
1635
1636 // Find intersection interval
1637 __m128 t_min_final = _mm_max_ps(_mm_max_ps(tmin_x, tmin_y), tmin_z);
1638 __m128 t_max_final = _mm_min_ps(_mm_min_ps(tmax_x, tmax_y), tmax_z);
1639
1640 // Store results
1641 _mm_store_ps(&t_mins[i], t_min_final);
1642 _mm_store_ps(&t_maxs[i], t_max_final);
1643
1644 // Check for intersection: t_max >= 0 && t_min <= t_max
1645 __m128 zero = _mm_set1_ps(0.0f);
1646 __m128 hits = _mm_and_ps(_mm_cmpge_ps(t_max_final, zero), _mm_cmple_ps(t_min_final, t_max_final));
1647
1648 // Convert to bitmask
1649 hit_mask |= _mm_movemask_ps(hits);
1650 }
1651
1652 return hit_mask;
1653 }
1654#endif
1655
1656 // Fallback to scalar implementation
1657 uint32_t hit_mask = 0;
1658 for (int i = 0; i < count; ++i) {
1659 if (rayAABBIntersect(ray_origins[i], ray_directions[i], aabb_mins[i], aabb_maxs[i], t_mins[i], t_maxs[i])) {
1660 hit_mask |= (1 << i);
1661 }
1662 }
1663 return hit_mask;
1664}
1665
1666void CollisionDetection::traverseBVHSIMD(const vec3 *ray_origins, const vec3 *ray_directions, int count, HitResult *results) {
1667 if (bvh_nodes.empty()) {
1668 // Initialize all results as misses
1669 for (int i = 0; i < count; ++i) {
1670 results[i] = HitResult();
1671 }
1672 return;
1673 }
1674
1675 // Determine SIMD batch size based on available instructions
1676 int simd_batch_size = 1; // Default to scalar
1677#ifdef __AVX2__
1678 simd_batch_size = 8;
1679#elif defined(__SSE4_1__)
1680 simd_batch_size = 4;
1681#endif
1682
1683 // Process rays in SIMD batches
1684 for (int batch_start = 0; batch_start < count; batch_start += simd_batch_size) {
1685 int batch_count = std::min(simd_batch_size, count - batch_start);
1686
1687 // Initialize batch results
1688 for (int i = 0; i < batch_count; ++i) {
1689 results[batch_start + i] = HitResult();
1690 }
1691
1692 if (batch_count >= simd_batch_size && simd_batch_size > 1) {
1693 // SIMD-optimized batch processing
1694 traverseBVHSIMDImpl(&ray_origins[batch_start], &ray_directions[batch_start], batch_count, &results[batch_start]);
1695 } else {
1696 // Process remaining rays individually
1697 for (int i = 0; i < batch_count; ++i) {
1698 int ray_idx = batch_start + i;
1699 results[ray_idx] = castRay(RayQuery(ray_origins[ray_idx], ray_directions[ray_idx]));
1700 }
1701 }
1702 }
1703}
1704
1705void CollisionDetection::traverseBVHSIMDImpl(const vec3 *ray_origins, const vec3 *ray_directions, int count, HitResult *results) {
1706 const size_t MAX_STACK_SIZE = 64;
1707
1708 // Per-ray data structures - using aligned arrays for SIMD efficiency
1709 alignas(32) uint32_t node_stacks[8][MAX_STACK_SIZE]; // 8 stacks for up to 8 rays
1710 alignas(32) uint32_t stack_tops[8] = {0}; // Current stack positions
1711 alignas(32) float closest_distances[8]; // Closest hit distances per ray
1712 alignas(32) bool ray_active[8]; // Which rays are still active
1713
1714 // Initialize per-ray state
1715 for (int i = 0; i < count; ++i) {
1716 node_stacks[i][0] = 0; // Start with root node
1717 stack_tops[i] = 1;
1718 closest_distances[i] = std::numeric_limits<float>::max();
1719 ray_active[i] = true;
1720 results[i] = HitResult(); // Initialize as miss
1721 }
1722
1723 // Main traversal loop - continue while any ray is active
1724 while (true) {
1725 bool any_active = false;
1726 for (int i = 0; i < count; ++i) {
1727 if (ray_active[i] && stack_tops[i] > 0) {
1728 any_active = true;
1729 break;
1730 }
1731 }
1732 if (!any_active)
1733 break;
1734
1735 // Collect next nodes to test for active rays
1736 alignas(32) vec3 test_aabb_mins[8];
1737 alignas(32) vec3 test_aabb_maxs[8];
1738 alignas(32) uint32_t test_node_indices[8];
1739 alignas(32) int test_ray_indices[8];
1740 int test_count = 0;
1741
1742 for (int i = 0; i < count; ++i) {
1743 if (ray_active[i] && stack_tops[i] > 0) {
1744 uint32_t node_idx = node_stacks[i][--stack_tops[i]];
1745 const BVHNode &node = bvh_nodes[node_idx];
1746
1747 test_aabb_mins[test_count] = node.aabb_min;
1748 test_aabb_maxs[test_count] = node.aabb_max;
1749 test_node_indices[test_count] = node_idx;
1750 test_ray_indices[test_count] = i;
1751 test_count++;
1752
1753 if (test_count == count)
1754 break; // Batch is full
1755 }
1756 }
1757
1758 if (test_count == 0)
1759 break;
1760
1761 // Prepare ray data for SIMD intersection test
1762 alignas(32) vec3 batch_origins[8];
1763 alignas(32) vec3 batch_directions[8];
1764 alignas(32) float t_mins[8];
1765 alignas(32) float t_maxs[8];
1766
1767 for (int i = 0; i < test_count; ++i) {
1768 int ray_idx = test_ray_indices[i];
1769 batch_origins[i] = ray_origins[ray_idx];
1770 batch_directions[i] = ray_directions[ray_idx];
1771 }
1772
1773 // Perform SIMD AABB intersection test
1774 uint32_t hit_mask = rayAABBIntersectSIMD(batch_origins, batch_directions, test_aabb_mins, test_aabb_maxs, t_mins, t_maxs, test_count);
1775
1776 // Process intersection results
1777 for (int i = 0; i < test_count; ++i) {
1778 if (!(hit_mask & (1 << i)))
1779 continue; // Ray missed this AABB
1780
1781 int ray_idx = test_ray_indices[i];
1782 uint32_t node_idx = test_node_indices[i];
1783 const BVHNode &node = bvh_nodes[node_idx];
1784
1785 if (t_mins[i] > closest_distances[ray_idx])
1786 continue; // Beyond closest hit
1787
1788 if (node.is_leaf) {
1789 // Test primitives in leaf node
1790 for (uint32_t prim_idx = node.primitive_start; prim_idx < node.primitive_start + node.primitive_count; ++prim_idx) {
1791
1792 uint32_t primitive_id = primitive_indices[prim_idx];
1793
1794 // Use thread-safe primitive intersection
1795 HitResult prim_result = intersectPrimitiveThreadSafe(batch_origins[i], batch_directions[i], primitive_id, closest_distances[ray_idx]);
1796 if (prim_result.hit && prim_result.distance < closest_distances[ray_idx]) {
1797 closest_distances[ray_idx] = prim_result.distance;
1798 results[ray_idx] = prim_result;
1799 }
1800 }
1801 } else {
1802 // Add child nodes to stack (if space available)
1803 if (stack_tops[ray_idx] < MAX_STACK_SIZE - 2) {
1804 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.left_child;
1805 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.right_child;
1806 } else {
1807 // Stack overflow - mark ray as inactive to prevent infinite loop
1808 ray_active[ray_idx] = false;
1809 }
1810 }
1811 }
1812 }
1813}
1814#endif
1815
1816
1817bool CollisionDetection::rayAABBIntersectPrimitive(const helios::vec3 &origin, const helios::vec3 &direction, const helios::vec3 &aabb_min, const helios::vec3 &aabb_max, float &distance) {
1818 // Ray-AABB intersection using slab method
1819 // Optimized version with early termination
1820
1821 const float EPSILON = 1e-8f;
1822
1823 // Calculate t values for each slab
1824 float t_min_x = (aabb_min.x - origin.x) / direction.x;
1825 float t_max_x = (aabb_max.x - origin.x) / direction.x;
1826
1827 // Handle negative direction components
1828 if (direction.x < 0.0f) {
1829 float temp = t_min_x;
1830 t_min_x = t_max_x;
1831 t_max_x = temp;
1832 }
1833
1834 float t_min_y = (aabb_min.y - origin.y) / direction.y;
1835 float t_max_y = (aabb_max.y - origin.y) / direction.y;
1836
1837 if (direction.y < 0.0f) {
1838 float temp = t_min_y;
1839 t_min_y = t_max_y;
1840 t_max_y = temp;
1841 }
1842
1843 // Check for early termination in X-Y
1844 float t_min = std::max(t_min_x, t_min_y);
1845 float t_max = std::min(t_max_x, t_max_y);
1846
1847 if (t_min > t_max) {
1848 return false; // No intersection
1849 }
1850
1851 float t_min_z = (aabb_min.z - origin.z) / direction.z;
1852 float t_max_z = (aabb_max.z - origin.z) / direction.z;
1853
1854 if (direction.z < 0.0f) {
1855 float temp = t_min_z;
1856 t_min_z = t_max_z;
1857 t_max_z = temp;
1858 }
1859
1860 // Final intersection test
1861 t_min = std::max(t_min, t_min_z);
1862 t_max = std::min(t_max, t_max_z);
1863
1864 if (t_min > t_max || t_max < EPSILON) {
1865 return false; // No intersection or behind ray
1866 }
1867
1868 // Set distance to closest intersection point
1869 distance = (t_min > EPSILON) ? t_min : t_max;
1870
1871 return distance > EPSILON;
1872}