1.3.64
 
Loading...
Searching...
No Matches
CollisionDetection_RayTracing.cpp
Go to the documentation of this file.
1
16#include <chrono>
17#include <functional>
18#include <limits>
19#include <queue>
20#include <stack>
21#include <thread>
22#include "CollisionDetection.h"
23
24// SIMD headers
25#ifdef __AVX2__
26#include <immintrin.h>
27#elif defined(__SSE4_1__)
28#include <smmintrin.h>
29#elif defined(__SSE2__)
30#include <emmintrin.h>
31#endif
32
33// MSVC prefetch support
34#ifdef _MSC_VER
35#include <intrin.h>
36#endif
37#include <algorithm>
38
39#ifdef HELIOS_CUDA_AVAILABLE
40#include <cuda_runtime.h>
41#endif
42
43using namespace helios;
44
45#ifdef HELIOS_CUDA_AVAILABLE
46// GPU BVH node structure (must match the one in .cu file)
47struct GPUBVHNode {
48 float3 aabb_min, aabb_max;
49 unsigned int left_child, right_child;
50 unsigned int primitive_start, primitive_count;
51 unsigned int is_leaf, padding;
52};
53
54// External CUDA functions
55extern "C" {
56void launchBVHTraversal(void *h_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, float *h_primitive_aabb_min, float *h_primitive_aabb_max, float *h_query_aabb_min, float *h_query_aabb_max, int num_queries,
57 unsigned int *h_results, unsigned int *h_result_counts, int max_results_per_query);
58bool launchVoxelRayPathLengths(int num_rays, float *h_ray_origins, float *h_ray_directions, float grid_center_x, float grid_center_y, float grid_center_z, float grid_size_x, float grid_size_y, float grid_size_z, int grid_divisions_x,
59 int grid_divisions_y, int grid_divisions_z, int primitive_count, int *h_voxel_ray_counts, float *h_voxel_path_lengths, int *h_voxel_transmitted, int *h_voxel_hit_before, int *h_voxel_hit_after, int *h_voxel_hit_inside);
60// Warp-efficient GPU kernels
61void launchWarpEfficientBVH(void *h_bvh_soa_gpu, unsigned int *h_primitive_indices, int primitive_count, float *h_primitive_aabb_min, float *h_primitive_aabb_max, float *h_ray_origins, float *h_ray_directions, float *h_ray_max_distances,
62 int num_rays, unsigned int *h_results, unsigned int *h_result_counts, int max_results_per_ray);
63// High-performance ray-triangle intersection kernel
64void launchRayPrimitiveIntersection(void *h_bvh_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, int *h_primitive_types, float3 *h_primitive_vertices, unsigned int *h_vertex_offsets, int total_vertex_count,
65 float *h_ray_origins, float *h_ray_directions, float *h_ray_max_distances, int num_rays, float *h_hit_distances, unsigned int *h_hit_primitive_ids, unsigned int *h_hit_counts, bool find_closest_hit);
66}
67
68// Helper function to convert helios::vec3 to float3
69inline float3 heliosVecToFloat3(const helios::vec3 &v) {
70 return make_float3(v.x, v.y, v.z);
71}
72#endif
73
74// -------- GENERIC RAY-TRACING IMPLEMENTATIONS --------
75
77 return castRay(ray_query.origin, ray_query.direction, ray_query.max_distance, ray_query.target_UUIDs);
78}
79
80CollisionDetection::HitResult CollisionDetection::castRay(const vec3 &origin, const vec3 &direction, float max_distance, const std::vector<uint> &target_UUIDs) {
81 HitResult result;
82
83 // Normalize direction vector
84 vec3 ray_direction = direction;
85 if (ray_direction.magnitude() < 1e-8f) {
86 return result; // Invalid direction
87 }
88 ray_direction = ray_direction / ray_direction.magnitude();
89
90 // Ensure BVH is current before ray casting (handles automatic rebuilds)
91 const_cast<CollisionDetection *>(this)->ensureBVHCurrent();
92
93 // CRITICAL FIX: Use BVH traversal when available for performance
94 // This fixes the 10x+ performance regression by replacing brute-force primitive testing
95 if (!bvh_nodes.empty()) {
96 // Build primitive cache for thread-safe access if not already built
97 if (primitive_cache.empty()) {
98 buildPrimitiveCache();
99 }
100
101 // Use BVH traversal for both filtered and unfiltered queries - BVH supports UUID filtering
102 RayQuery query(origin, ray_direction, max_distance, target_UUIDs);
103 return castRayBVHTraversal(query);
104 }
105
106 // FALLBACK: Brute-force primitive testing (only when no BVH available or target UUIDs specified)
107 // This is the slow path that was causing the performance regression
108 std::vector<uint> search_primitives;
109 if (target_UUIDs.empty()) {
110 // Use cached primitives if available (thread-safe), otherwise use context (thread-unsafe)
111 if (!primitive_cache.empty()) {
112 // Use cached primitive IDs for thread-safe operation
113 search_primitives.reserve(primitive_cache.size());
114 for (const auto &cached_pair: primitive_cache) {
115 search_primitives.push_back(cached_pair.first);
116 }
117 } else {
118 // Fall back to context call (thread-unsafe)
119 search_primitives = context->getAllUUIDs();
120 }
121 } else {
122 search_primitives = target_UUIDs;
123 }
124
125 // Find nearest intersection
126 float nearest_distance = std::numeric_limits<float>::max();
127 if (max_distance > 0) {
128 nearest_distance = max_distance;
129 }
130
131 uint hit_primitive = 0;
132 bool found_intersection = false;
133
134 // Test each primitive for intersection
135 for (uint candidate_uuid: search_primitives) {
136 float intersection_distance;
137 bool hit = false;
138
139 // Use thread-safe cached intersection if available
140 if (!primitive_cache.empty()) {
141 HitResult primitive_result = intersectPrimitiveThreadSafe(origin, ray_direction, candidate_uuid, max_distance);
142 if (primitive_result.hit) {
143 hit = true;
144 intersection_distance = primitive_result.distance;
145 }
146 } else {
147 // Fall back to thread-unsafe context call
148 if (!context->doesPrimitiveExist(candidate_uuid)) {
149 continue;
150 }
151 hit = rayPrimitiveIntersection(origin, ray_direction, candidate_uuid, intersection_distance);
152 }
153
154 if (hit) {
155 // Check distance constraints
156 if (intersection_distance > 1e-6f && // Avoid self-intersection
157 intersection_distance < nearest_distance) { // Find nearest
158
159 nearest_distance = intersection_distance;
160 hit_primitive = candidate_uuid;
161 found_intersection = true;
162 }
163 }
164 }
165
166 // Fill result
167 if (found_intersection) {
168 result.hit = true;
169 result.distance = nearest_distance;
170 result.primitive_UUID = hit_primitive;
171 result.intersection_point = origin + ray_direction * nearest_distance;
172
173 // Calculate surface normal
174 try {
175 PrimitiveType type = context->getPrimitiveType(hit_primitive);
176 std::vector<vec3> vertices = context->getPrimitiveVertices(hit_primitive);
177
178 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
179 // Calculate triangle normal
180 vec3 v0 = vertices[0];
181 vec3 v1 = vertices[1];
182 vec3 v2 = vertices[2];
183 vec3 edge1 = v1 - v0;
184 vec3 edge2 = v2 - v0;
185 result.normal = cross(edge1, edge2);
186 result.normal = result.normal / result.normal.magnitude();
187 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
188 // Calculate patch normal (assuming quad)
189 vec3 v0 = vertices[0];
190 vec3 v1 = vertices[1];
191 vec3 v2 = vertices[2];
192 vec3 edge1 = v1 - v0;
193 vec3 edge2 = v2 - v0;
194 result.normal = cross(edge1, edge2);
195 result.normal = result.normal / result.normal.magnitude();
196 } else {
197 // Default normal (pointing back along ray)
198 result.normal = make_vec3(-ray_direction.x, -ray_direction.y, -ray_direction.z);
199 }
200 } catch (const std::exception &e) {
201 // If we can't get surface normal, use default
202 result.normal = make_vec3(-ray_direction.x, -ray_direction.y, -ray_direction.z);
203 }
204 }
205
206 return result;
207}
208
209
210std::vector<CollisionDetection::HitResult> CollisionDetection::castRays(const std::vector<RayQuery> &ray_queries, RayTracingStats *stats) {
211 std::vector<HitResult> results;
212 results.reserve(ray_queries.size());
213
214 // Initialize statistics
215 RayTracingStats local_stats;
216 local_stats.total_rays_cast = ray_queries.size();
217
218 // Smart CPU/GPU selection based on ray count and scene complexity
219 // Performance analysis shows CPU is faster for small batches (<1000 rays) due to
220 // GPU launch overhead, while GPU provides significant speedup for large batches
221 const size_t GPU_BATCH_THRESHOLD = 1000000; // Use GPU for batches larger than 1M rays
222 const size_t MIN_PRIMITIVES_FOR_GPU = 500; // Minimum scene complexity for GPU
223
224#ifdef HELIOS_CUDA_AVAILABLE
225 bool use_gpu = gpu_acceleration_enabled && ray_queries.size() >= GPU_BATCH_THRESHOLD && d_bvh_nodes != nullptr && !primitive_indices.empty() && primitive_indices.size() >= MIN_PRIMITIVES_FOR_GPU;
226
227 if (use_gpu) {
228 // Use GPU acceleration for large batches with sufficient scene complexity
229 castRaysGPU(ray_queries, results, local_stats);
230 } else {
231 // Use CPU for small batches or simple scenes
232 castRaysCPU(ray_queries, results, local_stats);
233 }
234#else
235 // Use CPU implementation when GPU not available
236 castRaysCPU(ray_queries, results, local_stats);
237#endif
238
239 // Copy statistics to output parameter if provided
240 if (stats != nullptr) {
241 *stats = local_stats;
242 }
243
244 return results;
245}
246
247void CollisionDetection::castRaysCPU(const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
248 // Use optimized batch processing - fail explicitly if there are issues
249 results = castRaysOptimized(ray_queries, &stats);
250}
251
252#ifdef HELIOS_CUDA_AVAILABLE
253void CollisionDetection::castRaysGPU(const std::vector<RayQuery> &ray_queries, std::vector<HitResult> &results, RayTracingStats &stats) {
254 // Use the high-performance GPU ray-triangle intersection implementation
255 results = castRaysGPU(ray_queries, stats);
256}
257#endif
258
259std::vector<std::vector<std::vector<std::vector<CollisionDetection::HitResult>>>> CollisionDetection::performGridRayIntersection(const vec3 &grid_center, const vec3 &grid_size, const helios::int3 &grid_divisions,
260 const std::vector<RayQuery> &ray_queries) {
261
262 // Initialize result grid
263 std::vector<std::vector<std::vector<std::vector<HitResult>>>> grid_results;
264 grid_results.resize(grid_divisions.x);
265 for (int i = 0; i < grid_divisions.x; i++) {
266 grid_results[i].resize(grid_divisions.y);
267 for (int j = 0; j < grid_divisions.y; j++) {
268 grid_results[i][j].resize(grid_divisions.z);
269 }
270 }
271
272 // Calculate voxel size
273 vec3 voxel_size = make_vec3(grid_size.x / float(grid_divisions.x), grid_size.y / float(grid_divisions.y), grid_size.z / float(grid_divisions.z));
274
275 // Process each ray
276 for (const auto &query: ray_queries) {
277 HitResult hit_result = castRay(query);
278
279 if (hit_result.hit) {
280 // Determine which voxel the hit point falls into
281 vec3 relative_pos = hit_result.intersection_point - (grid_center - grid_size * 0.5f);
282
283 int voxel_i = int(relative_pos.x / voxel_size.x);
284 int voxel_j = int(relative_pos.y / voxel_size.y);
285 int voxel_k = int(relative_pos.z / voxel_size.z);
286
287 // Check bounds
288 if (voxel_i >= 0 && voxel_i < grid_divisions.x && voxel_j >= 0 && voxel_j < grid_divisions.y && voxel_k >= 0 && voxel_k < grid_divisions.z) {
289
290 grid_results[voxel_i][voxel_j][voxel_k].push_back(hit_result);
291 }
292 }
293 }
294
295 return grid_results;
296}
297
298std::vector<std::vector<CollisionDetection::HitResult>> CollisionDetection::calculateVoxelPathLengths(const vec3 &scan_origin, const std::vector<vec3> &ray_directions, const std::vector<vec3> &voxel_centers, const std::vector<vec3> &voxel_sizes) {
299
300 if (ray_directions.empty()) {
301 if (printmessages) {
302 std::cout << "WARNING (CollisionDetection::calculateVoxelPathLengths): No rays provided" << std::endl;
303 }
304 return std::vector<std::vector<HitResult>>();
305 }
306
307 if (voxel_centers.size() != voxel_sizes.size()) {
308 helios_runtime_error("ERROR (CollisionDetection::calculateVoxelPathLengths): voxel_centers and voxel_sizes vectors must have same size");
309 }
310
311 if (voxel_centers.empty()) {
312 if (printmessages) {
313 std::cout << "WARNING (CollisionDetection::calculateVoxelPathLengths): No voxels provided" << std::endl;
314 }
315 return std::vector<std::vector<HitResult>>();
316 }
317
318 const size_t num_rays = ray_directions.size();
319 const size_t num_voxels = voxel_centers.size();
320
321 if (printmessages) {
322 std::cout << "Calculating voxel path lengths for " << num_rays << " rays through " << num_voxels << " voxels..." << std::endl;
323 }
324
325 // Initialize result structure - one vector of HitResults per voxel
326 std::vector<std::vector<HitResult>> result(num_voxels);
327
328// OpenMP parallel loop over rays for performance
329#pragma omp parallel for schedule(dynamic, 1000)
330 for (int ray_idx = 0; ray_idx < static_cast<int>(num_rays); ++ray_idx) {
331 const vec3 &ray_direction = ray_directions[ray_idx];
332
333 // Process each voxel for this ray
334 for (size_t voxel_idx = 0; voxel_idx < num_voxels; ++voxel_idx) {
335 const vec3 &voxel_center = voxel_centers[voxel_idx];
336 const vec3 &voxel_size = voxel_sizes[voxel_idx];
337
338 // Calculate voxel AABB from center and size
339 const vec3 half_size = voxel_size * 0.5f;
340 const vec3 voxel_min = voxel_center - half_size;
341 const vec3 voxel_max = voxel_center + half_size;
342
343 // Perform ray-AABB intersection test
344 float t_min, t_max;
345 if (rayAABBIntersect(scan_origin, ray_direction, voxel_min, voxel_max, t_min, t_max)) {
346 // Calculate path length: t_max - max(0, t_min)
347 const float path_length = t_max - std::max(0.0f, t_min);
348
349 if (path_length > 1e-6f) { // Only count meaningful intersections
350 // Create HitResult with path length information
351 HitResult hit_result;
352 hit_result.hit = false; // No actual primitive hit, just voxel traversal
353 hit_result.distance = -1.0f; // Not applicable for voxel traversal
354 hit_result.primitive_UUID = 0; // No primitive
355 hit_result.intersection_point = make_vec3(0, 0, 0); // Not applicable
356 hit_result.normal = make_vec3(0, 0, 0); // Not applicable
357 hit_result.path_length = path_length; // This is what we want!
358
359// Thread-safe update of results
360#pragma omp critical
361 {
362 result[voxel_idx].push_back(hit_result);
363 }
364 }
365 }
366 }
367 }
368
369 if (printmessages) {
370 size_t total_intersections = 0;
371 for (size_t i = 0; i < num_voxels; ++i) {
372 total_intersections += result[i].size();
373 }
374 std::cout << "Completed voxel path length calculations. Total ray-voxel intersections: " << total_intersections << std::endl;
375 }
376
377 return result;
378}
379
380void CollisionDetection::calculateRayPathLengthsDetailed(const vec3 &grid_center, const vec3 &grid_size, const helios::int3 &grid_divisions, const std::vector<vec3> &ray_origins, const std::vector<vec3> &ray_directions,
381 std::vector<HitResult> &hit_results) {
382
383 hit_results.clear();
384 hit_results.reserve(ray_origins.size());
385
386 if (ray_origins.size() != ray_directions.size()) {
387 helios_runtime_error("ERROR (CollisionDetection::calculateRayPathLengthsDetailed): ray_origins and ray_directions must have the same size");
388 return;
389 }
390
391 // Also update the existing voxel data structures
392 calculateVoxelRayPathLengths(grid_center, grid_size, grid_divisions, ray_origins, ray_directions);
393
394 // Cast each ray and collect detailed results
395 for (size_t i = 0; i < ray_origins.size(); i++) {
396 RayQuery query(ray_origins[i], ray_directions[i]);
397 HitResult result = castRay(query);
398 hit_results.push_back(result);
399 }
400}
401
402// ================================================================
403// PHASE 2 OPTIMIZATION METHODS: Structure-of-Arrays & Quantization
404// ================================================================
405
407 if (mode == bvh_optimization_mode) {
408 return; // No change needed
409 }
410
411 BVHOptimizationMode old_mode = bvh_optimization_mode;
412 bvh_optimization_mode = mode;
413
414 // Build optimized structures when mode changes
415 if (old_mode != mode && !bvh_nodes.empty()) {
416 if (printmessages) {
417 std::cout << "CollisionDetection: Converting BVH from mode " << static_cast<int>(old_mode) << " to mode " << static_cast<int>(mode) << std::endl;
418 }
419
420 // Build the optimized structures immediately
421 ensureOptimizedBVH();
422
423 if (printmessages) {
424 auto memory_stats = getBVHMemoryUsage();
425 std::cout << "CollisionDetection: Memory usage - SoA: " << memory_stats.soa_memory_bytes << " bytes, Quantized: " << memory_stats.quantized_memory_bytes << " bytes (" << memory_stats.quantized_reduction_percent << "% reduction)"
426 << std::endl;
427 }
428 }
429}
430
434
435void CollisionDetection::convertBVHLayout(BVHOptimizationMode from_mode, BVHOptimizationMode to_mode) {
436 // With only SOA_UNCOMPRESSED mode remaining, no conversion needed
437 return;
438}
439
440
441std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysOptimized(const std::vector<RayQuery> &ray_queries, RayTracingStats *stats) {
442 if (ray_queries.empty()) {
443 return {};
444 }
445
446 // Ensure BVH is current and optimized structures are available
447 ensureBVHCurrent();
448 ensureOptimizedBVH();
449
450 // Build primitive cache for high-performance thread-safe primitive intersection
451 if (primitive_cache.empty()) {
452 buildPrimitiveCache();
453 }
454
455 RayTracingStats local_stats;
456 std::vector<HitResult> results;
457 results.reserve(ray_queries.size());
458
459 auto start_time = std::chrono::high_resolution_clock::now();
460
461 // Dispatch to appropriate optimized method based on current mode
462 switch (bvh_optimization_mode) {
464 results = castRaysSoA(ray_queries, local_stats);
465 break;
466 }
467
468 auto end_time = std::chrono::high_resolution_clock::now();
469 auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time);
470
471 if (stats) {
472 *stats = local_stats;
473 }
474
475 return results;
476}
477
479 if (ray_stream.packets.empty()) {
480 return true;
481 }
482
483 RayTracingStats combined_stats;
484 bool success = true;
485
486 for (auto &packet: ray_stream.packets) {
487 // Convert packet to ray queries
488 auto queries = packet.toRayQueries();
489
490 // Process the packet using optimized ray casting
491 RayTracingStats packet_stats;
492 auto results = castRaysOptimized(queries, &packet_stats);
493
494 if (results.size() != queries.size()) {
495 success = false;
496 continue;
497 }
498
499 // Store results back in packet
500 packet.results = std::move(results);
501
502 // Accumulate statistics
503 combined_stats.total_rays_cast += packet_stats.total_rays_cast;
504 combined_stats.total_hits += packet_stats.total_hits;
505 combined_stats.bvh_nodes_visited += packet_stats.bvh_nodes_visited;
506 combined_stats.average_ray_distance = (combined_stats.average_ray_distance * (combined_stats.total_rays_cast - packet_stats.total_rays_cast) + packet_stats.average_ray_distance * packet_stats.total_rays_cast) / combined_stats.total_rays_cast;
507 }
508
509 if (stats) {
510 *stats = combined_stats;
511 }
512
513 if (printmessages && success) {
514 std::cout << "CollisionDetection: Processed " << ray_stream.packets.size() << " ray packets (" << ray_stream.total_rays << " total rays)" << std::endl;
515 }
516
517 return success;
518}
519
520CollisionDetection::MemoryUsageStats CollisionDetection::getBVHMemoryUsage() const {
521 // Ensure optimized structures are built before calculating memory usage
522 const_cast<CollisionDetection *>(this)->ensureOptimizedBVH();
523
524 MemoryUsageStats stats;
525
526 // Calculate SoA memory usage
527 stats.soa_memory_bytes = bvh_nodes_soa.getMemoryUsage();
528
529 // With quantized mode removed, set quantized values to 0
530 stats.quantized_memory_bytes = 0;
531 stats.quantized_reduction_percent = 0.0f;
532
533 return stats;
534}
535
536std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysSoA(const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
537 std::vector<HitResult> results;
538 results.reserve(ray_queries.size());
539
540 if (bvh_nodes_soa.node_count == 0) {
541 // Return empty results if no SoA BVH, but still set statistics
542 results.resize(ray_queries.size());
543 stats.total_rays_cast = ray_queries.size();
544 stats.total_hits = 0;
545 stats.bvh_nodes_visited = 0;
546 stats.average_ray_distance = 0.0f;
547 return results;
548 }
549
550 stats.total_rays_cast = ray_queries.size();
551 stats.total_hits = 0;
552 stats.average_ray_distance = 0.0;
553
554 // Resize results vector for parallel access
555 results.resize(ray_queries.size());
556
557// OpenMP parallel ray processing for high-performance SoA traversal
558#pragma omp parallel
559 {
560 RayTracingStats local_stats = {}; // Thread-local statistics
561
562#pragma omp for schedule(guided, 32)
563 for (int i = 0; i < static_cast<int>(ray_queries.size()); ++i) {
564 HitResult result = castRaySoATraversal(ray_queries[i], local_stats);
565 results[i] = result;
566
567 if (result.hit) {
568 local_stats.total_hits++;
569 local_stats.average_ray_distance += result.distance;
570 }
571 }
572
573// Combine thread-local statistics atomically
574#pragma omp atomic
575 stats.total_hits += local_stats.total_hits;
576
577#pragma omp atomic
578 stats.average_ray_distance += local_stats.average_ray_distance;
579
580#pragma omp atomic
581 stats.bvh_nodes_visited += local_stats.bvh_nodes_visited;
582 }
583
584 if (stats.total_hits > 0) {
585 stats.average_ray_distance /= stats.total_hits;
586 }
587
588 return results;
589}
590
591// Optimized BVH traversal methods using Structure-of-Arrays layout
592
593#include "CollisionDetection.h"
594
595using namespace helios;
596
597CollisionDetection::HitResult CollisionDetection::castRaySoATraversal(const RayQuery &query, RayTracingStats &stats) {
598 HitResult result;
599
600 if (bvh_nodes_soa.node_count == 0 || bvh_nodes_soa.aabb_mins.empty()) {
601 return result; // No BVH built
602 }
603
604 // Stack-based traversal (more cache-friendly than recursion)
605 std::stack<size_t> node_stack;
606 node_stack.push(0); // Start from root
607
608 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::max();
609
610 // Safety limit to prevent infinite loops
611 const size_t MAX_TRAVERSAL_STEPS = 10000000; // 10M steps should be more than enough
612 size_t traversal_steps = 0;
613
614 while (!node_stack.empty()) {
615 // Safety check to prevent infinite loops
616 if (++traversal_steps > MAX_TRAVERSAL_STEPS) {
617 if (printmessages) {
618 std::cout << "WARNING: BVH traversal exceeded maximum steps, terminating ray" << std::endl;
619 }
620 break;
621 }
622
623 size_t node_idx = node_stack.top();
624 node_stack.pop();
625 stats.bvh_nodes_visited++;
626
627 // Bounds check for node index
628 if (node_idx >= bvh_nodes_soa.node_count) {
629 if (printmessages) {
630 std::cout << "WARNING: Invalid node index " << node_idx << " >= " << bvh_nodes_soa.node_count << std::endl;
631 }
632 continue;
633 }
634
635 // Prefetch data for better cache performance
636 if (node_idx + 1 < bvh_nodes_soa.node_count) {
637#ifdef __GNUC__
638 __builtin_prefetch(&bvh_nodes_soa.aabb_mins[node_idx + 1], 0, 1);
639 __builtin_prefetch(&bvh_nodes_soa.aabb_maxs[node_idx + 1], 0, 1);
640#elif defined(_MSC_VER)
641 _mm_prefetch(reinterpret_cast<const char *>(&bvh_nodes_soa.aabb_mins[node_idx + 1]), _MM_HINT_T1);
642 _mm_prefetch(reinterpret_cast<const char *>(&bvh_nodes_soa.aabb_maxs[node_idx + 1]), _MM_HINT_T1);
643#endif
644 }
645
646 // AABB intersection test using SoA layout
647 if (!aabbIntersectSoA(query.origin, query.direction, closest_distance, node_idx)) {
648 continue;
649 }
650
651 // Check if leaf node
652 if (bvh_nodes_soa.is_leaf_flags[node_idx]) {
653 // Process primitives in this leaf
654 uint32_t primitive_start = bvh_nodes_soa.primitive_starts[node_idx];
655 uint32_t primitive_count = bvh_nodes_soa.primitive_counts[node_idx];
656
657 for (uint32_t i = 0; i < primitive_count; ++i) {
658 uint primitive_id = primitive_indices[primitive_start + i];
659
660 // Skip if not in target list (if specified)
661 if (!query.target_UUIDs.empty()) {
662 bool found = false;
663 for (uint target: query.target_UUIDs) {
664 if (primitive_id == target) {
665 found = true;
666 break;
667 }
668 }
669 if (!found)
670 continue;
671 }
672
673 // Use high-performance cached primitive intersection
674 HitResult primitive_result = intersectPrimitiveThreadSafe(query.origin, query.direction, primitive_id, closest_distance);
675 if (primitive_result.hit && primitive_result.distance < closest_distance) {
676 result = primitive_result;
677 closest_distance = primitive_result.distance;
678 }
679 }
680 } else {
681 // Internal node - add children to stack
682 uint32_t left_child = bvh_nodes_soa.left_children[node_idx];
683 uint32_t right_child = bvh_nodes_soa.right_children[node_idx];
684
685 if (left_child != 0xFFFFFFFF && left_child < bvh_nodes_soa.node_count) {
686 node_stack.push(left_child);
687 }
688 if (right_child != 0xFFFFFFFF && right_child < bvh_nodes_soa.node_count) {
689 node_stack.push(right_child);
690 }
691 }
692 }
693
694 return result;
695}
696
697
698bool CollisionDetection::aabbIntersectSoA(const helios::vec3 &ray_origin, const helios::vec3 &ray_direction, float max_distance, size_t node_index) const {
699 // Direct access to SoA arrays for optimal memory usage
700 const vec3 &aabb_min = bvh_nodes_soa.aabb_mins[node_index];
701 const vec3 &aabb_max = bvh_nodes_soa.aabb_maxs[node_index];
702
703#ifdef __SSE4_1__
704 // SIMD-optimized ray-AABB intersection for better performance
705 __m128 ray_orig = _mm_set_ps(0.0f, ray_origin.z, ray_origin.y, ray_origin.x);
706 __m128 ray_dir = _mm_set_ps(0.0f, ray_direction.z, ray_direction.y, ray_direction.x);
707 __m128 aabb_min_vec = _mm_set_ps(0.0f, aabb_min.z, aabb_min.y, aabb_min.x);
708 __m128 aabb_max_vec = _mm_set_ps(0.0f, aabb_max.z, aabb_max.y, aabb_max.x);
709
710 // Compute inverse ray direction
711 __m128 inv_dir = _mm_div_ps(_mm_set1_ps(1.0f), ray_dir);
712
713 // Compute t1 and t2 for all axes
714 __m128 t1 = _mm_mul_ps(_mm_sub_ps(aabb_min_vec, ray_orig), inv_dir);
715 __m128 t2 = _mm_mul_ps(_mm_sub_ps(aabb_max_vec, ray_orig), inv_dir);
716
717 // Get min and max for each axis
718 __m128 tmin = _mm_min_ps(t1, t2);
719 __m128 tmax = _mm_max_ps(t1, t2);
720
721 // Extract components
722 float tmin_vals[4], tmax_vals[4];
723 _mm_store_ps(tmin_vals, tmin);
724 _mm_store_ps(tmax_vals, tmax);
725
726 float t_near = std::max({tmin_vals[0], tmin_vals[1], tmin_vals[2], 0.0f});
727 float t_far = std::min({tmax_vals[0], tmax_vals[1], tmax_vals[2], max_distance});
728
729 return t_near <= t_far;
730#else
731 // Fallback scalar implementation
732 float inv_dir_x = 1.0f / ray_direction.x;
733 float inv_dir_y = 1.0f / ray_direction.y;
734 float inv_dir_z = 1.0f / ray_direction.z;
735
736 float t1_x = (aabb_min.x - ray_origin.x) * inv_dir_x;
737 float t2_x = (aabb_max.x - ray_origin.x) * inv_dir_x;
738 float t1_y = (aabb_min.y - ray_origin.y) * inv_dir_y;
739 float t2_y = (aabb_max.y - ray_origin.y) * inv_dir_y;
740 float t1_z = (aabb_min.z - ray_origin.z) * inv_dir_z;
741 float t2_z = (aabb_max.z - ray_origin.z) * inv_dir_z;
742
743 float tmin_x = std::min(t1_x, t2_x);
744 float tmax_x = std::max(t1_x, t2_x);
745 float tmin_y = std::min(t1_y, t2_y);
746 float tmax_y = std::max(t1_y, t2_y);
747 float tmin_z = std::min(t1_z, t2_z);
748 float tmax_z = std::max(t1_z, t2_z);
749
750 float t_near = std::max({tmin_x, tmin_y, tmin_z, 0.0f});
751 float t_far = std::min({tmax_x, tmax_y, tmax_z, max_distance});
752
753 return t_near <= t_far;
754#endif
755}
756
757
758// Basic BVH traversal using standard node structure
759CollisionDetection::HitResult CollisionDetection::castRayBVHTraversal(const RayQuery &query) {
760 HitResult result;
761
762 if (bvh_nodes.empty()) {
763 return result; // No BVH built
764 }
765
766 // Stack-based traversal using standard BVH nodes
767 std::stack<size_t> node_stack;
768 node_stack.push(0); // Start from root
769
770 float closest_distance = (query.max_distance > 0) ? query.max_distance : std::numeric_limits<float>::max();
771
772 while (!node_stack.empty()) {
773 size_t node_idx = node_stack.top();
774 node_stack.pop();
775
776 if (node_idx >= bvh_nodes.size()) {
777 if (printmessages) {
778 std::cout << "ERROR: Invalid BVH node index " << node_idx << " >= " << bvh_nodes.size() << " nodes" << std::endl;
779 }
780 result.hit = false;
781 return result;
782 }
783
784 const BVHNode &node = bvh_nodes[node_idx];
785
786 // AABB intersection test - this provides the early miss detection that was missing
787 if (!rayAABBIntersect(query.origin, query.direction, node.aabb_min, node.aabb_max)) {
788 continue; // Ray misses this node's bounding box - skip entire subtree
789 }
790
791 if (node.is_leaf) {
792 // Process primitives in this leaf
793 for (uint32_t i = 0; i < node.primitive_count; ++i) {
794 if (node.primitive_start + i >= primitive_indices.size()) {
795 if (printmessages) {
796 std::cout << "ERROR: Invalid BVH primitive index " << (node.primitive_start + i) << " >= " << primitive_indices.size() << " primitives" << std::endl;
797 }
798 result.hit = false;
799 return result;
800 }
801
802 uint primitive_id = primitive_indices[node.primitive_start + i];
803
804 // Skip if not in target list (if specified)
805 if (!query.target_UUIDs.empty()) {
806 bool found = false;
807 for (uint target: query.target_UUIDs) {
808 if (primitive_id == target) {
809 found = true;
810 break;
811 }
812 }
813 if (!found)
814 continue;
815 }
816
817 // Perform primitive intersection test
818 HitResult primitive_hit = intersectPrimitive(query, primitive_id);
819 if (primitive_hit.hit && primitive_hit.distance < closest_distance) {
820 result = primitive_hit;
821 closest_distance = primitive_hit.distance;
822 }
823 }
824 } else {
825 // Internal node - add children to stack for traversal
826 if (node.left_child != 0xFFFFFFFF && node.left_child < bvh_nodes.size()) {
827 node_stack.push(node.left_child);
828 }
829 if (node.right_child != 0xFFFFFFFF && node.right_child < bvh_nodes.size()) {
830 node_stack.push(node.right_child);
831 }
832 }
833 }
834
835 return result;
836}
837
838// Helper method to test ray-AABB intersection
839bool CollisionDetection::rayAABBIntersect(const vec3 &ray_origin, const vec3 &ray_direction, const vec3 &aabb_min, const vec3 &aabb_max) const {
840 // Robust ray-AABB intersection using slab method with proper axis-aligned ray handling
841 const float EPSILON = 1e-8f;
842
843 float tmin = 0.0f; // Ray starts at origin
844 float tmax = std::numeric_limits<float>::max(); // No maximum distance limit
845
846 // Handle X slab
847 if (std::abs(ray_direction.x) > EPSILON) {
848 float inv_dir_x = 1.0f / ray_direction.x;
849 float t1 = (aabb_min.x - ray_origin.x) * inv_dir_x;
850 float t2 = (aabb_max.x - ray_origin.x) * inv_dir_x;
851
852 float slab_tmin = std::min(t1, t2);
853 float slab_tmax = std::max(t1, t2);
854
855 tmin = std::max(tmin, slab_tmin);
856 tmax = std::min(tmax, slab_tmax);
857
858 if (tmin > tmax)
859 return false; // Early exit if no intersection
860 } else {
861 // Ray is parallel to X slab - check if ray origin is within X bounds
862 if (ray_origin.x < aabb_min.x || ray_origin.x > aabb_max.x) {
863 return false;
864 }
865 }
866
867 // Handle Y slab
868 if (std::abs(ray_direction.y) > EPSILON) {
869 float inv_dir_y = 1.0f / ray_direction.y;
870 float t1 = (aabb_min.y - ray_origin.y) * inv_dir_y;
871 float t2 = (aabb_max.y - ray_origin.y) * inv_dir_y;
872
873 float slab_tmin = std::min(t1, t2);
874 float slab_tmax = std::max(t1, t2);
875
876 tmin = std::max(tmin, slab_tmin);
877 tmax = std::min(tmax, slab_tmax);
878
879 if (tmin > tmax)
880 return false; // Early exit if no intersection
881 } else {
882 // Ray is parallel to Y slab - check if ray origin is within Y bounds
883 if (ray_origin.y < aabb_min.y || ray_origin.y > aabb_max.y) {
884 return false;
885 }
886 }
887
888 // Handle Z slab
889 if (std::abs(ray_direction.z) > EPSILON) {
890 float inv_dir_z = 1.0f / ray_direction.z;
891 float t1 = (aabb_min.z - ray_origin.z) * inv_dir_z;
892 float t2 = (aabb_max.z - ray_origin.z) * inv_dir_z;
893
894 float slab_tmin = std::min(t1, t2);
895 float slab_tmax = std::max(t1, t2);
896
897 tmin = std::max(tmin, slab_tmin);
898 tmax = std::min(tmax, slab_tmax);
899
900 if (tmin > tmax)
901 return false; // Early exit if no intersection
902 } else {
903 // Ray is parallel to Z slab - check if ray origin is within Z bounds
904 if (ray_origin.z < aabb_min.z || ray_origin.z > aabb_max.z) {
905 return false;
906 }
907 }
908
909 return tmin <= tmax;
910}
911
912// Helper method to intersect with individual primitive (reuses existing logic)
913CollisionDetection::HitResult CollisionDetection::intersectPrimitive(const RayQuery &query, uint primitive_id) {
914 // PERFORMANCE FIX: Use direct context call instead of expensive cached primitive data
915 // This avoids the need to build and maintain a primitive cache
916 HitResult result;
917
918 float distance;
919 if (rayPrimitiveIntersection(query.origin, query.direction, primitive_id, distance)) {
920 result.hit = true;
921 result.distance = distance;
922 result.primitive_UUID = primitive_id;
923 result.intersection_point = query.origin + query.direction * distance;
924
925 // Calculate surface normal directly from context (minimal overhead)
926 PrimitiveType type = context->getPrimitiveType(primitive_id);
927 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
928
929 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
930 vec3 edge1 = vertices[1] - vertices[0];
931 vec3 edge2 = vertices[2] - vertices[0];
932 result.normal = cross(edge1, edge2);
933 if (result.normal.magnitude() > 1e-8f) {
934 result.normal = result.normal / result.normal.magnitude();
935 } else {
936 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
937 }
938 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 3) {
939 vec3 edge1 = vertices[1] - vertices[0];
940 vec3 edge2 = vertices[2] - vertices[0];
941 result.normal = cross(edge1, edge2);
942 if (result.normal.magnitude() > 1e-8f) {
943 result.normal = result.normal / result.normal.magnitude();
944 } else {
945 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
946 }
947 } else {
948 // Default normal (opposite to ray direction)
949 result.normal = make_vec3(-query.direction.x, -query.direction.y, -query.direction.z);
950 }
951 }
952
953 return result;
954}
955
956CollisionDetection::HitResult CollisionDetection::intersectPrimitiveThreadSafe(const vec3 &origin, const vec3 &direction, uint primitive_id, float max_distance) {
957 HitResult result;
958
959 // Check if we have cached primitive data for this primitive
960 auto it = primitive_cache.find(primitive_id);
961 if (it == primitive_cache.end()) {
962 // Primitive not in cache - this shouldn't happen in optimized paths
963 // This indicates that the context was modified after the cache was built
964 // Fall back to thread-unsafe context call (only safe for sequential code)
965 // For parallel regions, this could cause issues, but it's better than crashing
966 float distance;
967 if (rayPrimitiveIntersection(origin, direction, primitive_id, distance)) {
968 result.hit = true;
969 result.distance = distance;
970 result.primitive_UUID = primitive_id;
971 result.intersection_point = origin + direction * distance;
972
973 // Calculate surface normal for uncached primitive
974 PrimitiveType type = context->getPrimitiveType(primitive_id);
975 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
976
977 if (type == PRIMITIVE_TYPE_TRIANGLE && vertices.size() >= 3) {
978 vec3 edge1 = vertices[1] - vertices[0];
979 vec3 edge2 = vertices[2] - vertices[0];
980 result.normal = cross(edge1, edge2);
981 if (result.normal.magnitude() > 1e-8f) {
982 result.normal = result.normal / result.normal.magnitude();
983 // Ensure normal points towards ray origin (for LiDAR compatibility)
984 vec3 to_origin = origin - result.intersection_point;
985 if (result.normal * to_origin < 0) {
986 result.normal = result.normal * -1.0f;
987 }
988 } else {
989 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
990 result.normal = result.normal / result.normal.magnitude();
991 }
992 } else if (type == PRIMITIVE_TYPE_PATCH && vertices.size() >= 4) {
993 vec3 edge1 = vertices[1] - vertices[0];
994 vec3 edge2 = vertices[2] - vertices[0];
995 result.normal = cross(edge1, edge2);
996 if (result.normal.magnitude() > 1e-8f) {
997 result.normal = result.normal / result.normal.magnitude();
998 // Ensure normal points towards ray origin (for LiDAR compatibility)
999 vec3 to_origin = origin - result.intersection_point;
1000 if (result.normal * to_origin < 0) {
1001 result.normal = result.normal * -1.0f;
1002 }
1003 } else {
1004 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1005 result.normal = result.normal / result.normal.magnitude();
1006 }
1007 } else {
1008 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1009 result.normal = result.normal / result.normal.magnitude();
1010 }
1011 }
1012 return result;
1013 }
1014
1015 const CachedPrimitive &cached = it->second;
1016
1017 // Perform intersection test based on primitive type
1018 if (cached.type == PRIMITIVE_TYPE_TRIANGLE && cached.vertices.size() >= 3) {
1019 float distance;
1020 if (triangleIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], distance)) {
1021 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1022 result.hit = true;
1023 result.distance = distance;
1024 result.primitive_UUID = primitive_id;
1025 result.intersection_point = origin + direction * distance;
1026
1027 // Calculate triangle normal
1028 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1029 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1030 result.normal = cross(edge1, edge2);
1031 if (result.normal.magnitude() > 1e-8f) {
1032 result.normal = result.normal / result.normal.magnitude();
1033 // Ensure normal points towards ray origin (for LiDAR compatibility)
1034 vec3 to_origin = origin - result.intersection_point;
1035 if (result.normal * to_origin < 0) {
1036 result.normal = result.normal * -1.0f;
1037 }
1038 } else {
1039 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1040 result.normal = result.normal / result.normal.magnitude();
1041 }
1042 }
1043 }
1044 } else if (cached.type == PRIMITIVE_TYPE_PATCH && cached.vertices.size() >= 4) {
1045 float distance;
1046 if (patchIntersect(origin, direction, cached.vertices[0], cached.vertices[1], cached.vertices[2], cached.vertices[3], distance)) {
1047 if (distance > 1e-6f && (max_distance <= 0 || distance < max_distance)) {
1048 result.hit = true;
1049 result.distance = distance;
1050 result.primitive_UUID = primitive_id;
1051 result.intersection_point = origin + direction * distance;
1052
1053 // Calculate patch normal (use v0, v1, v2 like the original code)
1054 vec3 edge1 = cached.vertices[1] - cached.vertices[0];
1055 vec3 edge2 = cached.vertices[2] - cached.vertices[0];
1056 result.normal = cross(edge1, edge2);
1057 if (result.normal.magnitude() > 1e-8f) {
1058 result.normal = result.normal / result.normal.magnitude();
1059 // Ensure normal points towards ray origin (for LiDAR compatibility)
1060 vec3 to_origin = origin - result.intersection_point;
1061 if (result.normal * to_origin < 0) {
1062 result.normal = result.normal * -1.0f;
1063 }
1064 } else {
1065 result.normal = make_vec3(-direction.x, -direction.y, -direction.z);
1066 result.normal = result.normal / result.normal.magnitude();
1067 }
1068 }
1069 }
1070 } else if (cached.type == PRIMITIVE_TYPE_VOXEL && cached.vertices.size() == 8) {
1071 // Voxel (AABB) intersection using slab method
1072 // Calculate AABB from 8 vertices
1073 vec3 aabb_min = cached.vertices[0];
1074 vec3 aabb_max = cached.vertices[0];
1075
1076 for (int i = 1; i < 8; i++) {
1077 aabb_min.x = std::min(aabb_min.x, cached.vertices[i].x);
1078 aabb_min.y = std::min(aabb_min.y, cached.vertices[i].y);
1079 aabb_min.z = std::min(aabb_min.z, cached.vertices[i].z);
1080 aabb_max.x = std::max(aabb_max.x, cached.vertices[i].x);
1081 aabb_max.y = std::max(aabb_max.y, cached.vertices[i].y);
1082 aabb_max.z = std::max(aabb_max.z, cached.vertices[i].z);
1083 }
1084
1085 // Ray-AABB intersection using slab method
1086 float t_near = -std::numeric_limits<float>::max();
1087 float t_far = std::numeric_limits<float>::max();
1088
1089 // Check intersection with each slab (X, Y, Z)
1090 for (int axis = 0; axis < 3; axis++) {
1091 float ray_dir_component = (axis == 0) ? direction.x : (axis == 1) ? direction.y : direction.z;
1092 float ray_orig_component = (axis == 0) ? origin.x : (axis == 1) ? origin.y : origin.z;
1093 float aabb_min_component = (axis == 0) ? aabb_min.x : (axis == 1) ? aabb_min.y : aabb_min.z;
1094 float aabb_max_component = (axis == 0) ? aabb_max.x : (axis == 1) ? aabb_max.y : aabb_max.z;
1095
1096 if (std::abs(ray_dir_component) < 1e-8f) {
1097 // Ray is parallel to slab
1098 if (ray_orig_component < aabb_min_component || ray_orig_component > aabb_max_component) {
1099 return result; // Ray is outside slab and parallel - no intersection
1100 }
1101 } else {
1102 // Calculate intersection distances for this slab
1103 float t1 = (aabb_min_component - ray_orig_component) / ray_dir_component;
1104 float t2 = (aabb_max_component - ray_orig_component) / ray_dir_component;
1105
1106 // Ensure t1 <= t2
1107 if (t1 > t2) {
1108 std::swap(t1, t2);
1109 }
1110
1111 // Update near and far intersection distances
1112 t_near = std::max(t_near, t1);
1113 t_far = std::min(t_far, t2);
1114
1115 // Early exit if no intersection possible
1116 if (t_near > t_far) {
1117 return result;
1118 }
1119 }
1120 }
1121
1122 // Check if intersection is in front of ray origin and within max distance
1123 if (t_far >= 0.0f) {
1124 // Use t_near if it's positive (ray starts outside box), otherwise t_far (ray starts inside box)
1125 float intersection_distance = (t_near >= 1e-6f) ? t_near : t_far;
1126 if (intersection_distance >= 1e-6f && (max_distance <= 0 || intersection_distance < max_distance)) {
1127 result.hit = true;
1128 result.distance = intersection_distance;
1129 result.primitive_UUID = primitive_id;
1130 result.intersection_point = origin + direction * intersection_distance;
1131
1132 // Calculate normal based on which face was hit
1133 // Determine which face of the voxel was hit by examining intersection point
1134 vec3 hit_point = result.intersection_point;
1135 vec3 box_center = (aabb_min + aabb_max) * 0.5f;
1136 vec3 box_extent = (aabb_max - aabb_min) * 0.5f;
1137
1138 // Find which face the hit point is closest to
1139 vec3 local_hit = hit_point - box_center;
1140 vec3 abs_local_hit = make_vec3(std::abs(local_hit.x), std::abs(local_hit.y), std::abs(local_hit.z));
1141
1142 // Determine which axis has the largest relative coordinate (closest to face)
1143 float rel_x = abs_local_hit.x / box_extent.x;
1144 float rel_y = abs_local_hit.y / box_extent.y;
1145 float rel_z = abs_local_hit.z / box_extent.z;
1146
1147 if (rel_x >= rel_y && rel_x >= rel_z) {
1148 // Hit X face
1149 result.normal = make_vec3((local_hit.x > 0) ? 1.0f : -1.0f, 0.0f, 0.0f);
1150 } else if (rel_y >= rel_z) {
1151 // Hit Y face
1152 result.normal = make_vec3(0.0f, (local_hit.y > 0) ? 1.0f : -1.0f, 0.0f);
1153 } else {
1154 // Hit Z face
1155 result.normal = make_vec3(0.0f, 0.0f, (local_hit.z > 0) ? 1.0f : -1.0f);
1156 }
1157 }
1158 }
1159 }
1160 // Add other primitive types as needed (DISK, etc.)
1161
1162 return result;
1163}
1164
1165void CollisionDetection::buildPrimitiveCache() {
1166 primitive_cache.clear();
1167
1168 // Get all primitive UUIDs from context
1169 std::vector<uint> all_primitives = context->getAllUUIDs();
1170
1171 // Cache primitive data for thread-safe access
1172 for (uint primitive_id: all_primitives) {
1173 if (context->doesPrimitiveExist(primitive_id)) {
1174 try {
1175 PrimitiveType type = context->getPrimitiveType(primitive_id);
1176 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_id);
1177
1178 primitive_cache[primitive_id] = CachedPrimitive(type, vertices);
1179 } catch (const std::exception &e) {
1180 // Skip this primitive if it no longer exists or can't be accessed
1181 // This can happen when UUIDs from previous contexts persist
1182 if (printmessages) {
1183 std::cout << "Warning: Skipping primitive " << primitive_id << " in cache build (not accessible: " << e.what() << ")" << std::endl;
1184 }
1185 continue;
1186 }
1187 }
1188 }
1189
1190 if (printmessages) {
1191 }
1192}
1193
1194bool CollisionDetection::triangleIntersect(const vec3 &origin, const vec3 &direction, const vec3 &v0, const vec3 &v1, const vec3 &v2, float &distance) {
1195 // Möller-Trumbore triangle intersection algorithm (optimized - no vec3 temporaries)
1196 // Note: Using 1e-5f to match LiDAR CUDA kernel tolerance for edge-case rays
1197 const float EPSILON = 1e-5f;
1198
1199 // Compute triangle edges directly as components (avoid vec3 constructors)
1200 float edge1_x = v1.x - v0.x, edge1_y = v1.y - v0.y, edge1_z = v1.z - v0.z;
1201 float edge2_x = v2.x - v0.x, edge2_y = v2.y - v0.y, edge2_z = v2.z - v0.z;
1202
1203 // Cross product: h = direction × edge2 (computed directly)
1204 float h_x = direction.y * edge2_z - direction.z * edge2_y;
1205 float h_y = direction.z * edge2_x - direction.x * edge2_z;
1206 float h_z = direction.x * edge2_y - direction.y * edge2_x;
1207
1208 // Dot product: a = edge1 · h
1209 float a = edge1_x * h_x + edge1_y * h_y + edge1_z * h_z;
1210
1211 if (a > -EPSILON && a < EPSILON) {
1212 return false; // Ray is parallel to triangle
1213 }
1214
1215 float f = 1.0f / a;
1216
1217 // Vector s = origin - v0 (computed as components)
1218 float s_x = origin.x - v0.x, s_y = origin.y - v0.y, s_z = origin.z - v0.z;
1219
1220 // u = f * (s · h)
1221 float u = f * (s_x * h_x + s_y * h_y + s_z * h_z);
1222
1223 if (u < -EPSILON || u > 1.0f + EPSILON) {
1224 return false;
1225 }
1226
1227 // Cross product: q = s × edge1 (computed directly)
1228 float q_x = s_y * edge1_z - s_z * edge1_y;
1229 float q_y = s_z * edge1_x - s_x * edge1_z;
1230 float q_z = s_x * edge1_y - s_y * edge1_x;
1231
1232 // v = f * (direction · q)
1233 float v = f * (direction.x * q_x + direction.y * q_y + direction.z * q_z);
1234
1235 if (v < -EPSILON || u + v > 1.0f + EPSILON) {
1236 return false;
1237 }
1238
1239 // t = f * (edge2 · q) - computed directly as dot product
1240 float t = f * (edge2_x * q_x + edge2_y * q_y + edge2_z * q_z);
1241
1242 if (t > EPSILON) {
1243 distance = t;
1244 return true;
1245 }
1246
1247 return false; // Line intersection but not ray intersection
1248}
1249
1250bool CollisionDetection::patchIntersect(const vec3 &origin, const vec3 &direction, const vec3 &v0, const vec3 &v1, const vec3 &v2, const vec3 &v3, float &distance) {
1251 // Patch (quadrilateral) intersection using radiation model algorithm
1252 const float EPSILON = 1e-8f;
1253
1254 // Calculate patch vectors and normal (same as radiation model)
1255 vec3 anchor = v0;
1256 vec3 normal = cross(v1 - v0, v2 - v0);
1257 normal.normalize();
1258
1259 vec3 a = v1 - v0; // First edge vector
1260 vec3 b = v3 - v0; // Second edge vector
1261
1262 // Ray-plane intersection
1263 float denom = direction * normal;
1264 if (std::abs(denom) > EPSILON) { // Not parallel to plane
1265 float t = (anchor - origin) * normal / denom;
1266
1267 if (t > EPSILON && t < 1e8f) { // Valid intersection distance
1268 // Find intersection point
1269 vec3 p = origin + direction * t;
1270 vec3 d = p - anchor;
1271
1272 // Project onto patch coordinate system
1273 float ddota = d * a;
1274 float ddotb = d * b;
1275
1276 // Check if point is within patch bounds (with epsilon tolerance for edge cases)
1277 if (ddota >= -EPSILON && ddota <= (a * a) + EPSILON && ddotb >= -EPSILON && ddotb <= (b * b) + EPSILON) {
1278 distance = t;
1279 return true;
1280 }
1281 }
1282 }
1283
1284 return false;
1285}
1286
1287#ifdef HELIOS_CUDA_AVAILABLE
1288#include <vector_types.h> // For make_float3
1289
1296std::vector<CollisionDetection::HitResult> CollisionDetection::castRaysGPU(const std::vector<RayQuery> &ray_queries, RayTracingStats &stats) {
1297 std::vector<HitResult> results;
1298 results.resize(ray_queries.size());
1299
1300 if (ray_queries.empty() || !gpu_acceleration_enabled) {
1301 return castRaysSoA(ray_queries, stats); // Use CPU implementation
1302 }
1303
1304 auto start = std::chrono::high_resolution_clock::now();
1305
1306 // Ensure BVH is built for GPU acceleration
1307 if (bvh_nodes.empty()) {
1308 if (printmessages) {
1309 }
1310 buildBVH();
1311 if (bvh_nodes.empty()) {
1312 helios_runtime_error("ERROR: BVH construction failed - no geometry available for ray tracing. Ensure primitives are properly added to the collision detection system.");
1313 }
1314 }
1315
1316 // Prepare ray data for GPU
1317 std::vector<float> ray_origins(ray_queries.size() * 3);
1318 std::vector<float> ray_directions(ray_queries.size() * 3);
1319 std::vector<float> ray_max_distances(ray_queries.size());
1320
1321 for (size_t i = 0; i < ray_queries.size(); i++) {
1322 vec3 normalized_dir = ray_queries[i].direction;
1323 if (normalized_dir.magnitude() > 1e-8f) {
1324 normalized_dir = normalized_dir / normalized_dir.magnitude();
1325 }
1326
1327 ray_origins[i * 3] = ray_queries[i].origin.x;
1328 ray_origins[i * 3 + 1] = ray_queries[i].origin.y;
1329 ray_origins[i * 3 + 2] = ray_queries[i].origin.z;
1330
1331 ray_directions[i * 3] = normalized_dir.x;
1332 ray_directions[i * 3 + 1] = normalized_dir.y;
1333 ray_directions[i * 3 + 2] = normalized_dir.z;
1334
1335 ray_max_distances[i] = ray_queries[i].max_distance;
1336 }
1337
1338 // CRITICAL: Use the SAME primitive_indices that the BVH was built with!
1339 // Building a fresh array from getAllUUIDs() causes index mismatches
1340 // because getAllUUIDs() order may differ from buildBVH() order
1341 std::vector<int> primitive_types(primitive_indices.size());
1342 std::vector<float3> primitive_vertices;
1343 std::vector<unsigned int> vertex_offsets(primitive_indices.size());
1344
1345 size_t triangle_count = 0;
1346 size_t patch_count = 0;
1347 size_t voxel_count = 0;
1348
1349 if (printmessages) {
1350 }
1351
1352 // Prepare primitive data arrays for GPU using existing primitive_indices
1353 size_t vertex_index = 0;
1354 for (size_t i = 0; i < primitive_indices.size(); i++) {
1355 vertex_offsets[i] = vertex_index;
1356
1357 PrimitiveType ptype = context->getPrimitiveType(primitive_indices[i]);
1358 primitive_types[i] = static_cast<int>(ptype);
1359
1360 std::vector<vec3> vertices = context->getPrimitiveVertices(primitive_indices[i]);
1361
1362 if (ptype == PRIMITIVE_TYPE_TRIANGLE) {
1363 triangle_count++;
1364 if (vertices.size() >= 3) {
1365 // Add 3 vertices for triangle, pad to 4 for alignment
1366 for (int v = 0; v < 3; v++) {
1367 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1368 }
1369 primitive_vertices.push_back(make_float3(0, 0, 0)); // Padding
1370 vertex_index += 4;
1371 } else {
1372 if (printmessages) {
1373 std::cout << "WARNING: Triangle primitive " << primitive_indices[i] << " has " << vertices.size() << " vertices" << std::endl;
1374 }
1375 // Add zeros for invalid triangle
1376 for (int v = 0; v < 4; v++) {
1377 primitive_vertices.push_back(make_float3(0, 0, 0));
1378 }
1379 vertex_index += 4;
1380 }
1381 } else if (ptype == PRIMITIVE_TYPE_PATCH) {
1382 patch_count++;
1383 if (vertices.size() >= 4) {
1384 // Add 4 vertices for patch
1385 for (int v = 0; v < 4; v++) {
1386 primitive_vertices.push_back(make_float3(vertices[v].x, vertices[v].y, vertices[v].z));
1387 }
1388 vertex_index += 4;
1389 } else {
1390 if (printmessages) {
1391 std::cout << "WARNING: Patch primitive " << primitive_indices[i] << " has " << vertices.size() << " vertices" << std::endl;
1392 }
1393 // Add zeros for invalid patch
1394 for (int v = 0; v < 4; v++) {
1395 primitive_vertices.push_back(make_float3(0, 0, 0));
1396 }
1397 vertex_index += 4;
1398 }
1399 } else if (ptype == PRIMITIVE_TYPE_VOXEL) {
1400 voxel_count++;
1401 // For voxels, compute AABB from 8 corner vertices
1402 // vertices was already retrieved above, reuse it
1403 vec3 voxel_min = vertices[0];
1404 vec3 voxel_max = vertices[0];
1405
1406 // Find min/max coordinates from all 8 vertices
1407 for (const auto &vertex: vertices) {
1408 voxel_min.x = std::min(voxel_min.x, vertex.x);
1409 voxel_min.y = std::min(voxel_min.y, vertex.y);
1410 voxel_min.z = std::min(voxel_min.z, vertex.z);
1411 voxel_max.x = std::max(voxel_max.x, vertex.x);
1412 voxel_max.y = std::max(voxel_max.y, vertex.y);
1413 voxel_max.z = std::max(voxel_max.z, vertex.z);
1414 }
1415
1416 // Store as: [min.x, min.y, min.z, max.x, max.y, max.z, 0, 0]
1417 primitive_vertices.push_back(make_float3(voxel_min.x, voxel_min.y, voxel_min.z)); // v0 = min
1418 primitive_vertices.push_back(make_float3(voxel_max.x, voxel_max.y, voxel_max.z)); // v1 = max
1419 primitive_vertices.push_back(make_float3(0, 0, 0)); // v2 = padding
1420 primitive_vertices.push_back(make_float3(0, 0, 0)); // v3 = padding
1421 vertex_index += 4;
1422 } else {
1423 // Unknown primitive type - add padding
1424 for (int v = 0; v < 4; v++) {
1425 primitive_vertices.push_back(make_float3(0, 0, 0));
1426 }
1427 vertex_index += 4;
1428 }
1429 }
1430
1431 if (printmessages) {
1432 }
1433
1434 if (primitive_indices.empty()) {
1435 if (printmessages) {
1436 std::cout << "No primitives found for GPU ray tracing, falling back to CPU" << std::endl;
1437 }
1438 return castRaysSoA(ray_queries, stats);
1439 }
1440
1441
1442 // Prepare result arrays
1443 std::vector<float> hit_distances(ray_queries.size());
1444 std::vector<unsigned int> hit_primitive_ids(ray_queries.size());
1445 std::vector<unsigned int> hit_counts(ray_queries.size());
1446
1447 // Convert CPU BVH nodes to GPU format
1448 std::vector<GPUBVHNode> gpu_bvh_nodes(bvh_nodes.size());
1449 for (size_t i = 0; i < bvh_nodes.size(); i++) {
1450 gpu_bvh_nodes[i].aabb_min = make_float3(bvh_nodes[i].aabb_min.x, bvh_nodes[i].aabb_min.y, bvh_nodes[i].aabb_min.z);
1451 gpu_bvh_nodes[i].aabb_max = make_float3(bvh_nodes[i].aabb_max.x, bvh_nodes[i].aabb_max.y, bvh_nodes[i].aabb_max.z);
1452 gpu_bvh_nodes[i].left_child = bvh_nodes[i].left_child;
1453 gpu_bvh_nodes[i].right_child = bvh_nodes[i].right_child;
1454 gpu_bvh_nodes[i].primitive_start = bvh_nodes[i].primitive_start;
1455 gpu_bvh_nodes[i].primitive_count = bvh_nodes[i].primitive_count;
1456 gpu_bvh_nodes[i].is_leaf = bvh_nodes[i].is_leaf ? 1 : 0;
1457 gpu_bvh_nodes[i].padding = 0;
1458
1459 // Debug problematic BVH nodes
1460 if (bvh_nodes[i].is_leaf && printmessages && i < 3) {
1461 std::cout << "BVH leaf node " << i << ": primitive_start=" << bvh_nodes[i].primitive_start << ", primitive_count=" << bvh_nodes[i].primitive_count << ", max_index=" << (bvh_nodes[i].primitive_start + bvh_nodes[i].primitive_count - 1)
1462 << std::endl;
1463 }
1464 }
1465
1466 // Launch high-performance GPU ray intersection kernel
1467 launchRayPrimitiveIntersection(gpu_bvh_nodes.data(), // BVH nodes
1468 static_cast<int>(gpu_bvh_nodes.size()), // Node count
1469 primitive_indices.data(), // Primitive indices
1470 static_cast<int>(primitive_indices.size()), // Primitive count
1471 primitive_types.data(), // Primitive types
1472 primitive_vertices.data(), // Primitive vertices (all types)
1473 vertex_offsets.data(), // Vertex offsets
1474 static_cast<int>(primitive_vertices.size()),
1475 ray_origins.data(), // Ray origins
1476 ray_directions.data(), // Ray directions
1477 ray_max_distances.data(), // Ray max distances
1478 static_cast<int>(ray_queries.size()), // Number of rays
1479 hit_distances.data(), // Output: hit distances
1480 hit_primitive_ids.data(), // Output: hit primitive IDs
1481 hit_counts.data(), // Output: hit counts
1482 true // Find closest hit
1483 );
1484
1485 // Convert GPU results back to HitResult format
1486 size_t hit_count = 0;
1487 size_t filtered_count = 0; // Hits found but filtered out
1488 for (size_t i = 0; i < ray_queries.size(); i++) {
1489 if (hit_counts[i] > 0 && hit_distances[i] <= ray_queries[i].max_distance) {
1490 results[i].hit = true;
1491 results[i].primitive_UUID = hit_primitive_ids[i];
1492 results[i].distance = hit_distances[i];
1493 results[i].intersection_point = ray_queries[i].origin + ray_queries[i].direction * hit_distances[i];
1494
1495 // Calculate normal for triangles and patches (required for LiDAR)
1496 PrimitiveType ptype = context->getPrimitiveType(hit_primitive_ids[i]);
1497 if (ptype == PRIMITIVE_TYPE_TRIANGLE || ptype == PRIMITIVE_TYPE_PATCH) {
1498 std::vector<vec3> vertices = context->getPrimitiveVertices(hit_primitive_ids[i]);
1499 if (vertices.size() >= 3) {
1500 vec3 edge1 = vertices[1] - vertices[0];
1501 vec3 edge2 = vertices[2] - vertices[0];
1502 results[i].normal = cross(edge1, edge2);
1503 if (results[i].normal.magnitude() > 1e-8f) {
1504 results[i].normal.normalize();
1505 }
1506 }
1507 }
1508
1509 hit_count++;
1510 } else {
1511 if (hit_counts[i] > 0) {
1512 filtered_count++; // Hit was found but distance exceeded max_distance
1513 }
1514 results[i].hit = false;
1515 results[i].primitive_UUID = 0;
1516 results[i].distance = std::numeric_limits<float>::max();
1517 }
1518 }
1519
1520 auto end = std::chrono::high_resolution_clock::now();
1521 double elapsed = std::chrono::duration<double, std::milli>(end - start).count();
1522
1523 stats.total_rays_cast = ray_queries.size();
1524 stats.total_hits = hit_count;
1525 double rays_per_second = ray_queries.size() / (elapsed / 1000.0);
1526
1527
1528 return results;
1529}
1530
1531// ================================================================
1532// SIMD-OPTIMIZED RAY TRACING METHODS
1533// ================================================================
1534
1535uint32_t CollisionDetection::rayAABBIntersectSIMD(const vec3 *ray_origins, const vec3 *ray_directions, const vec3 *aabb_mins, const vec3 *aabb_maxs, float *t_mins, float *t_maxs, int count) {
1536#ifdef __AVX2__
1537 if (count == 8) {
1538 // AVX2 implementation for 8 rays at once
1539 uint32_t hit_mask = 0;
1540
1541 for (int i = 0; i < 8; i += 8) {
1542 // Load 8 ray origins
1543 __m256 orig_x = _mm256_set_ps(ray_origins[i + 7].x, ray_origins[i + 6].x, ray_origins[i + 5].x, ray_origins[i + 4].x, ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1544 __m256 orig_y = _mm256_set_ps(ray_origins[i + 7].y, ray_origins[i + 6].y, ray_origins[i + 5].y, ray_origins[i + 4].y, ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1545 __m256 orig_z = _mm256_set_ps(ray_origins[i + 7].z, ray_origins[i + 6].z, ray_origins[i + 5].z, ray_origins[i + 4].z, ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1546
1547 // Load 8 ray directions
1548 __m256 dir_x = _mm256_set_ps(ray_directions[i + 7].x, ray_directions[i + 6].x, ray_directions[i + 5].x, ray_directions[i + 4].x, ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1549 __m256 dir_y = _mm256_set_ps(ray_directions[i + 7].y, ray_directions[i + 6].y, ray_directions[i + 5].y, ray_directions[i + 4].y, ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1550 __m256 dir_z = _mm256_set_ps(ray_directions[i + 7].z, ray_directions[i + 6].z, ray_directions[i + 5].z, ray_directions[i + 4].z, ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1551
1552 // Load 8 AABB mins
1553 __m256 aabb_min_x = _mm256_set_ps(aabb_mins[i + 7].x, aabb_mins[i + 6].x, aabb_mins[i + 5].x, aabb_mins[i + 4].x, aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1554 __m256 aabb_min_y = _mm256_set_ps(aabb_mins[i + 7].y, aabb_mins[i + 6].y, aabb_mins[i + 5].y, aabb_mins[i + 4].y, aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1555 __m256 aabb_min_z = _mm256_set_ps(aabb_mins[i + 7].z, aabb_mins[i + 6].z, aabb_mins[i + 5].z, aabb_mins[i + 4].z, aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1556
1557 // Load 8 AABB maxs
1558 __m256 aabb_max_x = _mm256_set_ps(aabb_maxs[i + 7].x, aabb_maxs[i + 6].x, aabb_maxs[i + 5].x, aabb_maxs[i + 4].x, aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1559 __m256 aabb_max_y = _mm256_set_ps(aabb_maxs[i + 7].y, aabb_maxs[i + 6].y, aabb_maxs[i + 5].y, aabb_maxs[i + 4].y, aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1560 __m256 aabb_max_z = _mm256_set_ps(aabb_maxs[i + 7].z, aabb_maxs[i + 6].z, aabb_maxs[i + 5].z, aabb_maxs[i + 4].z, aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1561
1562 // Calculate inverse directions
1563 __m256 inv_dir_x = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_x);
1564 __m256 inv_dir_y = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_y);
1565 __m256 inv_dir_z = _mm256_div_ps(_mm256_set1_ps(1.0f), dir_z);
1566
1567 // Calculate intersection distances for X axis
1568 __m256 t1_x = _mm256_mul_ps(_mm256_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1569 __m256 t2_x = _mm256_mul_ps(_mm256_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1570 __m256 tmin_x = _mm256_min_ps(t1_x, t2_x);
1571 __m256 tmax_x = _mm256_max_ps(t1_x, t2_x);
1572
1573 // Calculate intersection distances for Y axis
1574 __m256 t1_y = _mm256_mul_ps(_mm256_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1575 __m256 t2_y = _mm256_mul_ps(_mm256_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1576 __m256 tmin_y = _mm256_min_ps(t1_y, t2_y);
1577 __m256 tmax_y = _mm256_max_ps(t1_y, t2_y);
1578
1579 // Calculate intersection distances for Z axis
1580 __m256 t1_z = _mm256_mul_ps(_mm256_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1581 __m256 t2_z = _mm256_mul_ps(_mm256_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1582 __m256 tmin_z = _mm256_min_ps(t1_z, t2_z);
1583 __m256 tmax_z = _mm256_max_ps(t1_z, t2_z);
1584
1585 // Find intersection interval
1586 __m256 t_min_final = _mm256_max_ps(_mm256_max_ps(tmin_x, tmin_y), tmin_z);
1587 __m256 t_max_final = _mm256_min_ps(_mm256_min_ps(tmax_x, tmax_y), tmax_z);
1588
1589 // Store results
1590 _mm256_store_ps(&t_mins[i], t_min_final);
1591 _mm256_store_ps(&t_maxs[i], t_max_final);
1592
1593 // Check for intersection: t_max >= 0 && t_min <= t_max
1594 __m256 zero = _mm256_set1_ps(0.0f);
1595 __m256 hits = _mm256_and_ps(_mm256_cmp_ps(t_max_final, zero, _CMP_GE_OS), _mm256_cmp_ps(t_min_final, t_max_final, _CMP_LE_OS));
1596
1597 // Convert to bitmask
1598 hit_mask |= _mm256_movemask_ps(hits);
1599 }
1600
1601 return hit_mask;
1602 }
1603#endif
1604
1605#ifdef __SSE4_1__
1606 if (count == 4) {
1607 // SSE implementation for 4 rays at once
1608 uint32_t hit_mask = 0;
1609
1610 for (int i = 0; i < 4; i += 4) {
1611 // Load 4 ray origins
1612 __m128 orig_x = _mm_set_ps(ray_origins[i + 3].x, ray_origins[i + 2].x, ray_origins[i + 1].x, ray_origins[i + 0].x);
1613 __m128 orig_y = _mm_set_ps(ray_origins[i + 3].y, ray_origins[i + 2].y, ray_origins[i + 1].y, ray_origins[i + 0].y);
1614 __m128 orig_z = _mm_set_ps(ray_origins[i + 3].z, ray_origins[i + 2].z, ray_origins[i + 1].z, ray_origins[i + 0].z);
1615
1616 // Load 4 ray directions
1617 __m128 dir_x = _mm_set_ps(ray_directions[i + 3].x, ray_directions[i + 2].x, ray_directions[i + 1].x, ray_directions[i + 0].x);
1618 __m128 dir_y = _mm_set_ps(ray_directions[i + 3].y, ray_directions[i + 2].y, ray_directions[i + 1].y, ray_directions[i + 0].y);
1619 __m128 dir_z = _mm_set_ps(ray_directions[i + 3].z, ray_directions[i + 2].z, ray_directions[i + 1].z, ray_directions[i + 0].z);
1620
1621 // Load 4 AABB mins
1622 __m128 aabb_min_x = _mm_set_ps(aabb_mins[i + 3].x, aabb_mins[i + 2].x, aabb_mins[i + 1].x, aabb_mins[i + 0].x);
1623 __m128 aabb_min_y = _mm_set_ps(aabb_mins[i + 3].y, aabb_mins[i + 2].y, aabb_mins[i + 1].y, aabb_mins[i + 0].y);
1624 __m128 aabb_min_z = _mm_set_ps(aabb_mins[i + 3].z, aabb_mins[i + 2].z, aabb_mins[i + 1].z, aabb_mins[i + 0].z);
1625
1626 // Load 4 AABB maxs
1627 __m128 aabb_max_x = _mm_set_ps(aabb_maxs[i + 3].x, aabb_maxs[i + 2].x, aabb_maxs[i + 1].x, aabb_maxs[i + 0].x);
1628 __m128 aabb_max_y = _mm_set_ps(aabb_maxs[i + 3].y, aabb_maxs[i + 2].y, aabb_maxs[i + 1].y, aabb_maxs[i + 0].y);
1629 __m128 aabb_max_z = _mm_set_ps(aabb_maxs[i + 3].z, aabb_maxs[i + 2].z, aabb_maxs[i + 1].z, aabb_maxs[i + 0].z);
1630
1631 // Calculate inverse directions
1632 __m128 inv_dir_x = _mm_div_ps(_mm_set1_ps(1.0f), dir_x);
1633 __m128 inv_dir_y = _mm_div_ps(_mm_set1_ps(1.0f), dir_y);
1634 __m128 inv_dir_z = _mm_div_ps(_mm_set1_ps(1.0f), dir_z);
1635
1636 // Calculate intersection distances for X axis
1637 __m128 t1_x = _mm_mul_ps(_mm_sub_ps(aabb_min_x, orig_x), inv_dir_x);
1638 __m128 t2_x = _mm_mul_ps(_mm_sub_ps(aabb_max_x, orig_x), inv_dir_x);
1639 __m128 tmin_x = _mm_min_ps(t1_x, t2_x);
1640 __m128 tmax_x = _mm_max_ps(t1_x, t2_x);
1641
1642 // Calculate intersection distances for Y axis
1643 __m128 t1_y = _mm_mul_ps(_mm_sub_ps(aabb_min_y, orig_y), inv_dir_y);
1644 __m128 t2_y = _mm_mul_ps(_mm_sub_ps(aabb_max_y, orig_y), inv_dir_y);
1645 __m128 tmin_y = _mm_min_ps(t1_y, t2_y);
1646 __m128 tmax_y = _mm_max_ps(t1_y, t2_y);
1647
1648 // Calculate intersection distances for Z axis
1649 __m128 t1_z = _mm_mul_ps(_mm_sub_ps(aabb_min_z, orig_z), inv_dir_z);
1650 __m128 t2_z = _mm_mul_ps(_mm_sub_ps(aabb_max_z, orig_z), inv_dir_z);
1651 __m128 tmin_z = _mm_min_ps(t1_z, t2_z);
1652 __m128 tmax_z = _mm_max_ps(t1_z, t2_z);
1653
1654 // Find intersection interval
1655 __m128 t_min_final = _mm_max_ps(_mm_max_ps(tmin_x, tmin_y), tmin_z);
1656 __m128 t_max_final = _mm_min_ps(_mm_min_ps(tmax_x, tmax_y), tmax_z);
1657
1658 // Store results
1659 _mm_store_ps(&t_mins[i], t_min_final);
1660 _mm_store_ps(&t_maxs[i], t_max_final);
1661
1662 // Check for intersection: t_max >= 0 && t_min <= t_max
1663 __m128 zero = _mm_set1_ps(0.0f);
1664 __m128 hits = _mm_and_ps(_mm_cmpge_ps(t_max_final, zero), _mm_cmple_ps(t_min_final, t_max_final));
1665
1666 // Convert to bitmask
1667 hit_mask |= _mm_movemask_ps(hits);
1668 }
1669
1670 return hit_mask;
1671 }
1672#endif
1673
1674 // Fallback to scalar implementation
1675 uint32_t hit_mask = 0;
1676 for (int i = 0; i < count; ++i) {
1677 if (rayAABBIntersect(ray_origins[i], ray_directions[i], aabb_mins[i], aabb_maxs[i], t_mins[i], t_maxs[i])) {
1678 hit_mask |= (1 << i);
1679 }
1680 }
1681 return hit_mask;
1682}
1683
1684void CollisionDetection::traverseBVHSIMD(const vec3 *ray_origins, const vec3 *ray_directions, int count, HitResult *results) {
1685 if (bvh_nodes.empty()) {
1686 // Initialize all results as misses
1687 for (int i = 0; i < count; ++i) {
1688 results[i] = HitResult();
1689 }
1690 return;
1691 }
1692
1693 // Determine SIMD batch size based on available instructions
1694 int simd_batch_size = 1; // Default to scalar
1695#ifdef __AVX2__
1696 simd_batch_size = 8;
1697#elif defined(__SSE4_1__)
1698 simd_batch_size = 4;
1699#endif
1700
1701 // Process rays in SIMD batches
1702 for (int batch_start = 0; batch_start < count; batch_start += simd_batch_size) {
1703 int batch_count = std::min(simd_batch_size, count - batch_start);
1704
1705 // Initialize batch results
1706 for (int i = 0; i < batch_count; ++i) {
1707 results[batch_start + i] = HitResult();
1708 }
1709
1710 if (batch_count >= simd_batch_size && simd_batch_size > 1) {
1711 // SIMD-optimized batch processing
1712 traverseBVHSIMDImpl(&ray_origins[batch_start], &ray_directions[batch_start], batch_count, &results[batch_start]);
1713 } else {
1714 // Process remaining rays individually
1715 for (int i = 0; i < batch_count; ++i) {
1716 int ray_idx = batch_start + i;
1717 results[ray_idx] = castRay(RayQuery(ray_origins[ray_idx], ray_directions[ray_idx]));
1718 }
1719 }
1720 }
1721}
1722
1723void CollisionDetection::traverseBVHSIMDImpl(const vec3 *ray_origins, const vec3 *ray_directions, int count, HitResult *results) {
1724 const size_t MAX_STACK_SIZE = 64;
1725
1726 // Per-ray data structures - using aligned arrays for SIMD efficiency
1727 alignas(32) uint32_t node_stacks[8][MAX_STACK_SIZE]; // 8 stacks for up to 8 rays
1728 alignas(32) uint32_t stack_tops[8] = {0}; // Current stack positions
1729 alignas(32) float closest_distances[8]; // Closest hit distances per ray
1730 alignas(32) bool ray_active[8]; // Which rays are still active
1731
1732 // Initialize per-ray state
1733 for (int i = 0; i < count; ++i) {
1734 node_stacks[i][0] = 0; // Start with root node
1735 stack_tops[i] = 1;
1736 closest_distances[i] = std::numeric_limits<float>::max();
1737 ray_active[i] = true;
1738 results[i] = HitResult(); // Initialize as miss
1739 }
1740
1741 // Main traversal loop - continue while any ray is active
1742 while (true) {
1743 bool any_active = false;
1744 for (int i = 0; i < count; ++i) {
1745 if (ray_active[i] && stack_tops[i] > 0) {
1746 any_active = true;
1747 break;
1748 }
1749 }
1750 if (!any_active)
1751 break;
1752
1753 // Collect next nodes to test for active rays
1754 alignas(32) vec3 test_aabb_mins[8];
1755 alignas(32) vec3 test_aabb_maxs[8];
1756 alignas(32) uint32_t test_node_indices[8];
1757 alignas(32) int test_ray_indices[8];
1758 int test_count = 0;
1759
1760 for (int i = 0; i < count; ++i) {
1761 if (ray_active[i] && stack_tops[i] > 0) {
1762 uint32_t node_idx = node_stacks[i][--stack_tops[i]];
1763 const BVHNode &node = bvh_nodes[node_idx];
1764
1765 test_aabb_mins[test_count] = node.aabb_min;
1766 test_aabb_maxs[test_count] = node.aabb_max;
1767 test_node_indices[test_count] = node_idx;
1768 test_ray_indices[test_count] = i;
1769 test_count++;
1770
1771 if (test_count == count)
1772 break; // Batch is full
1773 }
1774 }
1775
1776 if (test_count == 0)
1777 break;
1778
1779 // Prepare ray data for SIMD intersection test
1780 alignas(32) vec3 batch_origins[8];
1781 alignas(32) vec3 batch_directions[8];
1782 alignas(32) float t_mins[8];
1783 alignas(32) float t_maxs[8];
1784
1785 for (int i = 0; i < test_count; ++i) {
1786 int ray_idx = test_ray_indices[i];
1787 batch_origins[i] = ray_origins[ray_idx];
1788 batch_directions[i] = ray_directions[ray_idx];
1789 }
1790
1791 // Perform SIMD AABB intersection test
1792 uint32_t hit_mask = rayAABBIntersectSIMD(batch_origins, batch_directions, test_aabb_mins, test_aabb_maxs, t_mins, t_maxs, test_count);
1793
1794 // Process intersection results
1795 for (int i = 0; i < test_count; ++i) {
1796 if (!(hit_mask & (1 << i)))
1797 continue; // Ray missed this AABB
1798
1799 int ray_idx = test_ray_indices[i];
1800 uint32_t node_idx = test_node_indices[i];
1801 const BVHNode &node = bvh_nodes[node_idx];
1802
1803 if (t_mins[i] > closest_distances[ray_idx])
1804 continue; // Beyond closest hit
1805
1806 if (node.is_leaf) {
1807 // Test primitives in leaf node
1808 for (uint32_t prim_idx = node.primitive_start; prim_idx < node.primitive_start + node.primitive_count; ++prim_idx) {
1809
1810 uint32_t primitive_id = primitive_indices[prim_idx];
1811
1812 // Use thread-safe primitive intersection
1813 HitResult prim_result = intersectPrimitiveThreadSafe(batch_origins[i], batch_directions[i], primitive_id, closest_distances[ray_idx]);
1814 if (prim_result.hit && prim_result.distance < closest_distances[ray_idx]) {
1815 closest_distances[ray_idx] = prim_result.distance;
1816 results[ray_idx] = prim_result;
1817 }
1818 }
1819 } else {
1820 // Add child nodes to stack (if space available)
1821 if (stack_tops[ray_idx] < MAX_STACK_SIZE - 2) {
1822 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.left_child;
1823 node_stacks[ray_idx][stack_tops[ray_idx]++] = node.right_child;
1824 } else {
1825 // Stack overflow - mark ray as inactive to prevent infinite loop
1826 ray_active[ray_idx] = false;
1827 }
1828 }
1829 }
1830 }
1831}
1832#endif
1833
1834
1835bool CollisionDetection::rayAABBIntersectPrimitive(const helios::vec3 &origin, const helios::vec3 &direction, const helios::vec3 &aabb_min, const helios::vec3 &aabb_max, float &distance) {
1836 // Ray-AABB intersection using slab method
1837 // Optimized version with early termination
1838
1839 const float EPSILON = 1e-8f;
1840
1841 // Calculate t values for each slab
1842 float t_min_x = (aabb_min.x - origin.x) / direction.x;
1843 float t_max_x = (aabb_max.x - origin.x) / direction.x;
1844
1845 // Handle negative direction components
1846 if (direction.x < 0.0f) {
1847 float temp = t_min_x;
1848 t_min_x = t_max_x;
1849 t_max_x = temp;
1850 }
1851
1852 float t_min_y = (aabb_min.y - origin.y) / direction.y;
1853 float t_max_y = (aabb_max.y - origin.y) / direction.y;
1854
1855 if (direction.y < 0.0f) {
1856 float temp = t_min_y;
1857 t_min_y = t_max_y;
1858 t_max_y = temp;
1859 }
1860
1861 // Check for early termination in X-Y
1862 float t_min = std::max(t_min_x, t_min_y);
1863 float t_max = std::min(t_max_x, t_max_y);
1864
1865 if (t_min > t_max) {
1866 return false; // No intersection
1867 }
1868
1869 float t_min_z = (aabb_min.z - origin.z) / direction.z;
1870 float t_max_z = (aabb_max.z - origin.z) / direction.z;
1871
1872 if (direction.z < 0.0f) {
1873 float temp = t_min_z;
1874 t_min_z = t_max_z;
1875 t_max_z = temp;
1876 }
1877
1878 // Final intersection test
1879 t_min = std::max(t_min, t_min_z);
1880 t_max = std::min(t_max, t_max_z);
1881
1882 if (t_min > t_max || t_max < EPSILON) {
1883 return false; // No intersection or behind ray
1884 }
1885
1886 // Set distance to closest intersection point
1887 distance = (t_min > EPSILON) ? t_min : t_max;
1888
1889 return distance > EPSILON;
1890}