1.3.72
 
Loading...
Searching...
No Matches
OptiX8Backend.cpp
Go to the documentation of this file.
1
16#include "OptiX8Backend.h"
17
18// OptiX function table definition (must be in exactly one .cpp)
19#include <optix_function_table_definition.h>
20
21#include "Context.h"
22
23#include <algorithm>
24#include <cfloat>
25#include <fstream>
26
27namespace helios {
28
29// ---------------------------------------------------------------------------
30// Construction / destruction
31// ---------------------------------------------------------------------------
32
33OptiX8Backend::OptiX8Backend() = default;
34
35bool OptiX8Backend::probe() noexcept {
36 try {
37 int device_count = 0;
38 cudaError_t rc = cudaGetDeviceCount(&device_count);
39 if (rc != cudaSuccess || device_count == 0) {
40 return false;
41 }
42 OptixResult optix_rc = optixInit();
43 return (optix_rc == OPTIX_SUCCESS);
44 } catch (...) {
45 return false;
46 }
47}
48
49OptiX8Backend::~OptiX8Backend() {
50 if (is_initialized) {
51 shutdown();
52 }
53}
54
55// ---------------------------------------------------------------------------
56// Lifecycle
57// ---------------------------------------------------------------------------
58
60 // Initialize CUDA
61 CUDA_CHECK(cudaFree(nullptr)); // Force CUDA context initialization
62
63 // Create CUDA stream
64 CUDA_CHECK(cudaStreamCreate(&cuda_stream));
65
66 // Initialize OptiX function table from the loaded driver
67 OPTIX_CHECK(optixInit());
68
69 // Create OptiX device context
70 CUcontext cuda_context = nullptr; // use current context
71 OptixDeviceContextOptions ctx_options = {};
72 ctx_options.logCallbackFunction = [](unsigned int level, const char *tag, const char *message, void *) {
73 if (level <= 2) {
74 std::cerr << "[OptiX][" << tag << "] " << message << "\n";
75 }
76 };
77 ctx_options.logCallbackLevel = 2;
78 OPTIX_CHECK(optixDeviceContextCreate(cuda_context, &ctx_options, &optix_context));
79
80 // Load device code (PTX or OptixIR) and compile module
81 const std::string device_code_path = findDeviceCodeFile();
82 std::ifstream file(device_code_path, std::ios::binary | std::ios::ate);
83 if (!file.is_open()) {
84 helios_runtime_error("ERROR (OptiX8Backend::initialize): Could not open device code file: " + device_code_path);
85 }
86 const std::streamsize file_size = file.tellg();
87 file.seekg(0, std::ios::beg);
88 std::vector<char> device_code(file_size);
89 if (!file.read(device_code.data(), file_size)) {
90 helios_runtime_error("ERROR (OptiX8Backend::initialize): Could not read device code file: " + device_code_path);
91 }
92
93 OptixModuleCompileOptions module_options = {};
94 module_options.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
95#ifndef NDEBUG
96 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
97 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
98#else
99 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
100 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
101#endif
102
103 OptixPipelineCompileOptions pipeline_options = {};
104 pipeline_options.usesMotionBlur = 0;
105 pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
106 pipeline_options.numPayloadValues = 2; // two uint32 for pointer-in-registers
107 pipeline_options.numAttributeValues = 2; // UUID + face in attributes
108 pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
109 pipeline_options.pipelineLaunchParamsVariableName = "params";
110 pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
111
112 char log[4096];
113 size_t log_size = sizeof(log);
114
115 OPTIX_CHECK(optixModuleCreate(
116 optix_context,
117 &module_options,
118 &pipeline_options,
119 device_code.data(),
120 static_cast<size_t>(file_size),
121 log, &log_size,
122 &optix_module));
123
124 // ---- Create program groups ----
125 OptixProgramGroupOptions pg_options = {};
126
127 // Raygen programs
128 auto createRaygen = [&](const char *entry, OptixProgramGroup &pg) {
129 OptixProgramGroupDesc desc = {};
130 desc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
131 desc.raygen.module = optix_module;
132 desc.raygen.entryFunctionName = entry;
133 log_size = sizeof(log);
134 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
135 (void)log_size;
136 };
137
138 createRaygen("__raygen__direct", pg_raygen_direct);
139 createRaygen("__raygen__diffuse", pg_raygen_diffuse);
140 createRaygen("__raygen__camera", pg_raygen_camera);
141 createRaygen("__raygen__pixel_label", pg_raygen_pixel_label);
142
143 // Miss programs
144 auto createMiss = [&](const char *entry, OptixProgramGroup &pg) {
145 OptixProgramGroupDesc desc = {};
146 desc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
147 desc.miss.module = optix_module;
148 desc.miss.entryFunctionName = entry;
149 log_size = sizeof(log);
150 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
151 (void)log_size;
152 };
153
154 createMiss("__miss__direct", pg_miss_direct);
155 createMiss("__miss__diffuse", pg_miss_diffuse);
156 createMiss("__miss__camera", pg_miss_camera);
157 createMiss("__miss__pixel_label", pg_miss_pixel_label);
158
159 // Hit groups: one per ray type (4 total). All geometry uses __intersection__patch,
160 // which dispatches internally on primitive type. With a single GAS and numSbtRecords=1,
161 // the SBT hit record index = sbt_offset from optixTrace (stride=0 for all ray types).
162
163 auto createHitGroup = [&](const char *ch_entry, const char *is_entry, OptixProgramGroup &pg) {
164 OptixProgramGroupDesc desc = {};
165 desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
166 desc.hitgroup.moduleCH = optix_module;
167 desc.hitgroup.entryFunctionNameCH = ch_entry;
168 desc.hitgroup.moduleIS = optix_module;
169 desc.hitgroup.entryFunctionNameIS = is_entry;
170 desc.hitgroup.moduleAH = nullptr;
171 desc.hitgroup.entryFunctionNameAH = nullptr;
172 log_size = sizeof(log);
173 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
174 (void)log_size;
175 };
176
177 createHitGroup("__closesthit__direct", "__intersection__patch", pg_hit_direct);
178 createHitGroup("__closesthit__diffuse", "__intersection__patch", pg_hit_diffuse);
179 createHitGroup("__closesthit__camera", "__intersection__patch", pg_hit_camera);
180 createHitGroup("__closesthit__pixel_label", "__intersection__patch", pg_hit_pixel_label);
181
182 // ---- Create pipeline ----
183 OptixProgramGroup all_groups[] = {
184 pg_raygen_direct, pg_raygen_diffuse, pg_raygen_camera, pg_raygen_pixel_label,
185 pg_miss_direct, pg_miss_diffuse, pg_miss_camera, pg_miss_pixel_label,
186 pg_hit_direct, pg_hit_diffuse, pg_hit_camera, pg_hit_pixel_label
187 };
188
189 OptixPipelineLinkOptions link_options = {};
190 link_options.maxTraceDepth = 1;
191
192 log_size = sizeof(log);
193 OPTIX_CHECK(optixPipelineCreate(
194 optix_context,
195 &pipeline_options,
196 &link_options,
197 all_groups,
198 sizeof(all_groups) / sizeof(all_groups[0]),
199 log, &log_size,
200 &optix_pipeline));
201
202 // Set pipeline stack sizes using OptiX utilities
203 OptixStackSizes stack_sizes = {};
204 for (auto &pg : all_groups) {
205 OPTIX_CHECK(optixUtilAccumulateStackSizes(pg, &stack_sizes, optix_pipeline));
206 }
207 uint32_t max_trace_depth = 1;
208 uint32_t direct_callable_stack_size_from_traversal = 0;
209 uint32_t direct_callable_stack_size_from_state = 0;
210 uint32_t continuation_stack_size = 0;
211 OPTIX_CHECK(optixUtilComputeStackSizes(
212 &stack_sizes,
213 max_trace_depth,
214 0, // maxCCDepth (no continuation callables)
215 0, // maxDCDepth (no direct callables)
216 &direct_callable_stack_size_from_traversal,
217 &direct_callable_stack_size_from_state,
218 &continuation_stack_size));
219 OPTIX_CHECK(optixPipelineSetStackSize(
220 optix_pipeline,
221 direct_callable_stack_size_from_traversal,
222 direct_callable_stack_size_from_state,
223 continuation_stack_size,
224 1 /* maxTraversableGraphDepth */));
225
226 // Allocate device-side launch params buffer
227 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_params), sizeof(OptiX8LaunchParams)));
228 memset(&h_params, 0, sizeof(h_params));
229
230 is_initialized = true;
231}
232
234 if (!is_initialized) {
235 return;
236 }
237
238 // Synchronize before cleanup
239 if (cuda_stream) {
240 cudaStreamSynchronize(cuda_stream);
241 }
242
243 // Free device buffers
244 freeGeometryBuffers();
245 freeMaterialBuffers();
246
247 auto freePtr = [](CUdeviceptr &ptr) {
248 if (ptr) { cudaFree(reinterpret_cast<void *>(ptr)); ptr = 0; }
249 };
250
251 freePtr(d_radiation_in);
252 freePtr(d_radiation_out_top);
253 freePtr(d_radiation_out_bottom);
254 freePtr(d_scatter_buff_top);
255 freePtr(d_scatter_buff_bottom);
256 freePtr(d_radiation_in_camera);
257 freePtr(d_scatter_buff_top_cam);
258 freePtr(d_scatter_buff_bottom_cam);
259 freePtr(d_radiation_specular);
260 freePtr(d_Rsky);
261 freePtr(d_camera_pixel_label);
262 freePtr(d_camera_pixel_depth);
263 freePtr(d_source_positions);
264 freePtr(d_source_rotations);
265 freePtr(d_source_widths);
266 freePtr(d_source_types);
267 freePtr(d_source_fluxes);
268 freePtr(d_source_fluxes_cam);
269 freePtr(d_diffuse_flux);
270 freePtr(d_diffuse_extinction);
271 freePtr(d_diffuse_peak_dir);
272 freePtr(d_diffuse_dist_norm);
273 freePtr(d_sky_radiance_params);
274 freePtr(d_camera_sky_radiance);
275 freePtr(d_solar_disk_radiance);
276 freePtr(d_band_launch_flag);
277 freePtr(d_mask_data);
278 freePtr(d_mask_sizes);
279 freePtr(d_mask_IDs);
280 freePtr(d_uv_data);
281 freePtr(d_uv_IDs);
282 freePtr(d_params);
283
284 // Free SBT device memory
285 if (d_raygen_records) { cudaFree(reinterpret_cast<void *>(d_raygen_records)); d_raygen_records = 0; }
286 if (d_miss_records) { cudaFree(reinterpret_cast<void *>(d_miss_records)); d_miss_records = 0; }
287 if (d_hitgroup_records) { cudaFree(reinterpret_cast<void *>(d_hitgroup_records)); d_hitgroup_records = 0; }
288
289 // Free GAS
290 if (d_gas_output) { cudaFree(reinterpret_cast<void *>(d_gas_output)); d_gas_output = 0; }
291
292 // Destroy program groups
293 auto destroyPG = [](OptixProgramGroup &pg) {
294 if (pg) { optixProgramGroupDestroy(pg); pg = nullptr; }
295 };
296 destroyPG(pg_raygen_direct); destroyPG(pg_raygen_diffuse);
297 destroyPG(pg_raygen_camera); destroyPG(pg_raygen_pixel_label);
298 destroyPG(pg_miss_direct); destroyPG(pg_miss_diffuse);
299 destroyPG(pg_miss_camera); destroyPG(pg_miss_pixel_label);
300 destroyPG(pg_hit_direct); destroyPG(pg_hit_diffuse);
301 destroyPG(pg_hit_camera); destroyPG(pg_hit_pixel_label);
302
303 if (optix_pipeline) { optixPipelineDestroy(optix_pipeline); optix_pipeline = nullptr; }
304 if (optix_module) { optixModuleDestroy(optix_module); optix_module = nullptr; }
305 if (optix_context) { optixDeviceContextDestroy(optix_context); optix_context = nullptr; }
306 if (cuda_stream) { cudaStreamDestroy(cuda_stream); cuda_stream = nullptr; }
307
308 is_initialized = false;
309}
310
311// ---------------------------------------------------------------------------
312// Geometry management
313// ---------------------------------------------------------------------------
314
316 validateGeometryBeforeUpload(geometry);
317
318 // Validate that all primitive types are supported by the OptiX 8.1 backend.
319 // Supported: patch (0), triangle (1), tile (3). Disk (2), voxel (4), and
320 // bbox (5) intersection programs are not yet implemented.
321 if (geometry.disk_count > 0) {
322 helios_runtime_error("ERROR (OptiX8Backend::updateGeometry): Scene contains disk primitives, "
323 "which are not yet supported by the OptiX 8.1 backend.");
324 }
325 if (geometry.voxel_count > 0) {
326 helios_runtime_error("ERROR (OptiX8Backend::updateGeometry): Scene contains voxel primitives, "
327 "which are not yet supported by the OptiX 8.1 backend.");
328 }
329
330 freeGeometryBuffers();
331
332 auto upload = [this](CUdeviceptr &d_ptr, const void *src, size_t bytes) {
333 if (bytes > 0 && src) {
334 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_ptr), bytes));
335 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
336 }
337 };
338
339 upload(d_transform_matrix, geometry.transform_matrices.data(), geometry.transform_matrices.size() * sizeof(float));
340 upload(d_primitive_type, geometry.primitive_types.data(), geometry.primitive_types.size() * sizeof(uint32_t));
341 upload(d_primitive_positions, geometry.primitive_positions.data(), geometry.primitive_positions.size() * sizeof(uint32_t));
342 upload(d_primitive_uuid_arr, geometry.primitive_UUIDs.data(), geometry.primitive_UUIDs.size() * sizeof(uint32_t));
343 upload(d_primitiveID, geometry.primitive_IDs.data(), geometry.primitive_IDs.size() * sizeof(uint32_t));
344 upload(d_objectID, geometry.object_IDs.data(), geometry.object_IDs.size() * sizeof(uint32_t));
345 upload(d_twosided_flag, geometry.twosided_flags.data(), geometry.twosided_flags.size() * sizeof(char));
346 upload(d_primitive_solid_fraction, geometry.solid_fractions.data(), geometry.solid_fractions.size() * sizeof(float));
347
348 // object_subdivisions: vector<helios::int2> → flat int32 array (2 ints per prim)
349 if (!geometry.object_subdivisions.empty()) {
350 const size_t bytes = geometry.object_subdivisions.size() * sizeof(helios::int2);
351 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_object_subdivisions), bytes));
352 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_object_subdivisions),
353 geometry.object_subdivisions.data(), bytes, cudaMemcpyHostToDevice));
354 }
355
356 // Per-type geometry
357 if (geometry.patch_count > 0) {
358 upload(d_patch_vertices, geometry.patches.vertices.data(),
359 geometry.patches.vertices.size() * sizeof(helios::vec3));
360 upload(d_patch_UUIDs, geometry.patches.UUIDs.data(),
361 geometry.patches.UUIDs.size() * sizeof(uint32_t));
362 }
363 if (geometry.triangles.count > 0) {
364 upload(d_triangle_vertices, geometry.triangles.vertices.data(),
365 geometry.triangles.vertices.size() * sizeof(helios::vec3));
366 upload(d_triangle_UUIDs, geometry.triangles.UUIDs.data(),
367 geometry.triangles.UUIDs.size() * sizeof(uint32_t));
368 }
369 if (geometry.disk_count > 0) {
370 upload(d_disk_centers, geometry.disk_centers.data(), geometry.disk_centers.size() * sizeof(helios::vec3));
371 upload(d_disk_radii, geometry.disk_radii.data(), geometry.disk_radii.size() * sizeof(float));
372 upload(d_disk_normals, geometry.disk_normals.data(), geometry.disk_normals.size() * sizeof(helios::vec3));
373 upload(d_disk_UUIDs, geometry.disk_UUIDs.data(), geometry.disk_UUIDs.size() * sizeof(uint32_t));
374 }
375 if (geometry.tiles.count > 0) {
376 upload(d_tile_vertices, geometry.tiles.vertices.data(),
377 geometry.tiles.vertices.size() * sizeof(helios::vec3));
378 upload(d_tile_UUIDs, geometry.tiles.UUIDs.data(),
379 geometry.tiles.UUIDs.size() * sizeof(uint32_t));
380 }
381 if (geometry.voxels.count > 0) {
382 upload(d_voxel_vertices, geometry.voxels.vertices.data(),
383 geometry.voxels.vertices.size() * sizeof(helios::vec3));
384 upload(d_voxel_UUIDs, geometry.voxels.UUIDs.data(),
385 geometry.voxels.UUIDs.size() * sizeof(uint32_t));
386 }
387 if (geometry.bbox_count > 0) {
388 upload(d_bbox_vertices, geometry.bboxes.vertices.data(),
389 geometry.bboxes.vertices.size() * sizeof(helios::vec3));
390 upload(d_bbox_UUIDs, geometry.bboxes.UUIDs.data(),
391 geometry.bboxes.UUIDs.size() * sizeof(uint32_t));
392 }
393
394 // Extend primitive_type and primitive_uuid arrays to include bbox entries (type=5).
395 // OptiX AABB indices are global: indices [0, Nprims) are real primitives,
396 // indices [Nprims, Nprims+Nbboxes) are bbox faces. The intersection dispatch
397 // program reads params.primitive_type[optixGetPrimitiveIndex()] and needs
398 // type-5 entries at those positions.
399 if (geometry.bbox_count > 0) {
400 const size_t Nprims = geometry.primitive_count;
401 const size_t Nbboxes = geometry.bbox_count;
402
403 freeCUdeviceptr(d_primitive_type);
404 std::vector<uint32_t> ext_types(geometry.primitive_types);
405 ext_types.resize(Nprims + Nbboxes, 5u);
406 upload(d_primitive_type, ext_types.data(), ext_types.size() * sizeof(uint32_t));
407
408 freeCUdeviceptr(d_primitive_uuid_arr);
409 std::vector<uint32_t> ext_uuids(geometry.primitive_UUIDs);
410 ext_uuids.insert(ext_uuids.end(),
411 geometry.bboxes.UUIDs.begin(), geometry.bboxes.UUIDs.end());
412 upload(d_primitive_uuid_arr, ext_uuids.data(), ext_uuids.size() * sizeof(uint32_t));
413 }
414
415 // ---- Texture mask and UV data ----
416 {
417 // Compute per-mask offsets (cumulative start index into mask_data)
418 std::vector<uint32_t> mask_offsets;
419 uint32_t cumulative = 0;
420 for (const auto &sz : geometry.mask_sizes) {
421 mask_offsets.push_back(cumulative);
422 cumulative += static_cast<uint32_t>(sz.x) * static_cast<uint32_t>(sz.y);
423 }
424
425 // Convert vector<bool> to flat uint8 array (1=opaque, 0=transparent)
426 std::vector<uint8_t> mask_data_u8(cumulative);
427 for (uint32_t i = 0; i < cumulative; ++i) {
428 mask_data_u8[i] = geometry.mask_data[i] ? 1u : 0u;
429 }
430
431 // Reformat UV data to flat [Nprims * 4] array (4 UV vertices per primitive).
432 // uv_IDs[p] >= 0 flags whether primitive p has custom UV data; all textured
433 // primitives store exactly 4 UV vertices in uv_data (triangles are padded).
434 const size_t Np = geometry.primitive_count;
435 std::vector<helios::vec2> uv_flat(Np * 4, helios::make_vec2(0.f, 0.f));
436 size_t uv_read = 0;
437 for (size_t p = 0; p < Np; ++p) {
438 if (!geometry.uv_IDs.empty() && geometry.uv_IDs[p] >= 0) {
439 for (int v = 0; v < 4 && uv_read < geometry.uv_data.size(); ++v) {
440 uv_flat[p * 4 + v] = geometry.uv_data[uv_read++];
441 }
442 }
443 }
444
445 freeCUdeviceptr(d_mask_data);
446 freeCUdeviceptr(d_mask_offsets);
447 freeCUdeviceptr(d_mask_sizes);
448 freeCUdeviceptr(d_mask_IDs);
449 freeCUdeviceptr(d_uv_data);
450 freeCUdeviceptr(d_uv_IDs);
451
452 upload(d_mask_data, mask_data_u8.data(), mask_data_u8.size() * sizeof(uint8_t));
453 upload(d_mask_offsets, mask_offsets.data(), mask_offsets.size() * sizeof(uint32_t));
454 upload(d_mask_sizes, geometry.mask_sizes.data(), geometry.mask_sizes.size() * sizeof(helios::int2));
455 upload(d_mask_IDs, geometry.mask_IDs.data(), geometry.mask_IDs.size() * sizeof(int32_t));
456 upload(d_uv_data, uv_flat.data(), uv_flat.size() * sizeof(helios::vec2));
457 upload(d_uv_IDs, geometry.uv_IDs.data(), geometry.uv_IDs.size() * sizeof(int32_t));
458 }
459
460 // Update h_params device pointers
461 const uint32_t Nprims = static_cast<uint32_t>(geometry.primitive_count);
462 h_params.transform_matrix = reinterpret_cast<float *>(d_transform_matrix);
463 h_params.primitive_type = reinterpret_cast<uint32_t *>(d_primitive_type);
464 h_params.primitive_positions = reinterpret_cast<uint32_t *>(d_primitive_positions);
465 h_params.primitive_uuid = reinterpret_cast<uint32_t *>(d_primitive_uuid_arr);
466 h_params.primitiveID = reinterpret_cast<uint32_t *>(d_primitiveID);
467 h_params.objectID = reinterpret_cast<uint32_t *>(d_objectID);
468 h_params.object_subdivisions = reinterpret_cast<int32_t *>(d_object_subdivisions);
469 h_params.twosided_flag = reinterpret_cast<int8_t *>(d_twosided_flag);
470 h_params.primitive_solid_fraction = reinterpret_cast<float *>(d_primitive_solid_fraction);
471 h_params.patch_vertices = reinterpret_cast<float3 *>(d_patch_vertices);
472 h_params.patch_UUIDs = reinterpret_cast<uint32_t *>(d_patch_UUIDs);
473 h_params.triangle_vertices = reinterpret_cast<float3 *>(d_triangle_vertices);
474 h_params.triangle_UUIDs = reinterpret_cast<uint32_t *>(d_triangle_UUIDs);
475 h_params.disk_centers = reinterpret_cast<float3 *>(d_disk_centers);
476 h_params.disk_radii = reinterpret_cast<float *>(d_disk_radii);
477 h_params.disk_normals = reinterpret_cast<float3 *>(d_disk_normals);
478 h_params.disk_UUIDs = reinterpret_cast<uint32_t *>(d_disk_UUIDs);
479 h_params.tile_vertices = reinterpret_cast<float3 *>(d_tile_vertices);
480 h_params.tile_UUIDs = reinterpret_cast<uint32_t *>(d_tile_UUIDs);
481 h_params.voxel_vertices = reinterpret_cast<float3 *>(d_voxel_vertices);
482 h_params.voxel_UUIDs = reinterpret_cast<uint32_t *>(d_voxel_UUIDs);
483 h_params.bbox_vertices = reinterpret_cast<float3 *>(d_bbox_vertices);
484 h_params.bbox_UUIDs = reinterpret_cast<uint32_t *>(d_bbox_UUIDs);
485 h_params.Nprimitives = Nprims;
486 h_params.bbox_UUID_base = geometry.bbox_UUID_base;
487 h_params.periodic_flag = make_float2(geometry.periodic_flag.x, geometry.periodic_flag.y);
488 h_params.mask_data = reinterpret_cast<uint8_t *>(d_mask_data);
489 h_params.mask_offsets = reinterpret_cast<uint32_t *>(d_mask_offsets);
490 h_params.mask_sizes = reinterpret_cast<int32_t *>(d_mask_sizes);
491 h_params.mask_IDs = reinterpret_cast<int32_t *>(d_mask_IDs);
492 h_params.uv_data = reinterpret_cast<float2 *>(d_uv_data);
493 h_params.uv_IDs = reinterpret_cast<int32_t *>(d_uv_IDs);
494
495 // Store counts
496 current_primitive_count = geometry.primitive_count;
497 current_patch_count = geometry.patch_count;
498 current_triangle_count = geometry.triangle_count;
499 current_disk_count = geometry.disk_count;
500 current_tile_count = geometry.tile_count;
501 current_voxel_count = geometry.voxel_count;
502 current_bbox_count = geometry.bbox_count;
503
504 buildAABBs(geometry);
505}
506
508 if (current_primitive_count == 0) {
509 helios_runtime_error("ERROR (OptiX8Backend::buildAccelerationStructure): No geometry uploaded. Call updateGeometry() first.");
510 }
511 buildGAS(static_cast<uint32_t>(current_primitive_count + current_bbox_count));
512 buildSBT();
513}
514
515// ---------------------------------------------------------------------------
516// Materials
517// ---------------------------------------------------------------------------
518
520 freeMaterialBuffers();
521
522 auto upload = [this](CUdeviceptr &d_ptr, const void *src, size_t bytes) {
523 if (bytes > 0 && src) {
524 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_ptr), bytes));
525 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
526 }
527 };
528
529 // rho/tau: always allocate at least Nprims*Nbands elements (zeroed) to prevent null pointer
530 // dereference in device code when Nsources==0 (diffuse-only scenarios).
531 {
532 const size_t Nprims = current_primitive_count;
533 const size_t Nbands = materials.num_bands;
534 const size_t alloc = std::max(materials.reflectivity.size(), std::max(Nprims * Nbands, (size_t)1));
535 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_rho), alloc * sizeof(float)));
536 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_rho), 0, alloc * sizeof(float)));
537 if (!materials.reflectivity.empty()) {
538 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_rho), materials.reflectivity.data(),
539 materials.reflectivity.size() * sizeof(float), cudaMemcpyHostToDevice));
540 }
541 }
542 {
543 const size_t Nprims = current_primitive_count;
544 const size_t Nbands = materials.num_bands;
545 const size_t alloc = std::max(materials.transmissivity.size(), std::max(Nprims * Nbands, (size_t)1));
546 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_tau), alloc * sizeof(float)));
547 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_tau), 0, alloc * sizeof(float)));
548 if (!materials.transmissivity.empty()) {
549 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_tau), materials.transmissivity.data(),
550 materials.transmissivity.size() * sizeof(float), cudaMemcpyHostToDevice));
551 }
552 }
553 if (!materials.reflectivity_cam.empty())
554 upload(d_rho_cam, materials.reflectivity_cam.data(), materials.reflectivity_cam.size() * sizeof(float));
555 if (!materials.transmissivity_cam.empty())
556 upload(d_tau_cam, materials.transmissivity_cam.data(), materials.transmissivity_cam.size() * sizeof(float));
557 if (!materials.specular_exponent.empty())
558 upload(d_specular_exponent, materials.specular_exponent.data(), materials.specular_exponent.size() * sizeof(float));
559 if (!materials.specular_scale.empty())
560 upload(d_specular_scale, materials.specular_scale.data(), materials.specular_scale.size() * sizeof(float));
561
562 // Allocate radiation energy buffers (Nprims × Nbands_global)
563 const size_t Nprims = current_primitive_count;
564 const size_t Nbands = materials.num_bands;
565 const size_t rad_bytes = Nprims * Nbands * sizeof(float);
566
567 reallocDevice(d_radiation_in, rad_bytes);
568 reallocDevice(d_radiation_out_top, rad_bytes);
569 reallocDevice(d_radiation_out_bottom, rad_bytes);
570 reallocDevice(d_scatter_buff_top, rad_bytes);
571 reallocDevice(d_scatter_buff_bottom, rad_bytes);
572
573 current_band_count = Nbands;
574 current_source_count = materials.num_sources;
575 current_camera_count = materials.num_cameras;
576
577 // Update h_params
578 h_params.rho = reinterpret_cast<float *>(d_rho);
579 h_params.tau = reinterpret_cast<float *>(d_tau);
580 h_params.rho_cam = reinterpret_cast<float *>(d_rho_cam);
581 h_params.tau_cam = reinterpret_cast<float *>(d_tau_cam);
582 h_params.specular_exponent = reinterpret_cast<float *>(d_specular_exponent);
583 h_params.specular_scale = reinterpret_cast<float *>(d_specular_scale);
584 h_params.radiation_in = reinterpret_cast<float *>(d_radiation_in);
585 h_params.radiation_out_top = reinterpret_cast<float *>(d_radiation_out_top);
586 h_params.radiation_out_bottom = reinterpret_cast<float *>(d_radiation_out_bottom);
587 h_params.scatter_buff_top = reinterpret_cast<float *>(d_scatter_buff_top);
588 h_params.scatter_buff_bottom = reinterpret_cast<float *>(d_scatter_buff_bottom);
589 h_params.Nsources = static_cast<uint32_t>(materials.num_sources);
590 h_params.Ncameras = static_cast<uint32_t>(materials.num_cameras);
591 h_params.Nbands_global = static_cast<uint32_t>(Nbands);
592}
593
594// ---------------------------------------------------------------------------
595// Sources
596// ---------------------------------------------------------------------------
597
598void OptiX8Backend::updateSources(const std::vector<RayTracingSource> &sources) {
599 auto freePtr = [this](CUdeviceptr &ptr) { freeCUdeviceptr(ptr); };
600 freePtr(d_source_positions);
601 freePtr(d_source_rotations);
602 freePtr(d_source_widths);
603 freePtr(d_source_types);
604 // d_source_fluxes is managed by uploadSourceFluxes()
605
606 const size_t Nsources = sources.size();
607 if (Nsources == 0) {
608 current_source_count = 0;
609 h_params.Nsources = 0;
610 return;
611 }
612
613 std::vector<float3> positions(Nsources);
614 std::vector<float3> rotations(Nsources);
615 std::vector<float2> widths(Nsources);
616 std::vector<uint32_t> types(Nsources);
617
618 for (size_t i = 0; i < Nsources; i++) {
619 positions[i] = make_float3(sources[i].position.x, sources[i].position.y, sources[i].position.z);
620 rotations[i] = make_float3(sources[i].rotation.x, sources[i].rotation.y, sources[i].rotation.z);
621 widths[i] = make_float2(sources[i].width.x, sources[i].width.y);
622 types[i] = sources[i].type;
623 }
624
625 auto upload = [this](CUdeviceptr &d_ptr, const void *src, size_t bytes) {
626 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_ptr), bytes));
627 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
628 };
629
630 upload(d_source_positions, positions.data(), Nsources * sizeof(float3));
631 upload(d_source_rotations, rotations.data(), Nsources * sizeof(float3));
632 upload(d_source_widths, widths.data(), Nsources * sizeof(float2));
633 upload(d_source_types, types.data(), Nsources * sizeof(uint32_t));
634
635 // Upload camera-weighted source fluxes (full 3D buffer [source][band][camera])
636 // This is needed during direct ray tracing for specular accumulation in __miss__direct.
637 // Layout matches OptiX 6: flattened as [src0_band0_cam0, src0_band0_cam1, ..., src0_band1_cam0, ...]
638 freeCUdeviceptr(d_source_fluxes_cam);
639 std::vector<float> fluxes_cam;
640 for (size_t i = 0; i < Nsources; i++) {
641 for (float f : sources[i].fluxes_cam) {
642 fluxes_cam.push_back(f);
643 }
644 }
645 if (!fluxes_cam.empty()) {
646 upload(d_source_fluxes_cam, fluxes_cam.data(), fluxes_cam.size() * sizeof(float));
647 h_params.source_fluxes_cam = reinterpret_cast<float *>(d_source_fluxes_cam);
648 }
649
650 current_source_count = Nsources;
651
652 h_params.Nsources = static_cast<uint32_t>(Nsources);
653 h_params.source_positions = reinterpret_cast<float3 *>(d_source_positions);
654 h_params.source_rotations = reinterpret_cast<float3 *>(d_source_rotations);
655 h_params.source_widths = reinterpret_cast<float2 *>(d_source_widths);
656 h_params.source_types = reinterpret_cast<uint32_t *>(d_source_types);
657}
658
659// ---------------------------------------------------------------------------
660// Diffuse / sky
661// ---------------------------------------------------------------------------
662
663void OptiX8Backend::updateDiffuseRadiation(const std::vector<float> &flux, const std::vector<float> &extinction,
664 const std::vector<helios::vec3> &peak_dir,
665 const std::vector<float> &dist_norm,
666 const std::vector<float> &sky_energy) {
667 freeCUdeviceptr(d_diffuse_flux);
668 freeCUdeviceptr(d_diffuse_extinction);
669 freeCUdeviceptr(d_diffuse_peak_dir);
670 freeCUdeviceptr(d_diffuse_dist_norm);
671 freeCUdeviceptr(d_Rsky);
672
673 auto upload_f = [this](CUdeviceptr &ptr, const std::vector<float> &v) {
674 if (!v.empty()) {
675 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&ptr), v.size() * sizeof(float)));
676 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(ptr), v.data(),
677 v.size() * sizeof(float), cudaMemcpyHostToDevice));
678 }
679 };
680 upload_f(d_diffuse_flux, flux);
681 upload_f(d_diffuse_extinction, extinction);
682 upload_f(d_diffuse_dist_norm, dist_norm);
683 upload_f(d_Rsky, sky_energy);
684
685 if (!peak_dir.empty()) {
686 std::vector<float3> pd_f3(peak_dir.size());
687 for (size_t i = 0; i < peak_dir.size(); i++) {
688 pd_f3[i] = make_float3(peak_dir[i].x, peak_dir[i].y, peak_dir[i].z);
689 }
690 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_diffuse_peak_dir),
691 pd_f3.size() * sizeof(float3)));
692 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_diffuse_peak_dir), pd_f3.data(),
693 pd_f3.size() * sizeof(float3), cudaMemcpyHostToDevice));
694 }
695
696 h_params.diffuse_flux = reinterpret_cast<float *>(d_diffuse_flux);
697 h_params.diffuse_extinction = reinterpret_cast<float *>(d_diffuse_extinction);
698 h_params.diffuse_peak_dir = reinterpret_cast<float3 *>(d_diffuse_peak_dir);
699 h_params.diffuse_dist_norm = reinterpret_cast<float *>(d_diffuse_dist_norm);
700 h_params.Rsky = reinterpret_cast<float *>(d_Rsky);
701}
702
703void OptiX8Backend::updateSkyModel(const std::vector<helios::vec4> &sky_radiance_params,
704 const std::vector<float> &camera_sky_radiance,
705 const helios::vec3 &sun_direction,
706 const std::vector<float> &solar_disk_radiance,
707 float solar_disk_cos_angle) {
708 // Upload sky_radiance_params (helios::vec4 → float4)
709 freeCUdeviceptr(d_sky_radiance_params);
710 if (!sky_radiance_params.empty()) {
711 std::vector<float4> f4(sky_radiance_params.size());
712 for (size_t i = 0; i < sky_radiance_params.size(); i++) {
713 f4[i] = make_float4(sky_radiance_params[i].x, sky_radiance_params[i].y,
714 sky_radiance_params[i].z, sky_radiance_params[i].w);
715 }
716 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_sky_radiance_params), f4.size() * sizeof(float4)));
717 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_sky_radiance_params), f4.data(),
718 f4.size() * sizeof(float4), cudaMemcpyHostToDevice));
719 }
720
721 freeCUdeviceptr(d_camera_sky_radiance);
722 if (!camera_sky_radiance.empty()) {
723 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_camera_sky_radiance),
724 camera_sky_radiance.size() * sizeof(float)));
725 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_camera_sky_radiance), camera_sky_radiance.data(),
726 camera_sky_radiance.size() * sizeof(float), cudaMemcpyHostToDevice));
727 }
728
729 freeCUdeviceptr(d_solar_disk_radiance);
730 if (!solar_disk_radiance.empty()) {
731 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_solar_disk_radiance),
732 solar_disk_radiance.size() * sizeof(float)));
733 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_solar_disk_radiance), solar_disk_radiance.data(),
734 solar_disk_radiance.size() * sizeof(float), cudaMemcpyHostToDevice));
735 }
736
737 h_params.sky_radiance_params = reinterpret_cast<float4 *>(d_sky_radiance_params);
738 h_params.camera_sky_radiance = reinterpret_cast<float *>(d_camera_sky_radiance);
739 h_params.solar_disk_radiance = reinterpret_cast<float *>(d_solar_disk_radiance);
740 h_params.sun_direction = make_float3(sun_direction.x, sun_direction.y, sun_direction.z);
741 h_params.solar_disk_cos_angle = solar_disk_cos_angle;
742}
743
744// ---------------------------------------------------------------------------
745// Ray launching
746// ---------------------------------------------------------------------------
747
749 if (!is_initialized) {
750 helios_runtime_error("ERROR (OptiX8Backend::launchDirectRays): Backend not initialized.");
751 }
752 if (gas_handle == 0) {
753 helios_runtime_error("ERROR (OptiX8Backend::launchDirectRays): No acceleration structure. Call buildAccelerationStructure() first.");
754 }
755
756 applyLaunchParams(launch_params);
757
758 // Upload band_launch_flag (vector<bool> → device bool array)
759 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
760 const size_t Nbands_g = launch_params.band_launch_flag.size();
761 if (Nbands_g > 0) {
762 std::vector<uint8_t> flags_u8(Nbands_g);
763 for (size_t i = 0; i < Nbands_g; i++) {
764 flags_u8[i] = launch_params.band_launch_flag[i] ? 1u : 0u;
765 }
766 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_band_launch_flag), Nbands_g * sizeof(uint8_t)));
767 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_band_launch_flag), flags_u8.data(),
768 Nbands_g * sizeof(uint8_t), cudaMemcpyHostToDevice));
769 }
770 h_params.band_launch_flag = reinterpret_cast<bool *>(d_band_launch_flag);
771 h_params.traversable = gas_handle;
772
773 // 2D stratification: split rays_per_primitive into dim_x × dim_y grid (same as diffuse)
774 // rays_per_primitive is always n*n (RadiationModel sets it as ceil(sqrt(N))^2)
775 const uint32_t launch_count = launch_params.launch_count;
776 const uint32_t rpp = launch_params.rays_per_primitive;
777 const uint32_t dim_x = static_cast<uint32_t>(sqrtf(static_cast<float>(rpp)));
778 const uint32_t dim_y = (dim_x > 0) ? (rpp / dim_x) : 1u;
779 h_params.launch_dim_x = dim_x;
780 h_params.launch_dim_y = dim_y;
781
782 // prd_pool is unused (PRD allocated on thread stack in raygen)
783 h_params.prd_pool = nullptr;
784
785 // Direct launch uses raygen record 0 (d_raygen_records + 0)
786 OptixShaderBindingTable direct_sbt = sbt;
787 direct_sbt.raygenRecord = d_raygen_records;
788
789 // OptiX depth dimension is limited to 65535; batch if needed
790 const uint32_t MAX_DEPTH = 65535u;
791 uint32_t offset = launch_params.launch_offset;
792 uint32_t remaining = launch_count;
793 while (remaining > 0) {
794 const uint32_t batch = std::min(remaining, MAX_DEPTH);
795 h_params.launch_offset = offset;
796 h_params.launch_count = batch;
797 uploadLaunchParams();
798 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
799 sizeof(OptiX8LaunchParams), &direct_sbt,
800 dim_x, dim_y, batch));
801 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
802 offset += batch;
803 remaining -= batch;
804 }
805}
806
808 if (!is_initialized) {
809 helios_runtime_error("ERROR (OptiX8Backend::launchDiffuseRays): Backend not initialized.");
810 }
811 if (gas_handle == 0) {
812 helios_runtime_error("ERROR (OptiX8Backend::launchDiffuseRays): No acceleration structure. "
813 "Call buildAccelerationStructure() first.");
814 }
815
816 applyLaunchParams(launch_params);
817
818 // Upload radiation_out buffers (emission + scattered energy from previous iteration).
819 // This must happen here because RadiationModel adds emission to flux_top/bottom AFTER
820 // calling uploadRadiationOut() for the direct-ray scatter, so the params always carry
821 // the most up-to-date radiation_out data.
822 if (!launch_params.radiation_out_top.empty() && d_radiation_out_top) {
823 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_top),
824 launch_params.radiation_out_top.data(),
825 launch_params.radiation_out_top.size() * sizeof(float),
826 cudaMemcpyHostToDevice));
827 }
828 if (!launch_params.radiation_out_bottom.empty() && d_radiation_out_bottom) {
829 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_bottom),
830 launch_params.radiation_out_bottom.data(),
831 launch_params.radiation_out_bottom.size() * sizeof(float),
832 cudaMemcpyHostToDevice));
833 }
834
835 // Upload band_launch_flag
836 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
837 const size_t Nbands_g = launch_params.band_launch_flag.size();
838 if (Nbands_g > 0) {
839 std::vector<uint8_t> flags_u8(Nbands_g);
840 for (size_t i = 0; i < Nbands_g; i++) {
841 flags_u8[i] = launch_params.band_launch_flag[i] ? 1u : 0u;
842 }
843 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_band_launch_flag),
844 Nbands_g * sizeof(uint8_t)));
845 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_band_launch_flag), flags_u8.data(),
846 Nbands_g * sizeof(uint8_t), cudaMemcpyHostToDevice));
847 }
848 h_params.band_launch_flag = reinterpret_cast<bool *>(d_band_launch_flag);
849 h_params.traversable = gas_handle;
850
851 // Upload diffuse params from launch_params (re-upload each launch since params may vary)
852 freeCUdeviceptr(d_diffuse_flux);
853 freeCUdeviceptr(d_diffuse_extinction);
854 freeCUdeviceptr(d_diffuse_peak_dir);
855 freeCUdeviceptr(d_diffuse_dist_norm);
856 freeCUdeviceptr(d_sky_radiance_params);
857
858 auto upload_f = [this](CUdeviceptr &ptr, const std::vector<float> &v) {
859 if (!v.empty()) {
860 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&ptr), v.size() * sizeof(float)));
861 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(ptr), v.data(),
862 v.size() * sizeof(float), cudaMemcpyHostToDevice));
863 }
864 };
865 upload_f(d_diffuse_flux, launch_params.diffuse_flux);
866 upload_f(d_diffuse_extinction, launch_params.diffuse_extinction);
867 upload_f(d_diffuse_dist_norm, launch_params.diffuse_dist_norm);
868
869 if (!launch_params.diffuse_peak_dir.empty()) {
870 std::vector<float3> pd_f3(launch_params.diffuse_peak_dir.size());
871 for (size_t i = 0; i < launch_params.diffuse_peak_dir.size(); i++) {
872 pd_f3[i] = make_float3(launch_params.diffuse_peak_dir[i].x,
873 launch_params.diffuse_peak_dir[i].y,
874 launch_params.diffuse_peak_dir[i].z);
875 }
876 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_diffuse_peak_dir),
877 pd_f3.size() * sizeof(float3)));
878 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_diffuse_peak_dir), pd_f3.data(),
879 pd_f3.size() * sizeof(float3), cudaMemcpyHostToDevice));
880 }
881
882 if (!launch_params.sky_radiance_params.empty()) {
883 std::vector<float4> sky_f4(launch_params.sky_radiance_params.size());
884 for (size_t i = 0; i < launch_params.sky_radiance_params.size(); i++) {
885 sky_f4[i] = make_float4(launch_params.sky_radiance_params[i].x,
886 launch_params.sky_radiance_params[i].y,
887 launch_params.sky_radiance_params[i].z,
888 launch_params.sky_radiance_params[i].w);
889 }
890 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_sky_radiance_params),
891 sky_f4.size() * sizeof(float4)));
892 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_sky_radiance_params), sky_f4.data(),
893 sky_f4.size() * sizeof(float4), cudaMemcpyHostToDevice));
894 }
895
896 h_params.diffuse_flux = reinterpret_cast<float *>(d_diffuse_flux);
897 h_params.diffuse_extinction = reinterpret_cast<float *>(d_diffuse_extinction);
898 h_params.diffuse_peak_dir = reinterpret_cast<float3 *>(d_diffuse_peak_dir);
899 h_params.diffuse_dist_norm = reinterpret_cast<float *>(d_diffuse_dist_norm);
900 h_params.sky_radiance_params = reinterpret_cast<float4 *>(d_sky_radiance_params);
901
902 // Early return when there are no rays to launch (e.g. diffuseRayCount=0 during scattering)
903 if (launch_params.rays_per_primitive == 0 || launch_params.launch_count == 0) {
904 return;
905 }
906
907 // Set up 2D stratification launch dimensions
908 // rays_per_primitive is always n*n (RadiationModel sets it as ceil(sqrt(N))^2)
909 const uint32_t rpp = launch_params.rays_per_primitive;
910 const uint32_t dim_x = static_cast<uint32_t>(sqrtf(static_cast<float>(rpp)));
911 const uint32_t dim_y = (dim_x > 0) ? (rpp / dim_x) : 1u;
912 h_params.launch_dim_x = dim_x;
913 h_params.launch_dim_y = dim_y;
914 h_params.prd_pool = nullptr;
915
916 // Use diffuse raygen record (index 1 in the raygen records array)
917 OptixShaderBindingTable diffuse_sbt = sbt;
918 diffuse_sbt.raygenRecord = d_raygen_record_diffuse;
919
920 // OptiX depth dimension is limited to 65535; batch if needed
921 const uint32_t MAX_DEPTH = 65535u;
922 const uint32_t launch_count = launch_params.launch_count;
923 uint32_t offset = launch_params.launch_offset;
924 uint32_t remaining = launch_count;
925 while (remaining > 0) {
926 const uint32_t batch = std::min(remaining, MAX_DEPTH);
927 h_params.launch_offset = offset;
928 h_params.launch_count = batch;
929 uploadLaunchParams();
930 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
931 sizeof(OptiX8LaunchParams), &diffuse_sbt,
932 dim_x, dim_y, batch));
933 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
934 offset += batch;
935 remaining -= batch;
936 }
937}
938
940 if (!is_initialized) {
941 helios_runtime_error("ERROR (OptiX8Backend::launchCameraRays): Backend not initialized.");
942 }
943 if (gas_handle == 0) {
944 helios_runtime_error("ERROR (OptiX8Backend::launchCameraRays): No acceleration structure. "
945 "Call buildAccelerationStructure() first.");
946 }
947
948 applyLaunchParams(launch_params);
949
950 const uint32_t tile_w = launch_params.camera_resolution.x;
951 const uint32_t tile_h = launch_params.camera_resolution.y;
952 const uint32_t full_w = launch_params.camera_resolution_full.x;
953 const uint32_t full_h = launch_params.camera_resolution_full.y;
954 const uint32_t anti_samples = launch_params.antialiasing_samples;
955 const uint32_t Nbands_l = launch_params.num_bands_launch;
956 const uint32_t cam_id = launch_params.camera_id;
957
958 // Allocate/zero radiation_in_camera when starting a new camera or when band count changes.
959 // Multiple tiles for the same camera accumulate into the same buffer without re-zeroing.
960 const size_t Npixels = (size_t)full_w * full_h;
961 const size_t cam_bytes = Npixels * Nbands_l * sizeof(float);
962 if (cam_id != current_camera_launch_id || current_launch_band_count != Nbands_l) {
963 reallocDevice(d_radiation_in_camera, cam_bytes);
964 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_radiation_in_camera), 0, cam_bytes));
965 // Free pixel label/depth buffers from previous camera so getCameraResults skips them
966 // until zeroCameraPixelBuffers() is called for the new camera.
967 freeCUdeviceptr(d_camera_pixel_label);
968 freeCUdeviceptr(d_camera_pixel_depth);
969 h_params.camera_pixel_label = nullptr;
970 h_params.camera_pixel_depth = nullptr;
971 current_camera_launch_id = cam_id;
972 current_launch_band_count = Nbands_l;
973 }
974 h_params.radiation_in_camera = reinterpret_cast<float *>(d_radiation_in_camera);
975
976 // Upload band_launch_flag
977 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
978 const size_t Nbands_g = launch_params.band_launch_flag.size();
979 if (Nbands_g > 0) {
980 std::vector<uint8_t> flags_u8(Nbands_g);
981 for (size_t i = 0; i < Nbands_g; i++) {
982 flags_u8[i] = launch_params.band_launch_flag[i] ? 1u : 0u;
983 }
984 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_band_launch_flag), Nbands_g * sizeof(uint8_t)));
985 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_band_launch_flag), flags_u8.data(),
986 Nbands_g * sizeof(uint8_t), cudaMemcpyHostToDevice));
987 }
988 h_params.band_launch_flag = reinterpret_cast<bool *>(d_band_launch_flag);
989 h_params.traversable = gas_handle;
990
991 // Camera launch: x=antialiasing_samples, y=tile_width, z=tile_height
992 h_params.launch_dim_x = anti_samples;
993 h_params.launch_dim_y = tile_w;
994
995 uploadLaunchParams();
996
997 OptixShaderBindingTable camera_sbt = sbt;
998 camera_sbt.raygenRecord = d_raygen_record_camera;
999
1000 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
1001 sizeof(OptiX8LaunchParams), &camera_sbt,
1002 anti_samples, tile_w, tile_h));
1003 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1004}
1005
1007 if (!is_initialized) {
1008 helios_runtime_error("ERROR (OptiX8Backend::launchPixelLabelRays): Backend not initialized.");
1009 }
1010 if (gas_handle == 0) {
1011 helios_runtime_error("ERROR (OptiX8Backend::launchPixelLabelRays): No acceleration structure. "
1012 "Call buildAccelerationStructure() first.");
1013 }
1014
1015 applyLaunchParams(launch_params);
1016
1017 const uint32_t tile_w = launch_params.camera_resolution.x;
1018 const uint32_t tile_h = launch_params.camera_resolution.y;
1019
1020 // Upload band_launch_flag
1021 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
1022 const size_t Nbands_g = launch_params.band_launch_flag.size();
1023 if (Nbands_g > 0) {
1024 std::vector<uint8_t> flags_u8(Nbands_g);
1025 for (size_t i = 0; i < Nbands_g; i++) {
1026 flags_u8[i] = launch_params.band_launch_flag[i] ? 1u : 0u;
1027 }
1028 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_band_launch_flag), Nbands_g * sizeof(uint8_t)));
1029 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_band_launch_flag), flags_u8.data(),
1030 Nbands_g * sizeof(uint8_t), cudaMemcpyHostToDevice));
1031 }
1032 h_params.band_launch_flag = reinterpret_cast<bool *>(d_band_launch_flag);
1033 h_params.traversable = gas_handle;
1034
1035 // Pixel label launch: 1 ray per pixel center, no antialiasing
1036 h_params.launch_dim_x = 1u;
1037 h_params.launch_dim_y = tile_w;
1038
1039 uploadLaunchParams();
1040
1041 OptixShaderBindingTable pixel_label_sbt = sbt;
1042 pixel_label_sbt.raygenRecord = d_raygen_record_pixel_label;
1043
1044 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
1045 sizeof(OptiX8LaunchParams), &pixel_label_sbt,
1046 1u, tile_w, tile_h));
1047 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1048}
1049
1050// ---------------------------------------------------------------------------
1051// Results retrieval
1052// ---------------------------------------------------------------------------
1053
1055 const size_t Nprims = current_primitive_count;
1056 const size_t Nbands = current_band_count;
1057 const size_t total = Nprims * Nbands;
1058
1059 results.num_primitives = Nprims;
1060 results.num_bands = Nbands;
1061 results.num_sources = current_source_count;
1062 results.num_cameras = current_camera_count;
1063
1064 if (total > 0) {
1065 if (d_radiation_in) results.radiation_in = downloadFloat(d_radiation_in, total);
1066 if (d_radiation_out_top) results.radiation_out_top = downloadFloat(d_radiation_out_top, total);
1067 if (d_radiation_out_bottom) results.radiation_out_bottom = downloadFloat(d_radiation_out_bottom, total);
1068 if (d_scatter_buff_top) results.scatter_buff_top = downloadFloat(d_scatter_buff_top, total);
1069 if (d_scatter_buff_bottom) results.scatter_buff_bottom = downloadFloat(d_scatter_buff_bottom, total);
1070 }
1071
1072 // Camera scatter buffers: sized Nprims × Nbands_launch (may differ from Nbands_global)
1073 if (Nprims > 0 && current_launch_band_count > 0) {
1074 const size_t cam_total = Nprims * current_launch_band_count;
1075 if (d_scatter_buff_top_cam) results.scatter_buff_top_cam = downloadFloat(d_scatter_buff_top_cam, cam_total);
1076 if (d_scatter_buff_bottom_cam) results.scatter_buff_bottom_cam = downloadFloat(d_scatter_buff_bottom_cam, cam_total);
1077 }
1078}
1079
1080void OptiX8Backend::getCameraResults(std::vector<float> &pixel_data, std::vector<uint> &pixel_labels,
1081 std::vector<float> &pixel_depths, uint camera_id,
1082 const helios::int2 &resolution) {
1083 const size_t Npixels = (size_t)resolution.x * resolution.y;
1084
1085 if (d_radiation_in_camera && Npixels > 0 && current_launch_band_count > 0) {
1086 pixel_data = downloadFloat(d_radiation_in_camera, Npixels * current_launch_band_count);
1087 }
1088
1089 if (d_camera_pixel_label && Npixels > 0) {
1090 auto labels_u32 = downloadUInt32(d_camera_pixel_label, Npixels);
1091 pixel_labels.assign(labels_u32.begin(), labels_u32.end());
1092 }
1093
1094 if (d_camera_pixel_depth && Npixels > 0) {
1095 pixel_depths = downloadFloat(d_camera_pixel_depth, Npixels);
1096 }
1097}
1098
1099// ---------------------------------------------------------------------------
1100// Buffer management utilities
1101// ---------------------------------------------------------------------------
1102
1103void OptiX8Backend::zeroRadiationBuffers(size_t launch_band_count) {
1104 const size_t Nprims = current_primitive_count;
1105 if (Nprims == 0 || launch_band_count == 0) return;
1106
1107 if (launch_band_count > current_band_count) {
1108 helios_runtime_error("ERROR (OptiX8Backend::zeroRadiationBuffers): launch_band_count (" +
1109 std::to_string(launch_band_count) + ") exceeds current_band_count (" +
1110 std::to_string(current_band_count) + "). Call updateMaterials() first.");
1111 }
1112
1113 // Reset camera launch ID so camera pixel buffers get re-zeroed in launchCameraRays()
1114 current_camera_launch_id = 0xFFFFFFFFu;
1115
1116 const size_t bytes = Nprims * launch_band_count * sizeof(float);
1117
1118 if (d_radiation_in) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_radiation_in), 0, bytes));
1119 if (d_radiation_out_top) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_radiation_out_top), 0, bytes));
1120 if (d_radiation_out_bottom) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_radiation_out_bottom), 0, bytes));
1121 if (d_scatter_buff_top) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_top), 0, bytes));
1122 if (d_scatter_buff_bottom) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_bottom), 0, bytes));
1123
1124 // Zero specular buffer: [source × camera × primitive × launch_band_count]
1125 const size_t specular_size = current_source_count * current_camera_count * Nprims * launch_band_count;
1126 if (specular_size > 0) {
1127 const size_t specular_bytes = specular_size * sizeof(float);
1128 reallocDevice(d_radiation_specular, specular_bytes);
1129 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_radiation_specular), 0, specular_bytes));
1130 h_params.radiation_specular = reinterpret_cast<float *>(d_radiation_specular);
1131 }
1132}
1133
1135 const size_t total_bytes = current_primitive_count * current_band_count * sizeof(float);
1136 if (total_bytes == 0) return;
1137
1138 if (d_scatter_buff_top) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_top), 0, total_bytes));
1139 if (d_scatter_buff_bottom) CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_bottom), 0, total_bytes));
1140}
1141
1143 const size_t Npixels = (size_t)resolution.x * resolution.y;
1144 if (Npixels == 0) return;
1145
1146 reallocDevice(d_camera_pixel_label, Npixels * sizeof(uint32_t));
1147 reallocDevice(d_camera_pixel_depth, Npixels * sizeof(float));
1148 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_camera_pixel_label), 0, Npixels * sizeof(uint32_t)));
1149 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_camera_pixel_depth), 0, Npixels * sizeof(float)));
1150
1151 h_params.camera_pixel_label = reinterpret_cast<uint32_t *>(d_camera_pixel_label);
1152 h_params.camera_pixel_depth = reinterpret_cast<float *>(d_camera_pixel_depth);
1153}
1154
1156 const size_t total = current_primitive_count * current_band_count * sizeof(float);
1157 if (total == 0) return;
1158 if (d_scatter_buff_top && d_radiation_out_top) {
1159 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_top),
1160 reinterpret_cast<const void *>(d_scatter_buff_top),
1161 total, cudaMemcpyDeviceToDevice));
1162 }
1163 if (d_scatter_buff_bottom && d_radiation_out_bottom) {
1164 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_bottom),
1165 reinterpret_cast<const void *>(d_scatter_buff_bottom),
1166 total, cudaMemcpyDeviceToDevice));
1167 }
1168}
1169
1170void OptiX8Backend::uploadRadiationOut(const std::vector<float> &radiation_out_top,
1171 const std::vector<float> &radiation_out_bottom) {
1172 if (!radiation_out_top.empty() && d_radiation_out_top) {
1173 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_top),
1174 radiation_out_top.data(),
1175 radiation_out_top.size() * sizeof(float),
1176 cudaMemcpyHostToDevice));
1177 }
1178 if (!radiation_out_bottom.empty() && d_radiation_out_bottom) {
1179 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_radiation_out_bottom),
1180 radiation_out_bottom.data(),
1181 radiation_out_bottom.size() * sizeof(float),
1182 cudaMemcpyHostToDevice));
1183 }
1184}
1185
1186void OptiX8Backend::uploadCameraScatterBuffers(const std::vector<float> &scatter_top_cam,
1187 const std::vector<float> &scatter_bottom_cam) {
1188 if (!scatter_top_cam.empty() && d_scatter_buff_top_cam) {
1189 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_scatter_buff_top_cam),
1190 scatter_top_cam.data(),
1191 scatter_top_cam.size() * sizeof(float),
1192 cudaMemcpyHostToDevice));
1193 }
1194 if (!scatter_bottom_cam.empty() && d_scatter_buff_bottom_cam) {
1195 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_scatter_buff_bottom_cam),
1196 scatter_bottom_cam.data(),
1197 scatter_bottom_cam.size() * sizeof(float),
1198 cudaMemcpyHostToDevice));
1199 }
1200}
1201
1202void OptiX8Backend::zeroCameraScatterBuffers(size_t launch_band_count) {
1203 const size_t Nprims = current_primitive_count;
1204 if (Nprims == 0 || launch_band_count == 0) return;
1205
1206 const size_t bytes = Nprims * launch_band_count * sizeof(float);
1207 reallocDevice(d_scatter_buff_top_cam, bytes);
1208 reallocDevice(d_scatter_buff_bottom_cam, bytes);
1209 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_top_cam), 0, bytes));
1210 CUDA_CHECK(cudaMemset(reinterpret_cast<void *>(d_scatter_buff_bottom_cam), 0, bytes));
1211
1212 h_params.scatter_buff_top_cam = reinterpret_cast<float *>(d_scatter_buff_top_cam);
1213 h_params.scatter_buff_bottom_cam = reinterpret_cast<float *>(d_scatter_buff_bottom_cam);
1214 current_launch_band_count = launch_band_count;
1215}
1216
1217void OptiX8Backend::uploadSourceFluxes(const std::vector<float> &fluxes) {
1218 freeCUdeviceptr(d_source_fluxes);
1219 if (fluxes.empty()) return;
1220
1221 const size_t bytes = fluxes.size() * sizeof(float);
1222 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_source_fluxes), bytes));
1223 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_source_fluxes), fluxes.data(), bytes, cudaMemcpyHostToDevice));
1224 h_params.source_fluxes = reinterpret_cast<float *>(d_source_fluxes);
1225}
1226
1227void OptiX8Backend::uploadSourceFluxesCam(const std::vector<float> &fluxes_cam) {
1228 freeCUdeviceptr(d_source_fluxes_cam);
1229 if (fluxes_cam.empty()) return;
1230
1231 const size_t bytes = fluxes_cam.size() * sizeof(float);
1232 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_source_fluxes_cam), bytes));
1233 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_source_fluxes_cam), fluxes_cam.data(), bytes,
1234 cudaMemcpyHostToDevice));
1235 h_params.source_fluxes_cam = reinterpret_cast<float *>(d_source_fluxes_cam);
1236}
1237
1238// ---------------------------------------------------------------------------
1239// Diagnostics
1240// ---------------------------------------------------------------------------
1241
1243 size_t free_bytes = 0;
1244 size_t total_bytes = 0;
1245 CUDA_CHECK(cudaMemGetInfo(&free_bytes, &total_bytes));
1246 const float free_mb = static_cast<float>(free_bytes) / (1024.0f * 1024.0f);
1247 const float total_mb = static_cast<float>(total_bytes) / (1024.0f * 1024.0f);
1248 std::cout << "GPU memory: " << free_mb << " MB free / " << total_mb << " MB total" << std::endl;
1249}
1250
1251// ---------------------------------------------------------------------------
1252// Private helpers
1253// ---------------------------------------------------------------------------
1254
1255void OptiX8Backend::freeCUdeviceptr(CUdeviceptr &ptr) {
1256 if (ptr) {
1257 cudaFree(reinterpret_cast<void *>(ptr));
1258 ptr = 0;
1259 }
1260}
1261
1262void OptiX8Backend::freeGeometryBuffers() {
1263 freeCUdeviceptr(d_transform_matrix);
1264 freeCUdeviceptr(d_primitive_type);
1265 freeCUdeviceptr(d_primitive_positions);
1266 freeCUdeviceptr(d_primitive_uuid_arr);
1267 freeCUdeviceptr(d_primitiveID);
1268 freeCUdeviceptr(d_objectID);
1269 freeCUdeviceptr(d_object_subdivisions);
1270 freeCUdeviceptr(d_twosided_flag);
1271 freeCUdeviceptr(d_primitive_solid_fraction);
1272 freeCUdeviceptr(d_patch_vertices);
1273 freeCUdeviceptr(d_patch_UUIDs);
1274 freeCUdeviceptr(d_triangle_vertices);
1275 freeCUdeviceptr(d_triangle_UUIDs);
1276 freeCUdeviceptr(d_disk_centers);
1277 freeCUdeviceptr(d_disk_radii);
1278 freeCUdeviceptr(d_disk_normals);
1279 freeCUdeviceptr(d_disk_UUIDs);
1280 freeCUdeviceptr(d_tile_vertices);
1281 freeCUdeviceptr(d_tile_UUIDs);
1282 freeCUdeviceptr(d_voxel_vertices);
1283 freeCUdeviceptr(d_voxel_UUIDs);
1284 freeCUdeviceptr(d_bbox_vertices);
1285 freeCUdeviceptr(d_bbox_UUIDs);
1286 freeCUdeviceptr(d_mask_data);
1287 freeCUdeviceptr(d_mask_offsets);
1288 freeCUdeviceptr(d_mask_sizes);
1289 freeCUdeviceptr(d_mask_IDs);
1290 freeCUdeviceptr(d_uv_data);
1291 freeCUdeviceptr(d_uv_IDs);
1292 freeCUdeviceptr(d_aabbs);
1293 freeCUdeviceptr(d_gas_output);
1294 gas_handle = 0;
1295}
1296
1297void OptiX8Backend::freeMaterialBuffers() {
1298 freeCUdeviceptr(d_rho);
1299 freeCUdeviceptr(d_tau);
1300 freeCUdeviceptr(d_rho_cam);
1301 freeCUdeviceptr(d_tau_cam);
1302 freeCUdeviceptr(d_specular_exponent);
1303 freeCUdeviceptr(d_specular_scale);
1304}
1305
1306void OptiX8Backend::reallocDevice(CUdeviceptr &ptr, size_t bytes) {
1307 freeCUdeviceptr(ptr);
1308 if (bytes > 0) {
1309 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&ptr), bytes));
1310 }
1311}
1312
1313std::vector<float> OptiX8Backend::downloadFloat(CUdeviceptr ptr, size_t count) const {
1314 std::vector<float> result(count);
1315 CUDA_CHECK(cudaMemcpy(result.data(), reinterpret_cast<const void *>(ptr),
1316 count * sizeof(float), cudaMemcpyDeviceToHost));
1317 return result;
1318}
1319
1320std::vector<uint32_t> OptiX8Backend::downloadUInt32(CUdeviceptr ptr, size_t count) const {
1321 std::vector<uint32_t> result(count);
1322 CUDA_CHECK(cudaMemcpy(result.data(), reinterpret_cast<const void *>(ptr),
1323 count * sizeof(uint32_t), cudaMemcpyDeviceToHost));
1324 return result;
1325}
1326
1327void OptiX8Backend::buildAABBs(const RayTracingGeometry &geometry) {
1328 const uint32_t Nprims = static_cast<uint32_t>(geometry.primitive_count);
1329 const uint32_t Nbboxes = static_cast<uint32_t>(geometry.bboxes.UUIDs.size());
1330 const uint32_t Ntotal = Nprims + Nbboxes;
1331 if (Ntotal == 0) return;
1332
1333 std::vector<OptixAabb> aabbs(Ntotal);
1334 const float eps = 1e-5f;
1335
1336 auto store_aabb = [&](uint32_t pos, float mn_x, float mn_y, float mn_z,
1337 float mx_x, float mx_y, float mx_z) {
1338 if (mx_x - mn_x < eps) { mn_x -= eps; mx_x += eps; }
1339 if (mx_y - mn_y < eps) { mn_y -= eps; mx_y += eps; }
1340 if (mx_z - mn_z < eps) { mn_z -= eps; mx_z += eps; }
1341 aabbs[pos] = {mn_x, mn_y, mn_z, mx_x, mx_y, mx_z};
1342 };
1343
1344 // Real primitives — AABB computed via canonical-space vertices + transform matrix
1345 for (uint32_t pos = 0; pos < Nprims; pos++) {
1346 const float *T = &geometry.transform_matrices[pos * 16];
1347 const uint32_t pt = geometry.primitive_types[pos];
1348
1349 float mn_x = FLT_MAX, mn_y = FLT_MAX, mn_z = FLT_MAX;
1350 float mx_x = -FLT_MAX, mx_y = -FLT_MAX, mx_z = -FLT_MAX;
1351
1352 auto expand = [&](float x, float y, float z) {
1353 const float wx = T[0]*x + T[1]*y + T[2]*z + T[3];
1354 const float wy = T[4]*x + T[5]*y + T[6]*z + T[7];
1355 const float wz = T[8]*x + T[9]*y + T[10]*z + T[11];
1356 if (wx < mn_x) mn_x = wx; if (wx > mx_x) mx_x = wx;
1357 if (wy < mn_y) mn_y = wy; if (wy > mx_y) mx_y = wy;
1358 if (wz < mn_z) mn_z = wz; if (wz > mx_z) mx_z = wz;
1359 };
1360
1361 if (pt == 0 || pt == 3) { // Patch or Tile: canonical space [-0.5, 0.5]^2
1362 expand(-0.5f, -0.5f, 0.f); expand( 0.5f, -0.5f, 0.f);
1363 expand(-0.5f, 0.5f, 0.f); expand( 0.5f, 0.5f, 0.f);
1364 } else if (pt == 1) { // Triangle: (0,0,0)-(0,1,0)-(1,1,0)
1365 expand(0.f, 0.f, 0.f); expand(0.f, 1.f, 0.f); expand(1.f, 1.f, 0.f);
1366 } else if (pt == 2) { // Disk: bounding box of unit circle
1367 expand(-0.5f, -0.5f, 0.f); expand( 0.5f, -0.5f, 0.f);
1368 expand(-0.5f, 0.5f, 0.f); expand( 0.5f, 0.5f, 0.f);
1369 } else if (pt == 4) { // Voxel: unit cube [0,1]^3
1370 for (float fx : {0.f, 1.f})
1371 for (float fy : {0.f, 1.f})
1372 for (float fz : {0.f, 1.f})
1373 expand(fx, fy, fz);
1374 } else { // Unknown type: unit box fallback
1375 expand(-0.5f, -0.5f, -0.5f); expand( 0.5f, 0.5f, 0.5f);
1376 }
1377
1378 store_aabb(pos, mn_x, mn_y, mn_z, mx_x, mx_y, mx_z);
1379 }
1380
1381 // Bbox faces: AABB from actual world-space vertices (4 vertices per face)
1382 for (uint32_t b = 0; b < Nbboxes; b++) {
1383 float mn_x = FLT_MAX, mn_y = FLT_MAX, mn_z = FLT_MAX;
1384 float mx_x = -FLT_MAX, mx_y = -FLT_MAX, mx_z = -FLT_MAX;
1385 for (int v = 0; v < 4; v++) {
1386 const helios::vec3 &vtx = geometry.bboxes.vertices[b * 4 + v];
1387 mn_x = std::min(mn_x, vtx.x); mx_x = std::max(mx_x, vtx.x);
1388 mn_y = std::min(mn_y, vtx.y); mx_y = std::max(mx_y, vtx.y);
1389 mn_z = std::min(mn_z, vtx.z); mx_z = std::max(mx_z, vtx.z);
1390 }
1391 store_aabb(Nprims + b, mn_x, mn_y, mn_z, mx_x, mx_y, mx_z);
1392 }
1393
1394 const size_t aabb_bytes = Ntotal * sizeof(OptixAabb);
1395 reallocDevice(d_aabbs, aabb_bytes);
1396 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_aabbs), aabbs.data(), aabb_bytes, cudaMemcpyHostToDevice));
1397}
1398
1399void OptiX8Backend::buildGAS(uint32_t Nprimitives) {
1400 if (d_gas_output) { cudaFree(reinterpret_cast<void *>(d_gas_output)); d_gas_output = 0; }
1401 gas_handle = 0;
1402
1403 if (Nprimitives == 0 || !d_aabbs) return;
1404
1405 const unsigned int build_flags[] = {OPTIX_GEOMETRY_FLAG_NONE};
1406 OptixBuildInputCustomPrimitiveArray ca = {};
1407 ca.aabbBuffers = &d_aabbs;
1408 ca.numPrimitives = Nprimitives;
1409 ca.strideInBytes = sizeof(OptixAabb);
1410 ca.flags = build_flags;
1411 ca.numSbtRecords = 1;
1412
1413 OptixBuildInput bi = {};
1414 bi.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1415 bi.customPrimitiveArray = ca;
1416
1417 OptixAccelBuildOptions opts = {};
1418 opts.buildFlags = OPTIX_BUILD_FLAG_ALLOW_COMPACTION | OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
1419 opts.operation = OPTIX_BUILD_OPERATION_BUILD;
1420
1421 OptixAccelBufferSizes sizes = {};
1422 OPTIX_CHECK(optixAccelComputeMemoryUsage(optix_context, &opts, &bi, 1, &sizes));
1423
1424 CUdeviceptr d_temp = 0, d_pre_compact = 0, d_compact_size = 0;
1425 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_temp), sizes.tempSizeInBytes));
1426 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_pre_compact), sizes.outputSizeInBytes));
1427 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_compact_size), sizeof(uint64_t)));
1428
1429 OptixAccelEmitDesc emit = {};
1430 emit.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1431 emit.result = d_compact_size;
1432
1433 OPTIX_CHECK(optixAccelBuild(optix_context, cuda_stream, &opts, &bi, 1,
1434 d_temp, sizes.tempSizeInBytes,
1435 d_pre_compact, sizes.outputSizeInBytes,
1436 &gas_handle, &emit, 1));
1437 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1438
1439 uint64_t compact_size = 0;
1440 CUDA_CHECK(cudaMemcpy(&compact_size, reinterpret_cast<const void *>(d_compact_size),
1441 sizeof(uint64_t), cudaMemcpyDeviceToHost));
1442
1443 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_gas_output), compact_size));
1444 OPTIX_CHECK(optixAccelCompact(optix_context, cuda_stream, gas_handle, d_gas_output, compact_size, &gas_handle));
1445 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1446
1447 CUDA_CHECK(cudaFree(reinterpret_cast<void *>(d_temp)));
1448 CUDA_CHECK(cudaFree(reinterpret_cast<void *>(d_pre_compact)));
1449 CUDA_CHECK(cudaFree(reinterpret_cast<void *>(d_compact_size)));
1450
1451 h_params.traversable = gas_handle;
1452}
1453
1454void OptiX8Backend::buildSBT() {
1455 if (d_raygen_records) { cudaFree(reinterpret_cast<void *>(d_raygen_records)); d_raygen_records = 0; }
1456 if (d_miss_records) { cudaFree(reinterpret_cast<void *>(d_miss_records)); d_miss_records = 0; }
1457 if (d_hitgroup_records) { cudaFree(reinterpret_cast<void *>(d_hitgroup_records)); d_hitgroup_records = 0; }
1458
1459 // Raygen records: header only (no data), 4 records (direct, diffuse, camera, pixel_label)
1460 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) RaygenRecord {
1461 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1462 };
1463 constexpr int N_rg = 4;
1464 std::vector<RaygenRecord> rg_recs(N_rg);
1465 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_direct, &rg_recs[0]));
1466 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_diffuse, &rg_recs[1]));
1467 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_camera, &rg_recs[2]));
1468 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_pixel_label, &rg_recs[3]));
1469 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_raygen_records), N_rg * sizeof(RaygenRecord)));
1470 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_raygen_records), rg_recs.data(),
1471 N_rg * sizeof(RaygenRecord), cudaMemcpyHostToDevice));
1472
1473 // Cache individual record device pointers (used to select raygen per launch type)
1474 d_raygen_record_direct = d_raygen_records;
1475 d_raygen_record_diffuse = d_raygen_records + 1 * sizeof(RaygenRecord);
1476 d_raygen_record_camera = d_raygen_records + 2 * sizeof(RaygenRecord);
1477 d_raygen_record_pixel_label = d_raygen_records + 3 * sizeof(RaygenRecord);
1478
1479 // Miss records: header only, 4 records
1480 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) MissRecord {
1481 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1482 };
1483 constexpr int N_ms = 4;
1484 std::vector<MissRecord> ms_recs(N_ms);
1485 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_direct, &ms_recs[0]));
1486 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_diffuse, &ms_recs[1]));
1487 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_camera, &ms_recs[2]));
1488 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_pixel_label, &ms_recs[3]));
1489 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_miss_records), N_ms * sizeof(MissRecord)));
1490 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_miss_records), ms_recs.data(),
1491 N_ms * sizeof(MissRecord), cudaMemcpyHostToDevice));
1492
1493 // Hit group records: header + HitGroupData, 4 records (one per ray type)
1494 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) HitRecord {
1495 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1496 HitGroupData data;
1497 };
1498 constexpr int N_hg = 4;
1499 std::vector<HitRecord> hg_recs(N_hg);
1500 for (auto &r : hg_recs) {
1501 r.data.vertices = reinterpret_cast<float3 *>(d_patch_vertices);
1502 r.data.UUIDs = reinterpret_cast<uint32_t *>(d_patch_UUIDs);
1503 r.data.prim_type = 0; // patch
1504 }
1505 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_direct, &hg_recs[0]));
1506 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_diffuse, &hg_recs[1]));
1507 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_camera, &hg_recs[2]));
1508 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_pixel_label, &hg_recs[3]));
1509 CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_hitgroup_records), N_hg * sizeof(HitRecord)));
1510 CUDA_CHECK(cudaMemcpy(reinterpret_cast<void *>(d_hitgroup_records), hg_recs.data(),
1511 N_hg * sizeof(HitRecord), cudaMemcpyHostToDevice));
1512
1513 sbt = {};
1514 sbt.raygenRecord = d_raygen_records; // updated per-launch type
1515 sbt.missRecordBase = d_miss_records;
1516 sbt.missRecordStrideInBytes = static_cast<uint32_t>(sizeof(MissRecord));
1517 sbt.missRecordCount = N_ms;
1518 sbt.hitgroupRecordBase = d_hitgroup_records;
1519 sbt.hitgroupRecordStrideInBytes = static_cast<uint32_t>(sizeof(HitRecord));
1520 sbt.hitgroupRecordCount = N_hg;
1521}
1522
1523void OptiX8Backend::uploadLaunchParams() {
1524 CUDA_CHECK(cudaMemcpyAsync(reinterpret_cast<void *>(d_params), &h_params,
1525 sizeof(OptiX8LaunchParams), cudaMemcpyHostToDevice, cuda_stream));
1526}
1527
1528void OptiX8Backend::applyLaunchParams(const RayTracingLaunchParams &params) {
1529 h_params.launch_offset = params.launch_offset;
1530 h_params.launch_count = params.launch_count;
1531 h_params.rays_per_primitive = params.rays_per_primitive;
1532 h_params.random_seed = params.random_seed;
1533 h_params.Nbands_global = params.num_bands_global;
1534 h_params.Nbands_launch = params.num_bands_launch;
1535 h_params.launch_face = params.launch_face;
1536 h_params.scattering_iteration = params.scattering_iteration;
1537 h_params.specular_reflection_enabled = params.specular_reflection_enabled;
1538 h_params.camera_ID = params.camera_id;
1539 h_params.camera_position = make_float3(params.camera_position.x, params.camera_position.y, params.camera_position.z);
1540 h_params.camera_direction = make_float2(params.camera_direction.x, params.camera_direction.y);
1541 h_params.camera_focal_length = params.camera_focal_length;
1542 h_params.camera_lens_diameter = params.camera_lens_diameter;
1543 h_params.FOV_aspect_ratio = params.camera_fov_aspect;
1544 h_params.camera_HFOV = params.camera_HFOV;
1545 h_params.camera_resolution.x = params.camera_resolution.x;
1546 h_params.camera_resolution.y = params.camera_resolution.y;
1547 h_params.camera_viewplane_length= params.camera_viewplane_length;
1548 h_params.camera_pixel_solid_angle = params.camera_pixel_solid_angle;
1549 h_params.camera_pixel_offset.x = params.camera_pixel_offset.x;
1550 h_params.camera_pixel_offset.y = params.camera_pixel_offset.y;
1551 h_params.camera_resolution_full.x = params.camera_resolution_full.x;
1552 h_params.camera_resolution_full.y = params.camera_resolution_full.y;
1553}
1554
1555std::string OptiX8Backend::findDeviceCodeFile() const {
1556 // OptiX 8 uses either PTX or OptixIR (.optixir)
1557 // Use the non-throwing resolver so we can probe each candidate in order
1558 const std::vector<std::string> candidate_names = {
1559 "OptiX8DeviceCode.optixir",
1560 "OptiX8DeviceCode.ptx",
1561 };
1562 for (const auto &name : candidate_names) {
1563 auto path = helios::tryResolveFilePath("plugins/radiation/" + name);
1564 if (!path.empty()) {
1565 return path.string();
1566 }
1567 }
1569 "ERROR (OptiX8Backend::findDeviceCodeFile): Could not find OptiX8DeviceCode.optixir or "
1570 "OptiX8DeviceCode.ptx in the radiation plugin asset directory. "
1571 "Ensure the radiation plugin was built with OptiX 8 support (HELIOS_HAVE_OPTIX8).");
1572 return ""; // unreachable
1573}
1574
1575} // namespace helios