19#include <optix_function_table_definition.h>
33OptiX8Backend::OptiX8Backend() =
default;
38 cudaError_t rc = cudaGetDeviceCount(&device_count);
39 if (rc != cudaSuccess || device_count == 0) {
42 OptixResult optix_rc = optixInit();
43 return (optix_rc == OPTIX_SUCCESS);
49OptiX8Backend::~OptiX8Backend() {
61 CUDA_CHECK(cudaFree(
nullptr));
64 CUDA_CHECK(cudaStreamCreate(&cuda_stream));
67 OPTIX_CHECK(optixInit());
70 CUcontext cuda_context =
nullptr;
71 OptixDeviceContextOptions ctx_options = {};
72 ctx_options.logCallbackFunction = [](
unsigned int level,
const char *tag,
const char *message,
void *) {
74 std::cerr <<
"[OptiX][" << tag <<
"] " << message <<
"\n";
77 ctx_options.logCallbackLevel = 2;
78 OPTIX_CHECK(optixDeviceContextCreate(cuda_context, &ctx_options, &optix_context));
81 const std::string device_code_path = findDeviceCodeFile();
82 std::ifstream file(device_code_path, std::ios::binary | std::ios::ate);
83 if (!file.is_open()) {
84 helios_runtime_error(
"ERROR (OptiX8Backend::initialize): Could not open device code file: " + device_code_path);
86 const std::streamsize file_size = file.tellg();
87 file.seekg(0, std::ios::beg);
88 std::vector<char> device_code(file_size);
89 if (!file.read(device_code.data(), file_size)) {
90 helios_runtime_error(
"ERROR (OptiX8Backend::initialize): Could not read device code file: " + device_code_path);
93 OptixModuleCompileOptions module_options = {};
94 module_options.maxRegisterCount = OPTIX_COMPILE_DEFAULT_MAX_REGISTER_COUNT;
96 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
97 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
99 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
100 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
103 OptixPipelineCompileOptions pipeline_options = {};
104 pipeline_options.usesMotionBlur = 0;
105 pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_GAS;
106 pipeline_options.numPayloadValues = 2;
107 pipeline_options.numAttributeValues = 2;
108 pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
109 pipeline_options.pipelineLaunchParamsVariableName =
"params";
110 pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
113 size_t log_size =
sizeof(log);
115 OPTIX_CHECK(optixModuleCreate(
120 static_cast<size_t>(file_size),
125 OptixProgramGroupOptions pg_options = {};
128 auto createRaygen = [&](
const char *entry, OptixProgramGroup &pg) {
129 OptixProgramGroupDesc desc = {};
130 desc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
131 desc.raygen.module = optix_module;
132 desc.raygen.entryFunctionName = entry;
133 log_size =
sizeof(log);
134 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
138 createRaygen(
"__raygen__direct", pg_raygen_direct);
139 createRaygen(
"__raygen__diffuse", pg_raygen_diffuse);
140 createRaygen(
"__raygen__camera", pg_raygen_camera);
141 createRaygen(
"__raygen__pixel_label", pg_raygen_pixel_label);
144 auto createMiss = [&](
const char *entry, OptixProgramGroup &pg) {
145 OptixProgramGroupDesc desc = {};
146 desc.kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
147 desc.miss.module = optix_module;
148 desc.miss.entryFunctionName = entry;
149 log_size =
sizeof(log);
150 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
154 createMiss(
"__miss__direct", pg_miss_direct);
155 createMiss(
"__miss__diffuse", pg_miss_diffuse);
156 createMiss(
"__miss__camera", pg_miss_camera);
157 createMiss(
"__miss__pixel_label", pg_miss_pixel_label);
163 auto createHitGroup = [&](
const char *ch_entry,
const char *is_entry, OptixProgramGroup &pg) {
164 OptixProgramGroupDesc desc = {};
165 desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
166 desc.hitgroup.moduleCH = optix_module;
167 desc.hitgroup.entryFunctionNameCH = ch_entry;
168 desc.hitgroup.moduleIS = optix_module;
169 desc.hitgroup.entryFunctionNameIS = is_entry;
170 desc.hitgroup.moduleAH =
nullptr;
171 desc.hitgroup.entryFunctionNameAH =
nullptr;
172 log_size =
sizeof(log);
173 OPTIX_CHECK(optixProgramGroupCreate(optix_context, &desc, 1, &pg_options, log, &log_size, &pg));
177 createHitGroup(
"__closesthit__direct",
"__intersection__patch", pg_hit_direct);
178 createHitGroup(
"__closesthit__diffuse",
"__intersection__patch", pg_hit_diffuse);
179 createHitGroup(
"__closesthit__camera",
"__intersection__patch", pg_hit_camera);
180 createHitGroup(
"__closesthit__pixel_label",
"__intersection__patch", pg_hit_pixel_label);
183 OptixProgramGroup all_groups[] = {
184 pg_raygen_direct, pg_raygen_diffuse, pg_raygen_camera, pg_raygen_pixel_label,
185 pg_miss_direct, pg_miss_diffuse, pg_miss_camera, pg_miss_pixel_label,
186 pg_hit_direct, pg_hit_diffuse, pg_hit_camera, pg_hit_pixel_label
189 OptixPipelineLinkOptions link_options = {};
190 link_options.maxTraceDepth = 1;
192 log_size =
sizeof(log);
193 OPTIX_CHECK(optixPipelineCreate(
198 sizeof(all_groups) /
sizeof(all_groups[0]),
203 OptixStackSizes stack_sizes = {};
204 for (
auto &pg : all_groups) {
205 OPTIX_CHECK(optixUtilAccumulateStackSizes(pg, &stack_sizes, optix_pipeline));
207 uint32_t max_trace_depth = 1;
208 uint32_t direct_callable_stack_size_from_traversal = 0;
209 uint32_t direct_callable_stack_size_from_state = 0;
210 uint32_t continuation_stack_size = 0;
211 OPTIX_CHECK(optixUtilComputeStackSizes(
216 &direct_callable_stack_size_from_traversal,
217 &direct_callable_stack_size_from_state,
218 &continuation_stack_size));
219 OPTIX_CHECK(optixPipelineSetStackSize(
221 direct_callable_stack_size_from_traversal,
222 direct_callable_stack_size_from_state,
223 continuation_stack_size,
227 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_params),
sizeof(
OptiX8LaunchParams)));
228 memset(&h_params, 0,
sizeof(h_params));
230 is_initialized =
true;
234 if (!is_initialized) {
240 cudaStreamSynchronize(cuda_stream);
244 freeGeometryBuffers();
245 freeMaterialBuffers();
247 auto freePtr = [](CUdeviceptr &ptr) {
248 if (ptr) { cudaFree(
reinterpret_cast<void *
>(ptr)); ptr = 0; }
251 freePtr(d_radiation_in);
252 freePtr(d_radiation_out_top);
253 freePtr(d_radiation_out_bottom);
254 freePtr(d_scatter_buff_top);
255 freePtr(d_scatter_buff_bottom);
256 freePtr(d_radiation_in_camera);
257 freePtr(d_scatter_buff_top_cam);
258 freePtr(d_scatter_buff_bottom_cam);
259 freePtr(d_radiation_specular);
261 freePtr(d_camera_pixel_label);
262 freePtr(d_camera_pixel_depth);
263 freePtr(d_source_positions);
264 freePtr(d_source_rotations);
265 freePtr(d_source_widths);
266 freePtr(d_source_types);
267 freePtr(d_source_fluxes);
268 freePtr(d_source_fluxes_cam);
269 freePtr(d_diffuse_flux);
270 freePtr(d_diffuse_extinction);
271 freePtr(d_diffuse_peak_dir);
272 freePtr(d_diffuse_dist_norm);
273 freePtr(d_sky_radiance_params);
274 freePtr(d_camera_sky_radiance);
275 freePtr(d_solar_disk_radiance);
276 freePtr(d_band_launch_flag);
277 freePtr(d_mask_data);
278 freePtr(d_mask_sizes);
285 if (d_raygen_records) { cudaFree(
reinterpret_cast<void *
>(d_raygen_records)); d_raygen_records = 0; }
286 if (d_miss_records) { cudaFree(
reinterpret_cast<void *
>(d_miss_records)); d_miss_records = 0; }
287 if (d_hitgroup_records) { cudaFree(
reinterpret_cast<void *
>(d_hitgroup_records)); d_hitgroup_records = 0; }
290 if (d_gas_output) { cudaFree(
reinterpret_cast<void *
>(d_gas_output)); d_gas_output = 0; }
293 auto destroyPG = [](OptixProgramGroup &pg) {
294 if (pg) { optixProgramGroupDestroy(pg); pg =
nullptr; }
296 destroyPG(pg_raygen_direct); destroyPG(pg_raygen_diffuse);
297 destroyPG(pg_raygen_camera); destroyPG(pg_raygen_pixel_label);
298 destroyPG(pg_miss_direct); destroyPG(pg_miss_diffuse);
299 destroyPG(pg_miss_camera); destroyPG(pg_miss_pixel_label);
300 destroyPG(pg_hit_direct); destroyPG(pg_hit_diffuse);
301 destroyPG(pg_hit_camera); destroyPG(pg_hit_pixel_label);
303 if (optix_pipeline) { optixPipelineDestroy(optix_pipeline); optix_pipeline =
nullptr; }
304 if (optix_module) { optixModuleDestroy(optix_module); optix_module =
nullptr; }
305 if (optix_context) { optixDeviceContextDestroy(optix_context); optix_context =
nullptr; }
306 if (cuda_stream) { cudaStreamDestroy(cuda_stream); cuda_stream =
nullptr; }
308 is_initialized =
false;
316 validateGeometryBeforeUpload(geometry);
323 "which are not yet supported by the OptiX 8.1 backend.");
327 "which are not yet supported by the OptiX 8.1 backend.");
330 freeGeometryBuffers();
332 auto upload = [
this](CUdeviceptr &d_ptr,
const void *src,
size_t bytes) {
333 if (bytes > 0 && src) {
334 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_ptr), bytes));
335 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
351 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_object_subdivisions), bytes));
352 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_object_subdivisions),
373 upload(d_disk_UUIDs, geometry.
disk_UUIDs.data(), geometry.
disk_UUIDs.size() *
sizeof(uint32_t));
378 upload(d_tile_UUIDs, geometry.
tiles.
UUIDs.data(),
379 geometry.
tiles.
UUIDs.size() *
sizeof(uint32_t));
384 upload(d_voxel_UUIDs, geometry.
voxels.
UUIDs.data(),
403 freeCUdeviceptr(d_primitive_type);
405 ext_types.resize(Nprims + Nbboxes, 5u);
406 upload(d_primitive_type, ext_types.data(), ext_types.size() *
sizeof(uint32_t));
408 freeCUdeviceptr(d_primitive_uuid_arr);
410 ext_uuids.insert(ext_uuids.end(),
412 upload(d_primitive_uuid_arr, ext_uuids.data(), ext_uuids.size() *
sizeof(uint32_t));
418 std::vector<uint32_t> mask_offsets;
419 uint32_t cumulative = 0;
421 mask_offsets.push_back(cumulative);
422 cumulative +=
static_cast<uint32_t
>(sz.x) *
static_cast<uint32_t
>(sz.y);
426 std::vector<uint8_t> mask_data_u8(cumulative);
427 for (uint32_t i = 0; i < cumulative; ++i) {
428 mask_data_u8[i] = geometry.
mask_data[i] ? 1u : 0u;
437 for (
size_t p = 0; p < Np; ++p) {
438 if (!geometry.
uv_IDs.empty() && geometry.
uv_IDs[p] >= 0) {
439 for (
int v = 0; v < 4 && uv_read < geometry.
uv_data.size(); ++v) {
440 uv_flat[p * 4 + v] = geometry.
uv_data[uv_read++];
445 freeCUdeviceptr(d_mask_data);
446 freeCUdeviceptr(d_mask_offsets);
447 freeCUdeviceptr(d_mask_sizes);
448 freeCUdeviceptr(d_mask_IDs);
449 freeCUdeviceptr(d_uv_data);
450 freeCUdeviceptr(d_uv_IDs);
452 upload(d_mask_data, mask_data_u8.data(), mask_data_u8.size() *
sizeof(uint8_t));
453 upload(d_mask_offsets, mask_offsets.data(), mask_offsets.size() *
sizeof(uint32_t));
455 upload(d_mask_IDs, geometry.
mask_IDs.data(), geometry.
mask_IDs.size() *
sizeof(int32_t));
456 upload(d_uv_data, uv_flat.data(), uv_flat.size() *
sizeof(
helios::vec2));
457 upload(d_uv_IDs, geometry.
uv_IDs.data(), geometry.
uv_IDs.size() *
sizeof(int32_t));
461 const uint32_t Nprims =
static_cast<uint32_t
>(geometry.
primitive_count);
463 h_params.
primitive_type =
reinterpret_cast<uint32_t *
>(d_primitive_type);
465 h_params.
primitive_uuid =
reinterpret_cast<uint32_t *
>(d_primitive_uuid_arr);
466 h_params.
primitiveID =
reinterpret_cast<uint32_t *
>(d_primitiveID);
467 h_params.
objectID =
reinterpret_cast<uint32_t *
>(d_objectID);
469 h_params.
twosided_flag =
reinterpret_cast<int8_t *
>(d_twosided_flag);
470 h_params.primitive_solid_fraction =
reinterpret_cast<float *
>(d_primitive_solid_fraction);
471 h_params.
patch_vertices =
reinterpret_cast<float3 *
>(d_patch_vertices);
472 h_params.patch_UUIDs =
reinterpret_cast<uint32_t *
>(d_patch_UUIDs);
474 h_params.triangle_UUIDs =
reinterpret_cast<uint32_t *
>(d_triangle_UUIDs);
475 h_params.disk_centers =
reinterpret_cast<float3 *
>(d_disk_centers);
476 h_params.disk_radii =
reinterpret_cast<float *
>(d_disk_radii);
477 h_params.disk_normals =
reinterpret_cast<float3 *
>(d_disk_normals);
478 h_params.disk_UUIDs =
reinterpret_cast<uint32_t *
>(d_disk_UUIDs);
479 h_params.
tile_vertices =
reinterpret_cast<float3 *
>(d_tile_vertices);
480 h_params.tile_UUIDs =
reinterpret_cast<uint32_t *
>(d_tile_UUIDs);
481 h_params.
voxel_vertices =
reinterpret_cast<float3 *
>(d_voxel_vertices);
482 h_params.voxel_UUIDs =
reinterpret_cast<uint32_t *
>(d_voxel_UUIDs);
483 h_params.
bbox_vertices =
reinterpret_cast<float3 *
>(d_bbox_vertices);
484 h_params.bbox_UUIDs =
reinterpret_cast<uint32_t *
>(d_bbox_UUIDs);
485 h_params.Nprimitives = Nprims;
488 h_params.
mask_data =
reinterpret_cast<uint8_t *
>(d_mask_data);
489 h_params.
mask_offsets =
reinterpret_cast<uint32_t *
>(d_mask_offsets);
490 h_params.
mask_sizes =
reinterpret_cast<int32_t *
>(d_mask_sizes);
491 h_params.
mask_IDs =
reinterpret_cast<int32_t *
>(d_mask_IDs);
492 h_params.
uv_data =
reinterpret_cast<float2 *
>(d_uv_data);
493 h_params.
uv_IDs =
reinterpret_cast<int32_t *
>(d_uv_IDs);
504 buildAABBs(geometry);
508 if (current_primitive_count == 0) {
509 helios_runtime_error(
"ERROR (OptiX8Backend::buildAccelerationStructure): No geometry uploaded. Call updateGeometry() first.");
511 buildGAS(
static_cast<uint32_t
>(current_primitive_count + current_bbox_count));
520 freeMaterialBuffers();
522 auto upload = [
this](CUdeviceptr &d_ptr,
const void *src,
size_t bytes) {
523 if (bytes > 0 && src) {
524 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_ptr), bytes));
525 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
532 const size_t Nprims = current_primitive_count;
533 const size_t Nbands = materials.
num_bands;
534 const size_t alloc = std::max(materials.
reflectivity.size(), std::max(Nprims * Nbands, (
size_t)1));
535 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_rho), alloc *
sizeof(
float)));
536 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_rho), 0, alloc *
sizeof(
float)));
538 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_rho), materials.
reflectivity.data(),
539 materials.
reflectivity.size() *
sizeof(
float), cudaMemcpyHostToDevice));
543 const size_t Nprims = current_primitive_count;
544 const size_t Nbands = materials.
num_bands;
545 const size_t alloc = std::max(materials.
transmissivity.size(), std::max(Nprims * Nbands, (
size_t)1));
546 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_tau), alloc *
sizeof(
float)));
547 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_tau), 0, alloc *
sizeof(
float)));
549 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_tau), materials.
transmissivity.data(),
550 materials.
transmissivity.size() *
sizeof(
float), cudaMemcpyHostToDevice));
563 const size_t Nprims = current_primitive_count;
564 const size_t Nbands = materials.
num_bands;
565 const size_t rad_bytes = Nprims * Nbands *
sizeof(float);
567 reallocDevice(d_radiation_in, rad_bytes);
568 reallocDevice(d_radiation_out_top, rad_bytes);
569 reallocDevice(d_radiation_out_bottom, rad_bytes);
570 reallocDevice(d_scatter_buff_top, rad_bytes);
571 reallocDevice(d_scatter_buff_bottom, rad_bytes);
573 current_band_count = Nbands;
578 h_params.
rho =
reinterpret_cast<float *
>(d_rho);
579 h_params.
tau =
reinterpret_cast<float *
>(d_tau);
580 h_params.
rho_cam =
reinterpret_cast<float *
>(d_rho_cam);
581 h_params.
tau_cam =
reinterpret_cast<float *
>(d_tau_cam);
582 h_params.specular_exponent =
reinterpret_cast<float *
>(d_specular_exponent);
583 h_params.specular_scale =
reinterpret_cast<float *
>(d_specular_scale);
584 h_params.
radiation_in =
reinterpret_cast<float *
>(d_radiation_in);
589 h_params.Nsources =
static_cast<uint32_t
>(materials.
num_sources);
590 h_params.Ncameras =
static_cast<uint32_t
>(materials.
num_cameras);
591 h_params.Nbands_global =
static_cast<uint32_t
>(Nbands);
599 auto freePtr = [
this](CUdeviceptr &ptr) { freeCUdeviceptr(ptr); };
600 freePtr(d_source_positions);
601 freePtr(d_source_rotations);
602 freePtr(d_source_widths);
603 freePtr(d_source_types);
606 const size_t Nsources = sources.size();
608 current_source_count = 0;
609 h_params.Nsources = 0;
613 std::vector<float3> positions(Nsources);
614 std::vector<float3> rotations(Nsources);
615 std::vector<float2> widths(Nsources);
616 std::vector<uint32_t> types(Nsources);
618 for (
size_t i = 0; i < Nsources; i++) {
619 positions[i] = make_float3(sources[i].position.x, sources[i].position.y, sources[i].position.z);
620 rotations[i] = make_float3(sources[i].rotation.x, sources[i].rotation.y, sources[i].rotation.z);
621 widths[i] = make_float2(sources[i].width.x, sources[i].width.y);
622 types[i] = sources[i].type;
625 auto upload = [
this](CUdeviceptr &d_ptr,
const void *src,
size_t bytes) {
626 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_ptr), bytes));
627 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_ptr), src, bytes, cudaMemcpyHostToDevice));
630 upload(d_source_positions, positions.data(), Nsources *
sizeof(float3));
631 upload(d_source_rotations, rotations.data(), Nsources *
sizeof(float3));
632 upload(d_source_widths, widths.data(), Nsources *
sizeof(float2));
633 upload(d_source_types, types.data(), Nsources *
sizeof(uint32_t));
638 freeCUdeviceptr(d_source_fluxes_cam);
639 std::vector<float> fluxes_cam;
640 for (
size_t i = 0; i < Nsources; i++) {
641 for (
float f : sources[i].fluxes_cam) {
642 fluxes_cam.push_back(f);
645 if (!fluxes_cam.empty()) {
646 upload(d_source_fluxes_cam, fluxes_cam.data(), fluxes_cam.size() *
sizeof(
float));
650 current_source_count = Nsources;
652 h_params.Nsources =
static_cast<uint32_t
>(Nsources);
653 h_params.source_positions =
reinterpret_cast<float3 *
>(d_source_positions);
654 h_params.source_rotations =
reinterpret_cast<float3 *
>(d_source_rotations);
655 h_params.source_widths =
reinterpret_cast<float2 *
>(d_source_widths);
656 h_params.source_types =
reinterpret_cast<uint32_t *
>(d_source_types);
664 const std::vector<helios::vec3> &peak_dir,
665 const std::vector<float> &dist_norm,
666 const std::vector<float> &sky_energy) {
667 freeCUdeviceptr(d_diffuse_flux);
668 freeCUdeviceptr(d_diffuse_extinction);
669 freeCUdeviceptr(d_diffuse_peak_dir);
670 freeCUdeviceptr(d_diffuse_dist_norm);
671 freeCUdeviceptr(d_Rsky);
673 auto upload_f = [
this](CUdeviceptr &ptr,
const std::vector<float> &v) {
675 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&ptr), v.size() *
sizeof(
float)));
676 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(ptr), v.data(),
677 v.size() *
sizeof(
float), cudaMemcpyHostToDevice));
680 upload_f(d_diffuse_flux, flux);
681 upload_f(d_diffuse_extinction, extinction);
682 upload_f(d_diffuse_dist_norm, dist_norm);
683 upload_f(d_Rsky, sky_energy);
685 if (!peak_dir.empty()) {
686 std::vector<float3> pd_f3(peak_dir.size());
687 for (
size_t i = 0; i < peak_dir.size(); i++) {
688 pd_f3[i] = make_float3(peak_dir[i].x, peak_dir[i].y, peak_dir[i].z);
690 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_diffuse_peak_dir),
691 pd_f3.size() *
sizeof(float3)));
692 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_diffuse_peak_dir), pd_f3.data(),
693 pd_f3.size() *
sizeof(float3), cudaMemcpyHostToDevice));
696 h_params.
diffuse_flux =
reinterpret_cast<float *
>(d_diffuse_flux);
700 h_params.
Rsky =
reinterpret_cast<float *
>(d_Rsky);
704 const std::vector<float> &camera_sky_radiance,
706 const std::vector<float> &solar_disk_radiance,
707 float solar_disk_cos_angle) {
709 freeCUdeviceptr(d_sky_radiance_params);
710 if (!sky_radiance_params.empty()) {
711 std::vector<float4> f4(sky_radiance_params.size());
712 for (
size_t i = 0; i < sky_radiance_params.size(); i++) {
713 f4[i] = make_float4(sky_radiance_params[i].x, sky_radiance_params[i].y,
714 sky_radiance_params[i].z, sky_radiance_params[i].w);
716 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_sky_radiance_params), f4.size() *
sizeof(float4)));
717 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_sky_radiance_params), f4.data(),
718 f4.size() *
sizeof(float4), cudaMemcpyHostToDevice));
721 freeCUdeviceptr(d_camera_sky_radiance);
722 if (!camera_sky_radiance.empty()) {
723 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_camera_sky_radiance),
724 camera_sky_radiance.size() *
sizeof(
float)));
725 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_camera_sky_radiance), camera_sky_radiance.data(),
726 camera_sky_radiance.size() *
sizeof(
float), cudaMemcpyHostToDevice));
729 freeCUdeviceptr(d_solar_disk_radiance);
730 if (!solar_disk_radiance.empty()) {
731 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_solar_disk_radiance),
732 solar_disk_radiance.size() *
sizeof(
float)));
733 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_solar_disk_radiance), solar_disk_radiance.data(),
734 solar_disk_radiance.size() *
sizeof(
float), cudaMemcpyHostToDevice));
740 h_params.sun_direction = make_float3(sun_direction.
x, sun_direction.
y, sun_direction.
z);
741 h_params.solar_disk_cos_angle = solar_disk_cos_angle;
749 if (!is_initialized) {
752 if (gas_handle == 0) {
753 helios_runtime_error(
"ERROR (OptiX8Backend::launchDirectRays): No acceleration structure. Call buildAccelerationStructure() first.");
756 applyLaunchParams(launch_params);
759 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
762 std::vector<uint8_t> flags_u8(Nbands_g);
763 for (
size_t i = 0; i < Nbands_g; i++) {
766 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_band_launch_flag), Nbands_g *
sizeof(uint8_t)));
767 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_band_launch_flag), flags_u8.data(),
768 Nbands_g *
sizeof(uint8_t), cudaMemcpyHostToDevice));
771 h_params.traversable = gas_handle;
775 const uint32_t launch_count = launch_params.
launch_count;
777 const uint32_t dim_x =
static_cast<uint32_t
>(sqrtf(
static_cast<float>(rpp)));
778 const uint32_t dim_y = (dim_x > 0) ? (rpp / dim_x) : 1u;
779 h_params.launch_dim_x = dim_x;
780 h_params.launch_dim_y = dim_y;
783 h_params.prd_pool =
nullptr;
786 OptixShaderBindingTable direct_sbt = sbt;
787 direct_sbt.raygenRecord = d_raygen_records;
790 const uint32_t MAX_DEPTH = 65535u;
792 uint32_t remaining = launch_count;
793 while (remaining > 0) {
794 const uint32_t batch = std::min(remaining, MAX_DEPTH);
797 uploadLaunchParams();
798 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
800 dim_x, dim_y, batch));
801 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
808 if (!is_initialized) {
811 if (gas_handle == 0) {
813 "Call buildAccelerationStructure() first.");
816 applyLaunchParams(launch_params);
823 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_top),
826 cudaMemcpyHostToDevice));
829 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_bottom),
832 cudaMemcpyHostToDevice));
836 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
839 std::vector<uint8_t> flags_u8(Nbands_g);
840 for (
size_t i = 0; i < Nbands_g; i++) {
843 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_band_launch_flag),
844 Nbands_g *
sizeof(uint8_t)));
845 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_band_launch_flag), flags_u8.data(),
846 Nbands_g *
sizeof(uint8_t), cudaMemcpyHostToDevice));
849 h_params.traversable = gas_handle;
852 freeCUdeviceptr(d_diffuse_flux);
853 freeCUdeviceptr(d_diffuse_extinction);
854 freeCUdeviceptr(d_diffuse_peak_dir);
855 freeCUdeviceptr(d_diffuse_dist_norm);
856 freeCUdeviceptr(d_sky_radiance_params);
858 auto upload_f = [
this](CUdeviceptr &ptr,
const std::vector<float> &v) {
860 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&ptr), v.size() *
sizeof(
float)));
861 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(ptr), v.data(),
862 v.size() *
sizeof(
float), cudaMemcpyHostToDevice));
876 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_diffuse_peak_dir),
877 pd_f3.size() *
sizeof(float3)));
878 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_diffuse_peak_dir), pd_f3.data(),
879 pd_f3.size() *
sizeof(float3), cudaMemcpyHostToDevice));
890 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_sky_radiance_params),
891 sky_f4.size() *
sizeof(float4)));
892 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_sky_radiance_params), sky_f4.data(),
893 sky_f4.size() *
sizeof(float4), cudaMemcpyHostToDevice));
896 h_params.
diffuse_flux =
reinterpret_cast<float *
>(d_diffuse_flux);
910 const uint32_t dim_x =
static_cast<uint32_t
>(sqrtf(
static_cast<float>(rpp)));
911 const uint32_t dim_y = (dim_x > 0) ? (rpp / dim_x) : 1u;
912 h_params.launch_dim_x = dim_x;
913 h_params.launch_dim_y = dim_y;
914 h_params.prd_pool =
nullptr;
917 OptixShaderBindingTable diffuse_sbt = sbt;
918 diffuse_sbt.raygenRecord = d_raygen_record_diffuse;
921 const uint32_t MAX_DEPTH = 65535u;
922 const uint32_t launch_count = launch_params.
launch_count;
924 uint32_t remaining = launch_count;
925 while (remaining > 0) {
926 const uint32_t batch = std::min(remaining, MAX_DEPTH);
929 uploadLaunchParams();
930 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
932 dim_x, dim_y, batch));
933 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
940 if (!is_initialized) {
943 if (gas_handle == 0) {
945 "Call buildAccelerationStructure() first.");
948 applyLaunchParams(launch_params);
956 const uint32_t cam_id = launch_params.
camera_id;
960 const size_t Npixels = (size_t)full_w * full_h;
961 const size_t cam_bytes = Npixels * Nbands_l *
sizeof(float);
962 if (cam_id != current_camera_launch_id || current_launch_band_count != Nbands_l) {
963 reallocDevice(d_radiation_in_camera, cam_bytes);
964 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_radiation_in_camera), 0, cam_bytes));
967 freeCUdeviceptr(d_camera_pixel_label);
968 freeCUdeviceptr(d_camera_pixel_depth);
969 h_params.camera_pixel_label =
nullptr;
970 h_params.camera_pixel_depth =
nullptr;
971 current_camera_launch_id = cam_id;
972 current_launch_band_count = Nbands_l;
977 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
980 std::vector<uint8_t> flags_u8(Nbands_g);
981 for (
size_t i = 0; i < Nbands_g; i++) {
984 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_band_launch_flag), Nbands_g *
sizeof(uint8_t)));
985 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_band_launch_flag), flags_u8.data(),
986 Nbands_g *
sizeof(uint8_t), cudaMemcpyHostToDevice));
989 h_params.traversable = gas_handle;
992 h_params.launch_dim_x = anti_samples;
993 h_params.launch_dim_y = tile_w;
995 uploadLaunchParams();
997 OptixShaderBindingTable camera_sbt = sbt;
998 camera_sbt.raygenRecord = d_raygen_record_camera;
1000 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
1002 anti_samples, tile_w, tile_h));
1003 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1007 if (!is_initialized) {
1010 if (gas_handle == 0) {
1011 helios_runtime_error(
"ERROR (OptiX8Backend::launchPixelLabelRays): No acceleration structure. "
1012 "Call buildAccelerationStructure() first.");
1015 applyLaunchParams(launch_params);
1021 if (d_band_launch_flag) { freeCUdeviceptr(d_band_launch_flag); }
1024 std::vector<uint8_t> flags_u8(Nbands_g);
1025 for (
size_t i = 0; i < Nbands_g; i++) {
1028 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_band_launch_flag), Nbands_g *
sizeof(uint8_t)));
1029 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_band_launch_flag), flags_u8.data(),
1030 Nbands_g *
sizeof(uint8_t), cudaMemcpyHostToDevice));
1033 h_params.traversable = gas_handle;
1036 h_params.launch_dim_x = 1u;
1037 h_params.launch_dim_y = tile_w;
1039 uploadLaunchParams();
1041 OptixShaderBindingTable pixel_label_sbt = sbt;
1042 pixel_label_sbt.raygenRecord = d_raygen_record_pixel_label;
1044 OPTIX_CHECK(optixLaunch(optix_pipeline, cuda_stream, d_params,
1046 1u, tile_w, tile_h));
1047 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1055 const size_t Nprims = current_primitive_count;
1056 const size_t Nbands = current_band_count;
1057 const size_t total = Nprims * Nbands;
1065 if (d_radiation_in) results.
radiation_in = downloadFloat(d_radiation_in, total);
1066 if (d_radiation_out_top) results.
radiation_out_top = downloadFloat(d_radiation_out_top, total);
1067 if (d_radiation_out_bottom) results.
radiation_out_bottom = downloadFloat(d_radiation_out_bottom, total);
1068 if (d_scatter_buff_top) results.
scatter_buff_top = downloadFloat(d_scatter_buff_top, total);
1069 if (d_scatter_buff_bottom) results.
scatter_buff_bottom = downloadFloat(d_scatter_buff_bottom, total);
1073 if (Nprims > 0 && current_launch_band_count > 0) {
1074 const size_t cam_total = Nprims * current_launch_band_count;
1075 if (d_scatter_buff_top_cam) results.
scatter_buff_top_cam = downloadFloat(d_scatter_buff_top_cam, cam_total);
1076 if (d_scatter_buff_bottom_cam) results.
scatter_buff_bottom_cam = downloadFloat(d_scatter_buff_bottom_cam, cam_total);
1081 std::vector<float> &pixel_depths,
uint camera_id,
1083 const size_t Npixels = (size_t)resolution.
x * resolution.
y;
1085 if (d_radiation_in_camera && Npixels > 0 && current_launch_band_count > 0) {
1086 pixel_data = downloadFloat(d_radiation_in_camera, Npixels * current_launch_band_count);
1089 if (d_camera_pixel_label && Npixels > 0) {
1090 auto labels_u32 = downloadUInt32(d_camera_pixel_label, Npixels);
1091 pixel_labels.assign(labels_u32.begin(), labels_u32.end());
1094 if (d_camera_pixel_depth && Npixels > 0) {
1095 pixel_depths = downloadFloat(d_camera_pixel_depth, Npixels);
1104 const size_t Nprims = current_primitive_count;
1105 if (Nprims == 0 || launch_band_count == 0)
return;
1107 if (launch_band_count > current_band_count) {
1109 std::to_string(launch_band_count) +
") exceeds current_band_count (" +
1110 std::to_string(current_band_count) +
"). Call updateMaterials() first.");
1114 current_camera_launch_id = 0xFFFFFFFFu;
1116 const size_t bytes = Nprims * launch_band_count *
sizeof(float);
1118 if (d_radiation_in) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_radiation_in), 0, bytes));
1119 if (d_radiation_out_top) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_radiation_out_top), 0, bytes));
1120 if (d_radiation_out_bottom) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_radiation_out_bottom), 0, bytes));
1121 if (d_scatter_buff_top) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_top), 0, bytes));
1122 if (d_scatter_buff_bottom) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_bottom), 0, bytes));
1125 const size_t specular_size = current_source_count * current_camera_count * Nprims * launch_band_count;
1126 if (specular_size > 0) {
1127 const size_t specular_bytes = specular_size *
sizeof(float);
1128 reallocDevice(d_radiation_specular, specular_bytes);
1129 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_radiation_specular), 0, specular_bytes));
1135 const size_t total_bytes = current_primitive_count * current_band_count *
sizeof(float);
1136 if (total_bytes == 0)
return;
1138 if (d_scatter_buff_top) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_top), 0, total_bytes));
1139 if (d_scatter_buff_bottom) CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_bottom), 0, total_bytes));
1143 const size_t Npixels = (size_t)resolution.
x * resolution.
y;
1144 if (Npixels == 0)
return;
1146 reallocDevice(d_camera_pixel_label, Npixels *
sizeof(uint32_t));
1147 reallocDevice(d_camera_pixel_depth, Npixels *
sizeof(
float));
1148 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_camera_pixel_label), 0, Npixels *
sizeof(uint32_t)));
1149 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_camera_pixel_depth), 0, Npixels *
sizeof(
float)));
1151 h_params.camera_pixel_label =
reinterpret_cast<uint32_t *
>(d_camera_pixel_label);
1152 h_params.camera_pixel_depth =
reinterpret_cast<float *
>(d_camera_pixel_depth);
1156 const size_t total = current_primitive_count * current_band_count *
sizeof(float);
1157 if (total == 0)
return;
1158 if (d_scatter_buff_top && d_radiation_out_top) {
1159 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_top),
1160 reinterpret_cast<const void *
>(d_scatter_buff_top),
1161 total, cudaMemcpyDeviceToDevice));
1163 if (d_scatter_buff_bottom && d_radiation_out_bottom) {
1164 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_bottom),
1165 reinterpret_cast<const void *
>(d_scatter_buff_bottom),
1166 total, cudaMemcpyDeviceToDevice));
1171 const std::vector<float> &radiation_out_bottom) {
1172 if (!radiation_out_top.empty() && d_radiation_out_top) {
1173 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_top),
1174 radiation_out_top.data(),
1175 radiation_out_top.size() *
sizeof(
float),
1176 cudaMemcpyHostToDevice));
1178 if (!radiation_out_bottom.empty() && d_radiation_out_bottom) {
1179 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_radiation_out_bottom),
1180 radiation_out_bottom.data(),
1181 radiation_out_bottom.size() *
sizeof(
float),
1182 cudaMemcpyHostToDevice));
1187 const std::vector<float> &scatter_bottom_cam) {
1188 if (!scatter_top_cam.empty() && d_scatter_buff_top_cam) {
1189 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_scatter_buff_top_cam),
1190 scatter_top_cam.data(),
1191 scatter_top_cam.size() *
sizeof(
float),
1192 cudaMemcpyHostToDevice));
1194 if (!scatter_bottom_cam.empty() && d_scatter_buff_bottom_cam) {
1195 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_scatter_buff_bottom_cam),
1196 scatter_bottom_cam.data(),
1197 scatter_bottom_cam.size() *
sizeof(
float),
1198 cudaMemcpyHostToDevice));
1203 const size_t Nprims = current_primitive_count;
1204 if (Nprims == 0 || launch_band_count == 0)
return;
1206 const size_t bytes = Nprims * launch_band_count *
sizeof(float);
1207 reallocDevice(d_scatter_buff_top_cam, bytes);
1208 reallocDevice(d_scatter_buff_bottom_cam, bytes);
1209 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_top_cam), 0, bytes));
1210 CUDA_CHECK(cudaMemset(
reinterpret_cast<void *
>(d_scatter_buff_bottom_cam), 0, bytes));
1213 h_params.scatter_buff_bottom_cam =
reinterpret_cast<float *
>(d_scatter_buff_bottom_cam);
1214 current_launch_band_count = launch_band_count;
1218 freeCUdeviceptr(d_source_fluxes);
1219 if (fluxes.empty())
return;
1221 const size_t bytes = fluxes.size() *
sizeof(float);
1222 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_source_fluxes), bytes));
1223 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_source_fluxes), fluxes.data(), bytes, cudaMemcpyHostToDevice));
1224 h_params.
source_fluxes =
reinterpret_cast<float *
>(d_source_fluxes);
1228 freeCUdeviceptr(d_source_fluxes_cam);
1229 if (fluxes_cam.empty())
return;
1231 const size_t bytes = fluxes_cam.size() *
sizeof(float);
1232 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_source_fluxes_cam), bytes));
1233 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_source_fluxes_cam), fluxes_cam.data(), bytes,
1234 cudaMemcpyHostToDevice));
1243 size_t free_bytes = 0;
1244 size_t total_bytes = 0;
1245 CUDA_CHECK(cudaMemGetInfo(&free_bytes, &total_bytes));
1246 const float free_mb =
static_cast<float>(free_bytes) / (1024.0f * 1024.0f);
1247 const float total_mb =
static_cast<float>(total_bytes) / (1024.0f * 1024.0f);
1248 std::cout <<
"GPU memory: " << free_mb <<
" MB free / " << total_mb <<
" MB total" << std::endl;
1255void OptiX8Backend::freeCUdeviceptr(CUdeviceptr &ptr) {
1257 cudaFree(
reinterpret_cast<void *
>(ptr));
1262void OptiX8Backend::freeGeometryBuffers() {
1263 freeCUdeviceptr(d_transform_matrix);
1264 freeCUdeviceptr(d_primitive_type);
1265 freeCUdeviceptr(d_primitive_positions);
1266 freeCUdeviceptr(d_primitive_uuid_arr);
1267 freeCUdeviceptr(d_primitiveID);
1268 freeCUdeviceptr(d_objectID);
1269 freeCUdeviceptr(d_object_subdivisions);
1270 freeCUdeviceptr(d_twosided_flag);
1271 freeCUdeviceptr(d_primitive_solid_fraction);
1272 freeCUdeviceptr(d_patch_vertices);
1273 freeCUdeviceptr(d_patch_UUIDs);
1274 freeCUdeviceptr(d_triangle_vertices);
1275 freeCUdeviceptr(d_triangle_UUIDs);
1276 freeCUdeviceptr(d_disk_centers);
1277 freeCUdeviceptr(d_disk_radii);
1278 freeCUdeviceptr(d_disk_normals);
1279 freeCUdeviceptr(d_disk_UUIDs);
1280 freeCUdeviceptr(d_tile_vertices);
1281 freeCUdeviceptr(d_tile_UUIDs);
1282 freeCUdeviceptr(d_voxel_vertices);
1283 freeCUdeviceptr(d_voxel_UUIDs);
1284 freeCUdeviceptr(d_bbox_vertices);
1285 freeCUdeviceptr(d_bbox_UUIDs);
1286 freeCUdeviceptr(d_mask_data);
1287 freeCUdeviceptr(d_mask_offsets);
1288 freeCUdeviceptr(d_mask_sizes);
1289 freeCUdeviceptr(d_mask_IDs);
1290 freeCUdeviceptr(d_uv_data);
1291 freeCUdeviceptr(d_uv_IDs);
1292 freeCUdeviceptr(d_aabbs);
1293 freeCUdeviceptr(d_gas_output);
1297void OptiX8Backend::freeMaterialBuffers() {
1298 freeCUdeviceptr(d_rho);
1299 freeCUdeviceptr(d_tau);
1300 freeCUdeviceptr(d_rho_cam);
1301 freeCUdeviceptr(d_tau_cam);
1302 freeCUdeviceptr(d_specular_exponent);
1303 freeCUdeviceptr(d_specular_scale);
1306void OptiX8Backend::reallocDevice(CUdeviceptr &ptr,
size_t bytes) {
1307 freeCUdeviceptr(ptr);
1309 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&ptr), bytes));
1313std::vector<float> OptiX8Backend::downloadFloat(CUdeviceptr ptr,
size_t count)
const {
1314 std::vector<float> result(count);
1315 CUDA_CHECK(cudaMemcpy(result.data(),
reinterpret_cast<const void *
>(ptr),
1316 count *
sizeof(
float), cudaMemcpyDeviceToHost));
1320std::vector<uint32_t> OptiX8Backend::downloadUInt32(CUdeviceptr ptr,
size_t count)
const {
1321 std::vector<uint32_t> result(count);
1322 CUDA_CHECK(cudaMemcpy(result.data(),
reinterpret_cast<const void *
>(ptr),
1323 count *
sizeof(uint32_t), cudaMemcpyDeviceToHost));
1327void OptiX8Backend::buildAABBs(
const RayTracingGeometry &geometry) {
1328 const uint32_t Nprims =
static_cast<uint32_t
>(geometry.primitive_count);
1329 const uint32_t Nbboxes =
static_cast<uint32_t
>(geometry.bboxes.UUIDs.size());
1330 const uint32_t Ntotal = Nprims + Nbboxes;
1331 if (Ntotal == 0)
return;
1333 std::vector<OptixAabb> aabbs(Ntotal);
1334 const float eps = 1e-5f;
1336 auto store_aabb = [&](uint32_t pos,
float mn_x,
float mn_y,
float mn_z,
1337 float mx_x,
float mx_y,
float mx_z) {
1338 if (mx_x - mn_x < eps) { mn_x -= eps; mx_x += eps; }
1339 if (mx_y - mn_y < eps) { mn_y -= eps; mx_y += eps; }
1340 if (mx_z - mn_z < eps) { mn_z -= eps; mx_z += eps; }
1341 aabbs[pos] = {mn_x, mn_y, mn_z, mx_x, mx_y, mx_z};
1345 for (uint32_t pos = 0; pos < Nprims; pos++) {
1346 const float *T = &geometry.transform_matrices[pos * 16];
1347 const uint32_t pt = geometry.primitive_types[pos];
1349 float mn_x = FLT_MAX, mn_y = FLT_MAX, mn_z = FLT_MAX;
1350 float mx_x = -FLT_MAX, mx_y = -FLT_MAX, mx_z = -FLT_MAX;
1352 auto expand = [&](
float x,
float y,
float z) {
1353 const float wx = T[0]*x + T[1]*y + T[2]*z + T[3];
1354 const float wy = T[4]*x + T[5]*y + T[6]*z + T[7];
1355 const float wz = T[8]*x + T[9]*y + T[10]*z + T[11];
1356 if (wx < mn_x) mn_x = wx;
if (wx > mx_x) mx_x = wx;
1357 if (wy < mn_y) mn_y = wy;
if (wy > mx_y) mx_y = wy;
1358 if (wz < mn_z) mn_z = wz;
if (wz > mx_z) mx_z = wz;
1361 if (pt == 0 || pt == 3) {
1362 expand(-0.5f, -0.5f, 0.f); expand( 0.5f, -0.5f, 0.f);
1363 expand(-0.5f, 0.5f, 0.f); expand( 0.5f, 0.5f, 0.f);
1364 }
else if (pt == 1) {
1365 expand(0.f, 0.f, 0.f); expand(0.f, 1.f, 0.f); expand(1.f, 1.f, 0.f);
1366 }
else if (pt == 2) {
1367 expand(-0.5f, -0.5f, 0.f); expand( 0.5f, -0.5f, 0.f);
1368 expand(-0.5f, 0.5f, 0.f); expand( 0.5f, 0.5f, 0.f);
1369 }
else if (pt == 4) {
1370 for (
float fx : {0.f, 1.f})
1371 for (
float fy : {0.f, 1.f})
1372 for (
float fz : {0.f, 1.f})
1375 expand(-0.5f, -0.5f, -0.5f); expand( 0.5f, 0.5f, 0.5f);
1378 store_aabb(pos, mn_x, mn_y, mn_z, mx_x, mx_y, mx_z);
1382 for (uint32_t b = 0; b < Nbboxes; b++) {
1383 float mn_x = FLT_MAX, mn_y = FLT_MAX, mn_z = FLT_MAX;
1384 float mx_x = -FLT_MAX, mx_y = -FLT_MAX, mx_z = -FLT_MAX;
1385 for (
int v = 0; v < 4; v++) {
1386 const helios::vec3 &vtx = geometry.bboxes.vertices[b * 4 + v];
1387 mn_x = std::min(mn_x, vtx.
x); mx_x = std::max(mx_x, vtx.
x);
1388 mn_y = std::min(mn_y, vtx.
y); mx_y = std::max(mx_y, vtx.
y);
1389 mn_z = std::min(mn_z, vtx.
z); mx_z = std::max(mx_z, vtx.
z);
1391 store_aabb(Nprims + b, mn_x, mn_y, mn_z, mx_x, mx_y, mx_z);
1394 const size_t aabb_bytes = Ntotal *
sizeof(OptixAabb);
1395 reallocDevice(d_aabbs, aabb_bytes);
1396 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_aabbs), aabbs.data(), aabb_bytes, cudaMemcpyHostToDevice));
1399void OptiX8Backend::buildGAS(uint32_t Nprimitives) {
1400 if (d_gas_output) { cudaFree(
reinterpret_cast<void *
>(d_gas_output)); d_gas_output = 0; }
1403 if (Nprimitives == 0 || !d_aabbs)
return;
1405 const unsigned int build_flags[] = {OPTIX_GEOMETRY_FLAG_NONE};
1406 OptixBuildInputCustomPrimitiveArray ca = {};
1407 ca.aabbBuffers = &d_aabbs;
1408 ca.numPrimitives = Nprimitives;
1409 ca.strideInBytes =
sizeof(OptixAabb);
1410 ca.flags = build_flags;
1411 ca.numSbtRecords = 1;
1413 OptixBuildInput bi = {};
1414 bi.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1415 bi.customPrimitiveArray = ca;
1417 OptixAccelBuildOptions opts = {};
1418 opts.buildFlags = OPTIX_BUILD_FLAG_ALLOW_COMPACTION | OPTIX_BUILD_FLAG_PREFER_FAST_TRACE;
1419 opts.operation = OPTIX_BUILD_OPERATION_BUILD;
1421 OptixAccelBufferSizes sizes = {};
1422 OPTIX_CHECK(optixAccelComputeMemoryUsage(optix_context, &opts, &bi, 1, &sizes));
1424 CUdeviceptr d_temp = 0, d_pre_compact = 0, d_compact_size = 0;
1425 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_temp), sizes.tempSizeInBytes));
1426 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_pre_compact), sizes.outputSizeInBytes));
1427 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_compact_size),
sizeof(uint64_t)));
1429 OptixAccelEmitDesc emit = {};
1430 emit.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1431 emit.result = d_compact_size;
1433 OPTIX_CHECK(optixAccelBuild(optix_context, cuda_stream, &opts, &bi, 1,
1434 d_temp, sizes.tempSizeInBytes,
1435 d_pre_compact, sizes.outputSizeInBytes,
1436 &gas_handle, &emit, 1));
1437 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1439 uint64_t compact_size = 0;
1440 CUDA_CHECK(cudaMemcpy(&compact_size,
reinterpret_cast<const void *
>(d_compact_size),
1441 sizeof(uint64_t), cudaMemcpyDeviceToHost));
1443 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_gas_output), compact_size));
1444 OPTIX_CHECK(optixAccelCompact(optix_context, cuda_stream, gas_handle, d_gas_output, compact_size, &gas_handle));
1445 CUDA_CHECK(cudaStreamSynchronize(cuda_stream));
1447 CUDA_CHECK(cudaFree(
reinterpret_cast<void *
>(d_temp)));
1448 CUDA_CHECK(cudaFree(
reinterpret_cast<void *
>(d_pre_compact)));
1449 CUDA_CHECK(cudaFree(
reinterpret_cast<void *
>(d_compact_size)));
1451 h_params.traversable = gas_handle;
1454void OptiX8Backend::buildSBT() {
1455 if (d_raygen_records) { cudaFree(
reinterpret_cast<void *
>(d_raygen_records)); d_raygen_records = 0; }
1456 if (d_miss_records) { cudaFree(
reinterpret_cast<void *
>(d_miss_records)); d_miss_records = 0; }
1457 if (d_hitgroup_records) { cudaFree(
reinterpret_cast<void *
>(d_hitgroup_records)); d_hitgroup_records = 0; }
1460 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) RaygenRecord {
1461 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1463 constexpr int N_rg = 4;
1464 std::vector<RaygenRecord> rg_recs(N_rg);
1465 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_direct, &rg_recs[0]));
1466 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_diffuse, &rg_recs[1]));
1467 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_camera, &rg_recs[2]));
1468 OPTIX_CHECK(optixSbtRecordPackHeader(pg_raygen_pixel_label, &rg_recs[3]));
1469 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_raygen_records), N_rg *
sizeof(RaygenRecord)));
1470 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_raygen_records), rg_recs.data(),
1471 N_rg *
sizeof(RaygenRecord), cudaMemcpyHostToDevice));
1474 d_raygen_record_direct = d_raygen_records;
1475 d_raygen_record_diffuse = d_raygen_records + 1 *
sizeof(RaygenRecord);
1476 d_raygen_record_camera = d_raygen_records + 2 *
sizeof(RaygenRecord);
1477 d_raygen_record_pixel_label = d_raygen_records + 3 *
sizeof(RaygenRecord);
1480 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) MissRecord {
1481 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1483 constexpr int N_ms = 4;
1484 std::vector<MissRecord> ms_recs(N_ms);
1485 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_direct, &ms_recs[0]));
1486 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_diffuse, &ms_recs[1]));
1487 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_camera, &ms_recs[2]));
1488 OPTIX_CHECK(optixSbtRecordPackHeader(pg_miss_pixel_label, &ms_recs[3]));
1489 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_miss_records), N_ms *
sizeof(MissRecord)));
1490 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_miss_records), ms_recs.data(),
1491 N_ms *
sizeof(MissRecord), cudaMemcpyHostToDevice));
1494 struct alignas(OPTIX_SBT_RECORD_ALIGNMENT) HitRecord {
1495 char header[OPTIX_SBT_RECORD_HEADER_SIZE];
1498 constexpr int N_hg = 4;
1499 std::vector<HitRecord> hg_recs(N_hg);
1500 for (
auto &r : hg_recs) {
1501 r.data.vertices =
reinterpret_cast<float3 *
>(d_patch_vertices);
1502 r.data.UUIDs =
reinterpret_cast<uint32_t *
>(d_patch_UUIDs);
1503 r.data.prim_type = 0;
1505 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_direct, &hg_recs[0]));
1506 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_diffuse, &hg_recs[1]));
1507 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_camera, &hg_recs[2]));
1508 OPTIX_CHECK(optixSbtRecordPackHeader(pg_hit_pixel_label, &hg_recs[3]));
1509 CUDA_CHECK(cudaMalloc(
reinterpret_cast<void **
>(&d_hitgroup_records), N_hg *
sizeof(HitRecord)));
1510 CUDA_CHECK(cudaMemcpy(
reinterpret_cast<void *
>(d_hitgroup_records), hg_recs.data(),
1511 N_hg *
sizeof(HitRecord), cudaMemcpyHostToDevice));
1514 sbt.raygenRecord = d_raygen_records;
1515 sbt.missRecordBase = d_miss_records;
1516 sbt.missRecordStrideInBytes =
static_cast<uint32_t
>(
sizeof(MissRecord));
1517 sbt.missRecordCount = N_ms;
1518 sbt.hitgroupRecordBase = d_hitgroup_records;
1519 sbt.hitgroupRecordStrideInBytes =
static_cast<uint32_t
>(
sizeof(HitRecord));
1520 sbt.hitgroupRecordCount = N_hg;
1523void OptiX8Backend::uploadLaunchParams() {
1524 CUDA_CHECK(cudaMemcpyAsync(
reinterpret_cast<void *
>(d_params), &h_params,
1528void OptiX8Backend::applyLaunchParams(
const RayTracingLaunchParams ¶ms) {
1531 h_params.rays_per_primitive = params.rays_per_primitive;
1532 h_params.random_seed = params.random_seed;
1533 h_params.Nbands_global = params.num_bands_global;
1534 h_params.Nbands_launch = params.num_bands_launch;
1536 h_params.scattering_iteration = params.scattering_iteration;
1537 h_params.specular_reflection_enabled = params.specular_reflection_enabled;
1538 h_params.camera_ID = params.camera_id;
1539 h_params.camera_position = make_float3(params.camera_position.x, params.camera_position.y, params.camera_position.z);
1541 h_params.camera_focal_length = params.camera_focal_length;
1542 h_params.camera_lens_diameter = params.camera_lens_diameter;
1543 h_params.FOV_aspect_ratio = params.camera_fov_aspect;
1544 h_params.camera_HFOV = params.camera_HFOV;
1545 h_params.camera_resolution.x = params.camera_resolution.x;
1546 h_params.camera_resolution.y = params.camera_resolution.y;
1547 h_params.camera_viewplane_length= params.camera_viewplane_length;
1548 h_params.camera_pixel_solid_angle = params.camera_pixel_solid_angle;
1549 h_params.camera_pixel_offset.x = params.camera_pixel_offset.x;
1550 h_params.camera_pixel_offset.y = params.camera_pixel_offset.y;
1551 h_params.camera_resolution_full.x = params.camera_resolution_full.x;
1552 h_params.camera_resolution_full.y = params.camera_resolution_full.y;
1555std::string OptiX8Backend::findDeviceCodeFile()
const {
1558 const std::vector<std::string> candidate_names = {
1559 "OptiX8DeviceCode.optixir",
1560 "OptiX8DeviceCode.ptx",
1562 for (
const auto &name : candidate_names) {
1564 if (!path.empty()) {
1565 return path.string();
1569 "ERROR (OptiX8Backend::findDeviceCodeFile): Could not find OptiX8DeviceCode.optixir or "
1570 "OptiX8DeviceCode.ptx in the radiation plugin asset directory. "
1571 "Ensure the radiation plugin was built with OptiX 8 support (HELIOS_HAVE_OPTIX8).");