CompFUSE · PDoakORNL · Feb 18, 2026 · Feb 20, 2026 · Feb 24, 2026 · Feb 24, 2026
diff --git a/applications/dca/CMakeLists.txt b/applications/dca/CMakeLists.txt
@@ -7,8 +7,9 @@ if (DCA_BUILD_DCA)
   if (DCA_HAVE_GPU)
     target_link_libraries(main_dca PRIVATE ${DCA_KERNEL_LIBS})
   endif()
-  
-  target_link_libraries(main_dca PUBLIC FFTW::Double signals ${DCA_LIBS} dca_io)
+
+  target_link_libraries(main_dca PUBLIC FFTW::Double signals ${DCA_LIBS} dca_io main_parameters)
 
   install(TARGETS main_dca RUNTIME DESTINATION bin)
+
 endif()
diff --git a/applications/dca/main_dca.cpp b/applications/dca/main_dca.cpp
@@ -15,6 +15,7 @@
 #include <iostream>
 
 #include "dca/config/dca.hpp"
+#include "dca/phys/parameters/main_parameters.hpp"
 #include "dca/application/dca_loop_dispatch.hpp"
 #include "dca/config/cmake_options.hpp"
 #include "dca/config/haves_defines.hpp"
@@ -63,17 +64,16 @@ int dca_main(int argc, char** argv) {
           << std::endl;
     }
 
-
     // Create the parameters object from the input file.
     ParametersType parameters(dca::util::GitVersion::string(), concurrency);
     parameters.read_input_and_broadcast<dca::io::JSONReader>(input_file);
-    if(concurrency.id() == concurrency.first())
+    if (concurrency.id() == concurrency.first())
       std::cout << "Input read and broadcast.\n";
     parameters.update_model();
-    if(concurrency.id() == concurrency.first())
+    if (concurrency.id() == concurrency.first())
       std::cout << "Model updated.\n";
     parameters.update_domains();
-    if(concurrency.id() == concurrency.first())
+    if (concurrency.id() == concurrency.first())
       std::cout << "Domains updated.\n";
 
     dca::DistType distribution = parameters.get_g4_distribution();

diff --git a/include/dca/function/function.hpp b/include/dca/function/function.hpp
@@ -408,6 +408,21 @@ class function {
   // These are the linear start and end indexes with respect to the complete function.
   std::size_t start_;
   std::size_t end_;
+
+public:
+#ifdef DEBUG
+  // Read-only debug accessor (inside #ifdef DEBUG): forwards all indices to operator() const.
+  template <typename... Ts>
+  const scalartype& value_at_debug(Ts&&... indices) const {
+    return (*this)(std::forward<Ts>(indices)...);
+  }
+
+  // Non-const overload: forwards all indices to operator() and returns a writable reference.
+  template <typename... Ts>
+  scalartype& value_at_debug(Ts&&... indices) {
+    return (*this)(std::forward<Ts>(indices)...);
+  }
+#endif
 };
 
 template <typename scalartype, class domain, DistType DT>

diff --git a/include/dca/linalg/util/atomic_add_cuda.cu.hpp b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
@@ -21,28 +21,7 @@ namespace dca {
 namespace linalg {
 // dca::linalg::
 
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-// Older devices do not have an hardware atomicAdd for double.
-// See
-// https://stackoverflow.com/questions/12626096/why-has-atomicadd-not-been-implemented-for-doubles
-__device__ double inline atomicAddImpl(double* address, const double val) {
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
-  } while (assumed != old);
-  return __longlong_as_double(old);
-}
-
-__device__ void inline atomicAdd(double* address, const double val) {
-  atomicAddImpl(address, val);
-}
-
-#elif defined(DCA_HAVE_HIP)
+#if defined(DCA_HAVE_HIP)
 // HIP seems to have some horrible problem with concurrent atomic operations.
 __device__ double inline atomicAddImpl(double* address, const double val) {
   unsigned long long int* address_as_ull = (unsigned long long int*)address;
@@ -61,13 +40,12 @@ __device__ double inline atomicAddImpl(float* address, const float val) {
   unsigned long int old = *address_as_int, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_int, assumed,
-                    __float_as_int(val + __int_as_float(assumed)));
+    old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
   } while (assumed != old);
   return __int_as_float(old);
 }
-  
+
 __device__ void inline atomicAdd(float* address, const float val) {
   atomicAddImpl(address, val);
 }
@@ -82,7 +60,7 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
   atomicAddImpl(a_d + 1, val.y);
 }
 
-  __device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
+__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
   double* a_d = reinterpret_cast<double*>(address);
   atomicAddImpl(a_d, val.x);
   atomicAddImpl(a_d + 1, val.y);
@@ -105,12 +83,13 @@ __device__ void inline atomicAdd(float* address, float val) {
 
 __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
   double* a_d = reinterpret_cast<double*>(address);
-  atomicAdd(a_d, val.x);
-  atomicAdd(a_d + 1, val.y);
+  ::atomicAdd(a_d, val.x);  // '::' selects the global-namespace atomicAdd, not a dca::linalg overload
+  ::atomicAdd(a_d + 1, val.y);
 }
+
 #endif  // atomic operation help
 
-}  // linalg
-}  // dca
+}  // namespace linalg
+}  // namespace dca
 
 #endif  // DCA_LINALG_UTIL_ATOMIC_ADD_CUDA_CU_HPP
diff --git a/include/dca/linalg/util/gpu_event.hpp b/include/dca/linalg/util/gpu_event.hpp
@@ -64,7 +64,7 @@ class GpuEvent {
 };
 
 // Returns the elapsed time in seconds between two recorded events. Blocks host.
-float elapsedTime(cudaEvent_t stop, cudaEvent_t start) {
+inline float elapsedTime(cudaEvent_t stop, cudaEvent_t start) {
   checkRC(cudaEventSynchronize(stop));
   float msec(0);
   checkRC(cudaEventElapsedTime(&msec, start, stop));
@@ -91,8 +91,8 @@ class GpuEvent {
 
 #endif  // DCA_HAVE_GPU
 
-}  // util
-}  // linalg
-}  // dca
+}  // namespace util
+}  // namespace linalg
+}  // namespace dca
 
 #endif  // DCA_LINALG_UTIL_GPU_EVENT_HPP
diff --git a/include/dca/linalg/util/util_lapack.hpp b/include/dca/linalg/util/util_lapack.hpp
@@ -60,7 +60,7 @@ inline void checkLapackInfoInternal(int info, std::string function_name, std::st
 #define warnLapackInfo(info) \
   dca::linalg::lapack::util::warnLapackInfoInternal(info, __FUNCTION__, __FILE__, __LINE__)
 inline void warnLapackInfoInternal(int info, std::string function_name, std::string file_name,
-                                    int line) {
+                                   int line) {
   if (info < 0) {
     std::stringstream s;
     s << "Error in function: " << function_name << " (" << file_name << ":" << line << ")"
@@ -69,15 +69,16 @@ inline void warnLapackInfoInternal(int info, std::string function_name, std::str
 
     throw LapackException(s.str(), info);
   }
-  else if (info > 0) {
-    std::cout << "warning lapack info = " << info << " at " << file_name << ":" << line << '\n';
-  }
+#ifndef NDEBUG
+  // else if (info > 0) {
+  //   std::cout << "warning lapack info = " << info << " at " << file_name << ":" << line << '\n';
+  // }
+#endif
 }
 
-
-}  // util
-}  // lapack
-}  // linalg
-}  // dca
+}  // namespace util
+}  // namespace lapack
+}  // namespace linalg
+}  // namespace dca
 
 #endif  // DCA_LINALG_UTIL_UTIL_LAPACK_HPP
diff --git a/include/dca/math/function_transform/hermite_splines/hermite_cubic_spline.hpp b/include/dca/math/function_transform/hermite_splines/hermite_cubic_spline.hpp
@@ -22,8 +22,7 @@ namespace transform {
 // dca::math::transform::
 
 // Empty template declaration.
-template <typename lh_dmn_type, typename rh_dmn_type, ELEMENT_SPACINGS ES, BOUNDARY_CONDITIONS BC,
-          int DIMENSION>
+template <typename lh_dmn_type, typename rh_dmn_type, ELEMENT_SPACINGS ES, BOUNDARY_CONDITIONS BC, int DIMENSION>
 struct hermite_cubic_spline {};
 
 // Template specialization for the equidistant periodic case.
@@ -117,8 +116,98 @@ typename hermite_cubic_spline<lh_dmn_type, rh_dmn_type, EQUIDISTANT, PERIODIC, D
   return result;
 }
 
-}  // transform
-}  // math
-}  // dca
+template <typename lh_dmn_type, typename rh_dmn_type, int DIMENSION>  // EQUIDISTANT/INTERVAL case
+class hermite_cubic_spline<lh_dmn_type, rh_dmn_type, EQUIDISTANT, INTERVAL, DIMENSION> {
+private:
+  typedef typename lh_dmn_type::dmn_specifications_type lh_spec_dmn_type;
+  typedef typename rh_dmn_type::dmn_specifications_type rh_spec_dmn_type;
+  // Scalar types declared by the two domain specifications.
+  typedef typename lh_spec_dmn_type::scalar_type lh_scalar_type;
+  typedef typename rh_spec_dmn_type::scalar_type rh_scalar_type;
+  // Element types declared by the two domain specifications.
+  typedef typename lh_spec_dmn_type::element_type lh_element_type;
+  typedef typename rh_spec_dmn_type::element_type rh_element_type;
+  // execute() returns its value in the left-hand domain's scalar type.
+  typedef lh_scalar_type f_scalar_type;
+
+public:
+  static f_scalar_type execute(int i, int j);  // i: lh element index, j: rh element index
+};
+// Per-axis product of hermite_spline::cubic at the offset of rh element j from lh element i.
+template <typename lh_dmn_type, typename rh_dmn_type, int DIMENSION>
+typename hermite_cubic_spline<lh_dmn_type, rh_dmn_type, EQUIDISTANT, INTERVAL, DIMENSION>::f_scalar_type hermite_cubic_spline<
+    lh_dmn_type, rh_dmn_type, EQUIDISTANT, INTERVAL, DIMENSION>::execute(int i, int j) {
+  const static rh_scalar_type a = -0.5;  // passed as the last argument to hermite_spline::cubic
+
+  lh_element_type x = lh_dmn_type::get_elements()[i];
+  rh_element_type y = rh_dmn_type::get_elements()[j];
+  // Basis matrices of the right-hand domain, read below as M[row + col * DIMENSION].
+  rh_scalar_type* super_basis = rh_dmn_type::get_super_basis();
+
+  rh_scalar_type* inv_basis = rh_dmn_type::get_inverse_basis();
+  rh_scalar_type* inv_super_basis = rh_dmn_type::get_inverse_super_basis();
+
+  rh_element_type delta(DIMENSION, 0.);
+  rh_element_type delta_affine(DIMENSION, 0.);
+
+  f_scalar_type result = 0;
+  // Start from the raw offset delta = y - x.
+  {
+    for (int li = 0; li < DIMENSION; li++)
+      delta[li] = (y[li] - x[li]);
+    // delta_affine = inv_super_basis * delta.
+    {
+      for (int li = 0; li < DIMENSION; li++)
+        delta_affine[li] = 0.;
+
+      for (int li = 0; li < DIMENSION; li++)
+        for (int lj = 0; lj < DIMENSION; lj++)
+          delta_affine[li] += inv_super_basis[li + lj * DIMENSION] * delta[lj];
+    }
+    // Wrap into [-0.5 - 1e-6, 0.5 - 1e-6]; NOTE(review): confirm wrapping is wanted for INTERVAL.
+    for (int li = 0; li < DIMENSION; li++) {
+      while (delta_affine[li] > 0.5 - 1.e-6)
+        delta_affine[li] -= 1.;
+
+      while (delta_affine[li] < -0.5 - 1.e-6)
+        delta_affine[li] += 1.;
+    }
+    // delta = super_basis * delta_affine (the wrapped offset).
+    {
+      for (int li = 0; li < DIMENSION; li++)
+        delta[li] = 0.;
+
+      for (int li = 0; li < DIMENSION; li++)
+        for (int lj = 0; lj < DIMENSION; lj++)
+          delta[li] += super_basis[li + lj * DIMENSION] * delta_affine[lj];
+    }
+    // delta_affine = inv_basis * delta.
+    {
+      for (int li = 0; li < DIMENSION; li++)
+        delta_affine[li] = 0.;
+
+      for (int li = 0; li < DIMENSION; li++)
+        for (int lj = 0; lj < DIMENSION; lj++)
+          delta_affine[li] += inv_basis[li + lj * DIMENSION] * delta[lj];
+    }
+    // Per component: hermite_spline::cubic if strictly inside (-2, 2), otherwise 0; stored in delta.
+    for (int li = 0; li < DIMENSION; li++)
+      delta[li] = (delta_affine[li] > -2. and delta_affine[li] < 2.)
+                      ? hermite_spline::cubic(0., delta_affine[li], 1., a)
+                      : 0.;
+    // Multiply the per-component values together.
+    f_scalar_type t_result = 1;
+    for (int li = 0; li < DIMENSION; li++)
+      t_result *= delta[li];
+
+    result += t_result;
+  }
+
+  return result;
+}
+
+}  // namespace transform
+}  // namespace math
+}  // namespace dca
 
 #endif  // DCA_MATH_FUNCTION_TRANSFORM_HERMITE_SPLINES_HERMITE_CUBIC_SPLINE_HPP
diff --git a/include/dca/math/util/vector_operations.hpp b/include/dca/math/util/vector_operations.hpp
@@ -104,16 +104,49 @@ std::complex<T> innerProduct(const std::vector<std::complex<T>>& x,
   return res;
 }
 
+template <typename T>
+std::complex<T> innerProduct(const std::vector<T>& x, const std::vector<std::complex<T>>& y) {
+  assert(x.size() == y.size());
+  // Real x with complex y: accumulates sum_i x[i] * conj(y[i]).
+  std::complex<T> res(0.);
+  for (std::size_t i = 0; i < x.size(); ++i)
+    res += x[i] * std::conj(y[i]);
+
+  return res;
+}
+
+template <typename T>
+std::complex<T> innerProduct(const std::vector<std::complex<T>>& x, const std::vector<T>& y) {
+  assert(x.size() == y.size());
+  // Complex x with real y: accumulates sum_i x[i] * y[i] (no conj is applied to y).
+  std::complex<T> res(0.);
+  for (std::size_t i = 0; i < x.size(); ++i)
+    res += x[i] * y[i];
+
+  return res;
+}
+
 // Treats scalars as vectors of size 1.
 template <typename T>
 T innerProduct(const T x, const T y) {
   return x * y;
 }
+
 template <typename T>
 std::complex<T> innerProduct(const std::complex<T> x, const std::complex<T> y) {
   return x * std::conj(y);
 }
 
+// template <typename T>
+// std::complex<T> innerProduct(const T x, const std::complex<T> y) {
+//   return x * std::conj(y);
+// }
+
+// template <typename T>
+// std::complex<T> innerProduct(const std::complex<T> x, const T y) {
+//   return x * y;
+// }
+
 // Computes the square of the L^2 norm of the vector x.
 template <typename T>
 auto l2Norm2(const std::vector<T>& x) {