сейчас я пришлю тебе две версии моего алгоритма, о...

Question

сейчас я пришлю тебе две версии моего алгоритма, одну обычную, вторую оптимизированную по скорости, обычная версия работает и в обычном режиме и в режиме transition, оптимизированная версия прекрасно быстро работает в обычном режиме и не работает в режиме transition, твоя задача будет скопировать поведение оптимизированной версии везде где это касается обычного режима но сделать её рабочей и ЧТО ВАЖНО ОПТИМИЗИРОВАННОЙ ТАКЖЕ ДЛЯ TRANSITION режима - для этого надо будет внимательно проанализировать за счёт чего обычная версия остаётся рабочей в transition режиме, и прислать мне полный код целиком оптимизированной по скорости версии которая работает быстро в обоих режимах, причём безопасность/читаемость/требования по памяти/поддерживаемость кода не важны, важна лишь максимальная скорость, важно что ты должен прислать именно код максимального оптимизированного под скорость метода целиком а не просто набор правок, код обычной версии: "static declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { struct TrialPoint { alignas(16) float q[AGP_MAX_STATE_DIM]; float t = 0.0f; float f = FLT_MAX; float x = 0.0f; float y = 0.0f; unsigned idx = 0u; bool feasible = false; }; using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>; constexpr float T_EPS = 1.0e-7f; const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = static_cast<float>(dim); const unsigned fullConstraintIndex = cost.feasible_index(); const bool hasHardConstraints = (fullConstraintIndex != 0u); if (n > 32 || dim > AGP_MAX_STATE_DIM) { bestQ.clear(); bestF = FLT_MAX; bestX = 0.0f; bestY = 0.0f; out_iterations = 0u; out_achieved_epsilon = 0.0f; return; } unsigned exchange_counter = 0u; unsigned exchange_counter_T = 0u; alignas(16) float M_by_span[12]; for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior; float Mmax = M_prior; alignas(16) float phi[AGP_MAX_STATE_DIM]; alignas(16) float s_arr[AGP_MAX_STATE_DIM]; alignas(16) float c_arr[AGP_MAX_STATE_DIM]; alignas(16) float sum_s[AGP_MAX_STATE_DIM]; alignas(16) float sum_c[AGP_MAX_STATE_DIM]; alignas(16) float q_try[AGP_MAX_STATE_DIM]; bestQ.reserve(static_cast<size_t>(dim)); AlignedFloatVec bestQIndexed; bestQIndexed.reserve(static_cast<size_t>(dim)); AlignedFloatVec transitionReferenceSeed; float transitionRefF = FLT_MAX; unsigned transitionRefIdx = 0u; float transitionRefX = 0.0f; float transitionRefY = 0.0f; bool haveTransitionRefSeed = false; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) { const int refDim = cost.full_state_dim(); transitionReferenceSeed.resize(static_cast<size_t>(refDim)); std::memcpy( transitionReferenceSeed.data(), cost.referenceState, static_cast<size_t>(refDim) * sizeof(float)); float xr = 0.0f; float yr = 0.0f; unsigned idxr = 0u; float fr = 0.0f; const bool ref_ok = cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr); transitionRefF = fr; transitionRefIdx = idxr; transitionRefX = xr; transitionRefY = yr; haveTransitionRefSeed = ref_ok; } unsigned bestIndexFound = 0u; float bestIndexValue = FLT_MAX; float bestIndexedX = 0.0f; float bestIndexedY = 0.0f; BestSolutionMsg lastSentBestMsg; InitBestSolutionMsg(lastSentBestMsg); int no_improve = 0; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = b - a; const float initial_len = dmax; const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f); const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f); const float B_dim = fmaf(A_dim, 0.7f, 0.0f); const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f); const float log_argument = A_dim - 2.03f; const float log_argument__ = A_dim__ - 2.0f; const float C_dim = fmaf( log_argument, fmaf( log_argument, fmaf( log_argument, fmaf( log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float C_dim__ = fmaf( log_argument__, fmaf( log_argument__, fmaf( log_argument__, fmaf( log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__; const float adaptive_coeff_addition = fmaf( C_dim, fmaf( C_dim, fmaf( C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float adaptive_coeff_addition__ = fmaf( C_dim__, fmaf( C_dim__, fmaf( C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__; int it = 0; int stag_boost_remaining = 0; float stag_r_multiplier = 0.0f; int last_send_T = 0; const int send_interval_T = 7; int last_send_best = 0; const int send_interval_best = 2; const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)); const int noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f)); const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params; auto clamp01 = [](float v) noexcept -> float { if (v < 0.0f) return 0.0f; if (v > 1.0f) return 1.0f; return v; }; auto lerp_hj = [](float a0, float a1, float t) noexcept -> float { return fmaf(t, a1 - a0, a0); }; auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out) noexcept { period_out = hj_cfg.iteration_period; phase_out = hj_cfg.iteration_phase; int count = hj_cfg.iteration_schedule_count; if (count <= 0) return; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { period_out = rule.period; phase_out = rule.phase; break; } } if (period_out < 1) period_out = 1; phase_out %= period_out; if (phase_out < 0) phase_out += period_out; }; auto select_hj_coord_profile = [&](float progress) noexcept -> AGPHJCoordScaleProfile { int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); if (count > 0) { for (int i = 0; i < count; ++i) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) return rule.profile; } } AGPHJCoordScaleProfile fallback{}; fallback.angle_scale_proximal = hj_cfg.angle_scale_first; fallback.angle_scale_distal = hj_cfg.angle_scale_last; fallback.length_scale_proximal = hj_cfg.length_scale_first; fallback.length_scale_distal = hj_cfg.length_scale_last; return fallback; }; auto select_hj_order_mode = [&](float progress) noexcept -> int { int order_mode = hj_cfg.order_mode; int count = hj_cfg.order_schedule_count; if (count <= 0) return order_mode; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { order_mode = rule.order_mode; break; } } return order_mode; }; auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile) noexcept -> float { if (d < n) { const float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01); } if (cost.variableLen) { const int j = d - n; const float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01); } return 1.0f; }; alignas(16) float hj_delta[AGP_MAX_STATE_DIM]; alignas(16) float hj_profile_scale_applied[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_natural[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_reverse[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_interleaved_proximal[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_interleaved_distal[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_lengths_first[AGP_MAX_STATE_DIM]; { int pos = 0; while (pos < dim) { hj_order_natural[pos] = pos; ++pos; } pos = 0; int d = dim - 1; while (d >= 0) { hj_order_reverse[pos] = d; ++pos; --d; } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; hj_order_interleaved_proximal[pos] = n + j; ++pos; ++j; } } else { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; ++j; } } pos = 0; if (cost.variableLen) { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; hj_order_interleaved_distal[pos] = n + j; ++pos; --j; } } else { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; --j; } } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_lengths_first[pos] = n + j; ++pos; ++j; } } int j = 0; while (j < n) { hj_order_lengths_first[pos] = j; ++pos; ++j; } } auto select_hj_order_ptr = [&](const int order_mode) noexcept -> const int* { switch (order_mode) { case AGP_HJ_ORDER_NATURAL: return hj_order_natural; case AGP_HJ_ORDER_REVERSE: return hj_order_reverse; case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL: return hj_order_interleaved_proximal; case AGP_HJ_ORDER_LENGTHS_FIRST: return hj_order_lengths_first; default: return hj_order_interleaved_distal; } }; int hj_profile_cache_key = INT_MIN; auto apply_hj_coord_profile = [&](const float progress) noexcept { AGPHJCoordScaleProfile profile{}; int key = -2; int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; if (count > 0) { int i = 0; while (i < count) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (progress < rule.progress_lt || i + 1 == count) { profile = rule.profile; key = i; break; } ++i; } } else { profile.angle_scale_proximal = hj_cfg.angle_scale_first; profile.angle_scale_distal = hj_cfg.angle_scale_last; profile.length_scale_proximal = hj_cfg.length_scale_first; profile.length_scale_distal = hj_cfg.length_scale_last; } if (key == hj_profile_cache_key) return; int d = 0; #pragma loop ivdep while (d < dim) { const float new_scale = fmaxf(coord_profile_scale(d, profile), 1.0e-12f); const float old_scale = fmaxf(hj_profile_scale_applied[d], 1.0e-12f); if (fabsf(new_scale - old_scale) > 1.0e-12f) { hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta); hj_profile_scale_applied[d] = new_scale; } ++d; } hj_profile_cache_key = key; }; { const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f); for (int d = 0; d < dim; ++d) { const float span = fmaf(map.high[d], 1.0f, -map.low[d]); const float scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1.0e-12f); hj_profile_scale_applied[d] = scale; hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs); } } auto clampPoint = [&](float* q) noexcept { int i = 0; #pragma loop ivdep while (i < n) { float lo = (i == 0) ? -1.0471975511965977f : -2.6179938779914944f; float hi = 2.6179938779914944f; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && i < cost.referenceAngleCount) { const float ref = cost.referenceState[i]; const float lo_ref = ref - cost.transitionMaxAngleStep; const float hi_ref = ref + cost.transitionMaxAngleStep; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } if (cost.use_joint_angle_limit_constraints()) { const float hard = fabsf(cost.minTheta); const float lo_hard = -hard; const float hi_hard = hard; if (lo_hard > lo) lo = lo_hard; if (hi_hard < hi) hi = hi_hard; } if (q[i] < lo) q[i] = lo; else if (q[i] > hi) q[i] = hi; if (cost.solveMode == ManipSolveMode::Transition && cost.transitionPreserveJointMagnitudeFloor && cost.has_transition_reference_angles() && i < cost.referenceAngleCount) { const float ref = cost.referenceState[i]; if (fabsf(ref) > cost.transitionJointSignDeadband) { const float min_abs = cost.min_joint_abs_angle_bound(i); if (fabsf(q[i]) < min_abs) { q[i] = copysignf(min_abs, ref); if (q[i] < lo) q[i] = lo; else if (q[i] > hi) q[i] = hi; } } } ++i; } if (cost.variableLen) { i = 0; #pragma loop ivdep while (i < n) { float lo = fmaf(1.0f / cost.stretchFactor, cost.fixedLength, 0.0f); float hi = fmaf(cost.stretchFactor, cost.fixedLength, 0.0f); if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_lengths() && i < cost.referenceLengthCount) { const float refL = cost.referenceState[n + i]; const float dL = cost.max_length_step(i); const float lo_ref = refL - dL; const float hi_ref = refL + dL; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } if (q[n + i] < lo) q[n + i] = lo; else if (q[n + i] > hi) q[n + i] = hi; ++i; } } }; auto t_to_idx = [&](float t) -> unsigned long long { return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); }; auto finalize_interval_geometry = [&](IntervalND* I) noexcept { I->i1 = t_to_idx(I->x1); I->i2 = t_to_idx(I->x2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); }; auto update_pockets_and_Mmax = [&](IntervalND* I) noexcept { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; auto recompute_dmax = & noexcept { float new_dmax = 0.0f; for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > new_dmax) new_dmax = Ls; } dmax = (new_dmax > 0.0f) ? new_dmax : initial_len; }; auto recompute_heap_constM = [&](float m_cur) noexcept { const size_t sz = H.size(); if (sz == 0u) return; if (!hasHardConstraints) RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur); else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur); std::make_heap(H.begin(), H.end(), ComparePtrND); }; auto update_best_from_trial = [&](const TrialPoint& tr) noexcept { if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) { bestIndexFound = tr.idx; bestIndexValue = tr.f; agp_store_q(bestQIndexed, tr.q, dim); bestIndexedX = tr.x; bestIndexedY = tr.y; } if (tr.feasible) { if (tr.f < bestF) { bestF = tr.f; agp_store_q(bestQ, tr.q, dim); bestX = tr.x; bestY = tr.y; no_improve = 0; } else { ++no_improve; } } }; auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io, float t_lo, float t_hi, bool enforce_t_bounds) noexcept { if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f))) return; const float f_start = f_io; const float c1 = 1e-4f; const float tau = 0.5f; const int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p)); const int max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p)); const float lbfgs_trigger = 0.6f; const int m_lbfgs = 9; const int max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p)); const float eps_lbfgs_curv = 1e-6f; const float eps_descent = 1e-12f; float eta = 2.0f / sqrtf(dim_f); auto computeGrad = [&](const float* q_in, const float x_in, const float y_in, float* grad_out, float& grad_norm2_out) noexcept { float phi_acc_local = 0.0f; int ii = 0; #pragma loop ivdep while (ii < n) { phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local); phi[ii] = phi_acc_local; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float acc_s = 0.0f; float acc_c = 0.0f; int kk = n - 1; while (kk >= 0) { const float Lk = cost.link_length(q_in, kk); acc_s = fmaf(Lk, s_arr[kk], acc_s); acc_c = fmaf(Lk, c_arr[kk], acc_c); sum_s[kk] = acc_s; sum_c[kk] = acc_c; --kk; } alignas(16) float prefix_tail_grad[AGP_MAX_STATE_DIM]{}; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta[AGP_MAX_STATE_DIM]{}; float phi_prefix = 0.0f; int jj = 0; #pragma loop ivdep while (jj < cost.referenceAngleCount) { phi_prefix += q_in[jj]; prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta[jj]; prefix_tail_grad[jj] = tail; --jj; } } const float dx = fmaf(x_in, 1.0f, -cost.targetX); const float dy = fmaf(y_in, 1.0f, -cost.targetY); const float dist2 = fmaf(dx, dx, dy * dy); const float dist = sqrtf(dist2); const float inv_dist = (dist > 1.0e-20f) ? (1.0f / dist) : 0.0f; grad_norm2_out = 0.0f; int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; float g_main = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { const float dtheta = (i < cost.referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f; gpen = 2.0f * cost.transitionEnergyWeight * dtheta; gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen); g_main = 2.0f * cost.transitionCaptureWeight * fmaf(dx, -sum_s[i], dy * sum_c[i]); } else { { const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf( exp_arg, fmaf( exp_arg, fmaf( exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen); } g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist; } const float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { float gi = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { float gpenL = 0.0f; if (cost.has_transition_reference_lengths() && j < cost.referenceLengthCount) { gpenL = 2.0f * cost.transitionLengthEnergyWeight * (q_in[n + j] - cost.referenceState[n + j]); } gi = 2.0f * cost.transitionCaptureWeight * fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL; } else { gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist; } grad_out[n + j] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++j; } } }; auto armijoLineSearch = [&](const float* q_base, const float f_base, const float* dir, const float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, unsigned& idx_out) noexcept -> bool { float alpha = alpha_io; int backtrack = 0; while (backtrack < max_backtrack) { int i = 0; #pragma loop ivdep while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; } clampPoint(q_out); float x2 = 0.0f; float y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; const bool feasible_try = cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try); if (!feasible_try || !(val_try == val_try)) { alpha = tau; ++backtrack; continue; } const float t_try = clamp01(map.pointToT(q_out)); if (enforce_t_bounds) { if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha = tau; ++backtrack; continue; } } else { if (fabsf(t_try - t_lo) > T_EPS) { alpha = tau; ++backtrack; continue; } } } if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; } alpha = tau; ++backtrack; } return false; }; bool lbfgs_already_tried = false; int outer = 0; while (outer < max_outer_iters) { float grad[AGP_MAX_STATE_DIM]; float grad_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, grad, grad_norm2); if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f) break; float dir_gd[AGP_MAX_STATE_DIM]; int i = 0; #pragma loop ivdep while (i < dim) { dir_gd[i] = -grad[i]; ++i; } const float gtd_gd = -grad_norm2; float eta_trial = eta; float f_new = f_io; float x_new = x_io; float y_new = y_io; unsigned idx_new = idx_io; const bool found = armijoLineSearch( q_inout, f_io, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, idx_new); if (!found) break; std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_new; x_io = x_new; y_io = y_new; idx_io = idx_new; eta = eta_trial; const float rel_impr = (f_start > 1.0e-20f) ? ((f_start - f_io) / f_start) : 0.0f; if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) { lbfgs_already_tried = true; float q_resume[AGP_MAX_STATE_DIM]; std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float)); const float f_resume = f_io; const float x_resume = x_io; const float y_resume = y_io; const unsigned idx_resume = idx_io; const float eta_resume = eta; float q_best_lbfgs[AGP_MAX_STATE_DIM]; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); float f_best_lbfgs = f_io; float x_best_lbfgs = x_io; float y_best_lbfgs = y_io; unsigned idx_best_lbfgs = idx_io; float s_hist[m_lbfgs][AGP_MAX_STATE_DIM]; float y_hist[m_lbfgs][AGP_MAX_STATE_DIM]; float rho_hist[m_lbfgs]; float alpha_hist[m_lbfgs]; int hist_size = 0; float gk[AGP_MAX_STATE_DIM]; float gk_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); bool lbfgs_ok = true; float alpha_k = 2.0f / sqrtf(dim_f); int it_lbfgs = 0; while (it_lbfgs < max_lbfgs_iters) { if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f) break; float dir[AGP_MAX_STATE_DIM]; if (hist_size == 0) { int d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -gk[d]; ++d; } } else { float q_vec[AGP_MAX_STATE_DIM]; int d = 0; #pragma loop ivdep while (d < dim) { q_vec[d] = gk[d]; ++d; } for (int jj = hist_size - 1; jj >= 0; --jj) { float dot_sq = 0.0f; d = 0; #pragma loop ivdep while (d < dim) { dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq); ++d; } const float a_coeff = dot_sq * rho_hist[jj]; alpha_hist[jj] = a_coeff; d = 0; #pragma loop ivdep while (d < dim) { q_vec[d] = fmaf(-a_coeff, y_hist[jj][d], q_vec[d]); ++d; } } float gamma = 1.0f; { const int last = hist_size - 1; float yy = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { yy = fmaf(y_hist[last][d], y_hist[last][d], yy); ++d; } const float ys = 1.0f / rho_hist[last]; if (yy > 0.0f) gamma = ys / yy; } float r_vec[AGP_MAX_STATE_DIM]; d = 0; #pragma loop ivdep while (d < dim) { r_vec[d] = gamma * q_vec[d]; ++d; } for (int jj = 0; jj < hist_size; ++jj) { float dot_yr = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr); ++d; } const float b_coeff = dot_yr * rho_hist[jj]; const float coeff = alpha_hist[jj] - b_coeff; d = 0; #pragma loop ivdep while (d < dim) { r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]); ++d; } } d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -r_vec[d]; ++d; } } float gtd = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { gtd = fmaf(gk[d], dir[d], gtd); ++d; } if (!(gtd == gtd) || gtd >= -eps_descent) { d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -gk[d]; ++d; } gtd = -gk_norm2; if (gtd >= -eps_descent) { lbfgs_ok = false; break; } } float q_old[AGP_MAX_STATE_DIM]; std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float)); float g_old[AGP_MAX_STATE_DIM]; std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float)); float alpha_try = alpha_k; float f_try = f_io; float x_try = x_io; float y_try = y_io; unsigned idx_try = idx_io; const bool step_ok = armijoLineSearch( q_inout, f_io, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, idx_try); if (!step_ok) { lbfgs_ok = false; break; } std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_try; x_io = x_try; y_io = y_try; idx_io = idx_try; alpha_k = alpha_try; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; } if (f_io < f_best_lbfgs) { f_best_lbfgs = f_io; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x_io; y_best_lbfgs = y_io; idx_best_lbfgs = idx_io; } float ys = 0.0f; float s_new[AGP_MAX_STATE_DIM]; float y_new[AGP_MAX_STATE_DIM]; d = 0; #pragma loop ivdep while (d < dim) { const float sd = q_inout[d] - q_old[d]; const float yd = gk[d] - g_old[d]; s_new[d] = sd; y_new[d] = yd; ys = fmaf(yd, sd, ys); ++d; } if (!(ys == ys) || ys <= eps_lbfgs_curv) { hist_size = 0; } else { if (hist_size < m_lbfgs) { const int idx_hist = hist_size; rho_hist[idx_hist] = 1.0f / ys; d = 0; #pragma loop ivdep while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } ++hist_size; } else { for (int jj = 0; jj < m_lbfgs - 1; ++jj) { rho_hist[jj] = rho_hist[jj + 1]; d = 0; #pragma loop ivdep while (d < dim) { s_hist[jj][d] = s_hist[jj + 1][d]; y_hist[jj][d] = y_hist[jj + 1][d]; ++d; } } const int idx_hist = m_lbfgs - 1; rho_hist[idx_hist] = 1.0f / ys; d = 0; #pragma loop ivdep while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } hist_size = m_lbfgs; } } if (gk_norm2 < 1e-12f) break; ++it_lbfgs; } if (lbfgs_ok) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; break; } else { if (f_best_lbfgs < f_resume) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; } else { std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float)); f_io = f_resume; x_io = x_resume; y_io = y_resume; idx_io = idx_resume; } eta = eta_resume; } } ++outer; } }; auto evaluate_trial_from_q = [&](const float q_seed, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { TrialPoint tr{}; std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float)); tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f); if (tr.feasible) { refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds); } tr.t = clamp01(map.pointToT(tr.q)); if (enforce_t_bounds) { if (tr.t < t_lo) tr.t = t_lo; if (tr.t > t_hi) tr.t = t_hi; } update_best_from_trial(tr); return tr; }; auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { alignas(16) float q_seed[AGP_MAX_STATE_DIM]; map.map01ToPoint(t_base, q_seed); return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds); }; auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp) noexcept -> IntervalND { if (!(Lp.t + T_EPS < Rp.t)) return nullptr; IntervalND I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx); finalize_interval_geometry(I); return I; }; auto find_container_index = [&](float t) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; for (size_t i = 0u; i < H.size(); ++i) { IntervalND I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } if (best_pos >= 0) return best_pos; for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } return best_pos; }; auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff_) noexcept { if (H.empty()) return; const int pos = find_container_index(tr.t); if (pos < 0) return; IntervalND* src = H[static_cast<size_t>(pos)]; const float src_len = fmaf(src->x2, 1.0f, -src->x1); if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) return; IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto find_container_index_bounds = [&](const float t, float& lo, float& hi) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; size_t i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } if (best_pos < 0) { i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } } if (best_pos >= 0) { const IntervalND* const I = H[static_cast<size_t>(best_pos)]; lo = I->x1; hi = I->x2; } else { lo = 0.0f; hi = 1.0f; } return best_pos; }; auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, const int pos, float r_eff_cur, float adaptive_coeff_) noexcept { if (pos < 0 || static_cast<size_t>(pos) >= H.size()) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_); return; } IntervalND* const src = H[static_cast<size_t>(pos)]; if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_); return; } const float src_len = fmaf(src->x2, 1.0f, -src->x1); IntervalND* const L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* const Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto progress_outgoing = & noexcept { for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) { if (itReq->req.test()) itReq = g_pendingMulti.erase(itReq); else ++itReq; } for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) { if (itReq->req.test()) itReq = g_pendingBest.erase(itReq); else ++itReq; } }; auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y) noexcept -> bool { auto take_indexed = & noexcept -> bool { if (bestQIndexed.empty()) return false; seed_vec = &bestQIndexed; seed_f = bestIndexValue; seed_idx = bestIndexFound; seed_x = bestIndexedX; seed_y = bestIndexedY; return true; }; auto take_feasible = & noexcept -> bool { if (bestQ.empty()) return false; seed_vec = &bestQ; seed_f = bestF; seed_idx = fullConstraintIndex; seed_x = bestX; seed_y = bestY; return true; }; auto take_reference = & noexcept -> bool { if (!haveTransitionRefSeed) return false; seed_vec = &transitionReferenceSeed; seed_f = transitionRefF; seed_idx = transitionRefIdx; seed_x = transitionRefX; seed_y = transitionRefY; return true; }; seed_vec = nullptr; seed_f = FLT_MAX; seed_idx = 0u; seed_x = 0.0f; seed_y = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) return take_feasible() || take_reference() || take_indexed(); return take_indexed() || take_feasible(); }; auto should_run_hj_this_iter = [&](float progress) noexcept -> bool { if (!hj_cfg.enabled) return false; if (hj_cfg.require_adaptive_flag && !adaptive) return false; int period = 1; int phase = 0; select_hj_iteration_schedule(progress, period, phase); if (period <= 1) return true; return (it % period) == phase; }; auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff_, float progress) noexcept { if (H.empty()) return; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return; TrialPoint base{}; std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float)); base.t = clamp01(map.pointToT(base.q)); base.f = seedF; base.idx = seedIdx; base.x = seedX; base.y = seedY; base.feasible = (base.idx == fullConstraintIndex); auto evaluate_hj_candidate = [&](const float* q_candidate) noexcept -> TrialPoint { const float t_guess = clamp01(map.pointToT(q_candidate)); float lo = 0.0f; float hi = 1.0f; const int pos = find_container_index_bounds(t_guess, lo, hi); const TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true); inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff_); return tr; }; apply_hj_coord_profile(progress); const int hj_order_mode = select_hj_order_mode(progress); const int* const hj_order = select_hj_order_ptr(select_hj_order_mode(progress)); TrialPoint current = base; TrialPoint base_before = base; bool improved_any = false; int ord = 0; while (ord < dim) { const int d = hj_order[ord]; const float step = hj_delta[d]; const TrialPoint axis_base = current; auto try_direction = [&](int sgn, TrialPoint& accepted_tp) noexcept -> bool { float q_candidate[AGP_MAX_STATE_DIM]; std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float)); q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]); clampPoint(q_candidate); if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1.0e-12f) return false; const TrialPoint tp = evaluate_hj_candidate(q_candidate); if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) { accepted_tp = tp; return true; } return false; }; const int first_sign = hj_cfg.plus_first ? +1 : -1; const int second_sign = -first_sign; bool improved_this_axis = false; TrialPoint best_axis_tp{}; bool have_axis_tp = false; TrialPoint tp_try{}; if (try_direction(first_sign, tp_try)) { best_axis_tp = tp_try; have_axis_tp = true; if (hj_cfg.opportunistic_axis) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (!hj_cfg.opportunistic_axis || !have_axis_tp) { if (try_direction(second_sign, tp_try)) { if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) { best_axis_tp = tp_try; have_axis_tp = true; } } if (have_axis_tp) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (improved_this_axis && hj_cfg.expand_delta_on_success) { hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor); } ++ord; } if (improved_any) { if (hj_cfg.use_pattern) { float q_pattern[AGP_MAX_STATE_DIM]; for (int d = 0; d < dim; ++d) q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]); clampPoint(q_pattern); TrialPoint pat_tp = evaluate_hj_candidate(q_pattern); if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f)) current = pat_tp; } } else { for (int d = 0; d < dim; ++d) hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta); } }; auto ensure_queue_nonempty = [&](float r_eff_cur) noexcept -> bool { if (!H.empty()) return true; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return false; (void)seedF; (void)seedX; (void)seedY; (void)seedIdx; const float t_center = clamp01(map.pointToT(seedQ->data())); const float half = fmaxf(1.0e-4f * dim_f, 1.0e-6f); const float t1b = clamp01(t_center - half); const float t2b = clamp01(t_center + half); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) return false; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); recompute_dmax(); return true; }; while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg dummy; g_world->recv(boost::mpi::any_source, 2, dummy); } const int seedStride = AGP_MAX_SEED_STRIDE; AlignedFloatVec seeds(static_cast<size_t>(AGP_MAX_GENERATED_SEEDS) * static_cast<size_t>(seedStride), 0.0f); const int seedCnt = generate_heuristic_seeds( cost, map, dim, seeds.data(), seedStride, static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()), 7919.0f, static_cast<float>(seed)))); const int K_raw = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)); const int K = (K_raw > 2) ? K_raw : 2; H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u); const int rank = g_world->rank(); const int world = g_world->size(); const size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1)); for (int i = 0; i < seedCnt; ++i) { const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride); const float t_seed = clamp01(map.pointToT(s)); const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf( fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f( (1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed)); const float t2b = clamp01(fmaf(interval_size, 1.0f, t_seed)); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) { I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); } else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf( start_mult, exp2f( (1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } std::vector<float> grid_base; grid_base.reserve(static_cast<size_t>(K) + 2u); grid_base.emplace_back(a); for (int k = 1; k <= K; ++k) { const float t = clamp01( fmaf( fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)))); grid_base.emplace_back(t); } grid_base.emplace_back(b); std::vector<TrialPoint> grid_trials(grid_base.size()); for (size_t i = 0u; i < grid_base.size(); ++i) { float lo = grid_base[i]; float hi = grid_base[i]; if (i == 0u) { lo = hi = a; } else if (i + 1u == grid_base.size()) { lo = hi = b; } else { lo = 0.5f * (grid_base[i - 1u] + grid_base[i]); hi = 0.5f * (grid_base[i] + grid_base[i + 1u]); } grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true); } for (size_t i = 1u; i < grid_trials.size(); ++i) { IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); if (haveTransitionRefSeed) { TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false); if (!H.empty()) { inject_trial_into_queue(ref_tp, r, 1.0f); recompute_dmax(); } else { ensure_queue_nonempty(r); } } g_world->barrier(); while (true) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); stag_r_multiplier = 1.4f - 1.1f * fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f); const float p_arg = fmaf(p, 2.3f, -2.9775f); float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; --stag_boost_remaining; } const float exp_arg = fmaf(B_dim, p, 0.0f); const float exp_arg__ = fmaf(B_dim__, p, 0.0f); adaptive_coeff = fmaf( -fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); const float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)); const float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)); const float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)); const float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)); const float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)); const float xx = p * p; const float tt = fmaf(500.0f, p, -486.95472f); const float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf( fmaf( fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f); adaptive_coeff__ = fmaf( fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__); float grad_norm2_best = 0.0f; if (no_improve > 0 && !bestQ.empty()) { float acc_best = 0.0f; float phi_best[AGP_MAX_STATE_DIM]; int ii_best = 0; #pragma loop ivdep while (ii_best < n) { acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best); phi_best[ii_best] = acc_best; ++ii_best; } float s_best[AGP_MAX_STATE_DIM]; float c_best[AGP_MAX_STATE_DIM]; FABE13_SINCOS(phi_best, s_best, c_best, n); float as_best = 0.0f; float ac_best = 0.0f; float sum_s_best[AGP_MAX_STATE_DIM]; float sum_c_best[AGP_MAX_STATE_DIM]; int k_best = n - 1; while (k_best >= 0) { const float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; } alignas(16) float prefix_tail_best[AGP_MAX_STATE_DIM]{}; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta_best[AGP_MAX_STATE_DIM]{}; float phi_pref = 0.0f; int jj = 0; #pragma loop ivdep while (jj < cost.referenceAngleCount) { phi_pref += bestQ[jj]; prefix_delta_best[jj] = ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta_best[jj]; prefix_tail_best[jj] = tail; --jj; } } const float dx_best = fmaf(bestX, 1.0f, -cost.targetX); const float dy_best = fmaf(bestY, 1.0f, -cost.targetY); const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)); const float inv_dist_best = (dist_best > 1.0e-20f) ? (1.0f / dist_best) : 0.0f; int i_best = 0; #pragma loop ivdep while (i_best < n) { float gpen_best = 0.0f; float g_main_best = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { const float dtheta_best = (i_best < cost.referenceAngleCount) ? cost.wrapped_angle_delta(bestQ.data(), i_best) : 0.0f; gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best; gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_best[i_best], gpen_best); g_main_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]); } else { { const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f); const float exp_arg_g = -tsg; const float exp_val = fmaf( exp_arg_g, fmaf( exp_arg_g, fmaf( exp_arg_g, fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best); } g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best; } const float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best; } if (cost.variableLen) { int j_best = 0; #pragma loop ivdep while (j_best < n) { float gi_best = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { float gpenL_best = 0.0f; if (cost.has_transition_reference_lengths() && j_best < cost.referenceLengthCount) { gpenL_best = 2.0f * cost.transitionLengthEnergyWeight * (bestQ[n + j_best] - cost.referenceState[n + j_best]); } gi_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) + gpenL_best; } else { gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best; } grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++j_best; } } } const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f); const float r_eff = (dim > 2) ? fmaf( -fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf( -fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f); if (stagnation) { stag_boost_remaining = n_stag_iters; float t_seeds[32]; int seed_count = 0; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) { alignas(16) float q_center[AGP_MAX_STATE_DIM]; std::memcpy(q_center, cost.referenceState, static_cast<size_t>(dim) * sizeof(float)); t_seeds[seed_count++] = clamp01(map.pointToT(q_center)); if (!bestQ.empty() && seed_count < 32) t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data())); unsigned st_local = seed + static_cast<unsigned>(it) + 911u; const int local_variants = 2 + static_cast<int>(sqrtf(dim_f)); for (int v = 0; v < local_variants && seed_count < 32; ++v) { float q_temp[AGP_MAX_STATE_DIM]; std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float)); st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd0 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int aidx = v % n; q_temp[aidx] = fmaf((2.0f * rnd0 - 1.0f), 0.25f * cost.transitionMaxAngleStep, q_temp[aidx]); if (cost.variableLen && cost.has_transition_reference_lengths()) { st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd1 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int lidx = v % n; q_temp[n + lidx] = fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]); } clampPoint(q_temp); t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else { const int num_ik = 1 + static_cast<int>(sqrtf(dim_f)); float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY); float max_reach = 0.0f; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) max_reach += map.high[n + ii]; } else { max_reach = static_cast<float>(n) * cost.fixedLength; } const float ratio = dist_to_target / fmaxf(max_reach, 1.0e-20f); const bool prefer_extended = (ratio > 0.7f); const bool prefer_compact = (ratio < 0.4f); const bool use_ik = !(ratio > 0.4f && ratio < 0.7f); if (!use_ik) { AlignedFloatVec temp_S(static_cast<size_t>(AGP_MAX_GENERATED_SEEDS) * static_cast<size_t>(AGP_MAX_SEED_STRIDE), 0.0f); const int sobol_gen = generate_sobol_seeds_limited( map, dim, temp_S.data(), AGP_MAX_SEED_STRIDE, seed + static_cast<unsigned>(it), AGP_MAX_GENERATED_SEEDS); const int num_sobol = num_ik; for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) { const float* s = temp_S.data() + static_cast<size_t>(kk) * static_cast<size_t>(AGP_MAX_SEED_STRIDE); t_seeds[seed_count++] = clamp01(map.pointToT(s)); } } else { float angles_ccd[32] = { 0.0f }; float lengths_ccd[32]; if (cost.variableLen) { const float len_low = map.low[n]; const float len_high = map.high[n]; const float avg_len = 0.5f * (len_low + len_high); for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = avg_len; } else { for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = cost.fixedLength; } ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10); float angles_fabrik[32] = { 0.0f }; float lengths_fabrik[32]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = lengths_ccd[ii]; } else { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = cost.fixedLength; } float targetX_fab = cost.targetX; float targetY_fab = cost.targetY; for (int iter_fab = 0; iter_fab < 3; ++iter_fab) { float prevX = targetX_fab; float prevY = targetY_fab; for (int j = n - 1; j >= 0; --j) { const float len = lengths_fabrik[j]; const float angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val; float c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; } } if (prefer_extended) { { float q_ccd[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_ccd[ii] = angles_ccd[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_ccd[n + ii] = lengths_ccd[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else if (prefer_compact) { { float q_fabrik[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_fabrik[ii] = angles_fabrik[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_fabrik[n + ii] = lengths_fabrik[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } } } for (int s = 0; s < seed_count; ++s) { const float t_center = t_seeds[s]; const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f); const float t1b = clamp01(t_center - 0.5f * interval_size); const float t2b = clamp01(t_center + 0.5f * interval_size); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); no_improve = 0; } const float bestFOld = bestF; const bool local_idle = !ensure_queue_nonempty(r_eff); const bool do_hj_this_iter = should_run_hj_this_iter(p); if (!local_idle) { if (do_hj_this_iter) { run_hooke_jeeves_iteration(r_eff, adaptive_coeff_, p); } else { std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); const float tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2); const TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true); IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx); IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float Mloc = fmaxf(L->M, Rv->M); const float prevMmax = Mmax; update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); m = fmaf(r_eff, Mmax, 0.0f); const float len1 = fmaf(trNew.t, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -trNew.t); if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) { dmax = fmaxf(len1, len2); for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / fmaxf(dmax, 1.0e-20f)) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); if (sz > 0u) { if (!hasHardConstraints) RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m); } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); delete cur; } } if (!H.empty()) { mm_prefetch(reinterpret_cast<const char*>(H[0]), MM_HINT_T0); if (H.size() > 1u) mm_prefetch(reinterpret_cast<const char*>(H[1]), MM_HINT_T0); } if (!H.empty() && (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff, 0.0f)) && it - last_send_T >= send_interval_T) { last_send_T = it; progress_outgoing(); const size_t scan_count = H.size(); unsigned intervals_to_send = (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u; if (intervals_to_send > scan_count) intervals_to_send = static_cast<unsigned>(scan_count); if (intervals_to_send > 0u) { const float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f)); const float beta = 1.0f - alpha; const float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f); const float w_size = 5.085f - w_pos; const int num_bins = (p < 1.0f - 1.0e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1; std::vector<float> pos_metric(scan_count); std::vector<float> size_metric(scan_count); std::vector<float> R_val(scan_count); float R_max = -FLT_MAX; float R_min = FLT_MAX; float size_max = -FLT_MAX; float size_min = FLT_MAX; float pos_max = -FLT_MAX; float pos_min = FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { IntervalND* I = H[idx]; const float center = 0.5f * (I->x1 + I->x2); const float len = fmaf(I->x2, 1.0f, -I->x1); const float size = len * static_cast<float>(1 << I->span_level); if (p < 0.95f && num_bins > 1) { float bin = floorf(center * static_cast<float>(num_bins)); if (bin >= static_cast<float>(num_bins)) bin = static_cast<float>(num_bins - 1); pos_metric[idx] = bin / static_cast<float>(num_bins - 1); } else { pos_metric[idx] = center; } size_metric[idx] = size; R_val[idx] = I->R; if (pos_metric[idx] > pos_max) pos_max = pos_metric[idx]; if (pos_metric[idx] < pos_min) pos_min = pos_metric[idx]; if (size > size_max) size_max = size; if (size < size_min) size_min = size; if (I->R > R_max) R_max = I->R; if (I->R < R_min) R_min = I->R; } auto norm = [](float val, float minv, float maxv) noexcept -> float { const float denom = maxv - minv; if (fabsf(denom) <= 1.0e-20f) return 0.0f; return (val - minv) / denom; }; std::vector<float> R_norm(scan_count); std::vector<float> pos_norm(scan_count); std::vector<float> size_norm(scan_count); for (size_t idx = 0u; idx < scan_count; ++idx) { R_norm[idx] = norm(R_val[idx], R_min, R_max); pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max); size_norm[idx] = norm(size_metric[idx], size_min, size_max); } std::vector<int> selected; selected.reserve(intervals_to_send); std::vector<unsigned char> used(scan_count, 0u); selected.push_back(0); used[0] = true; while (selected.size() < intervals_to_send) { int best_idx = -1; float best_score = -FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { if (used[idx] != 0u) continue; float min_dist = FLT_MAX; for (int s : selected) { float d2 = 0.0f; float d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)]; d2 += w_pos * d * d; d = size_norm[idx] - size_norm[static_cast<size_t>(s)]; d2 += w_size * d * d; if (d2 < min_dist) min_dist = d2; } const float novelty = sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1.0e-20f)); const float score = alpha * R_norm[idx] + beta * novelty; if (score > best_score) { best_score = score; best_idx = static_cast<int>(idx); } } if (best_idx < 0) break; selected.push_back(best_idx); used[static_cast<size_t>(best_idx)] = 1u; } MultiCrossMsg out{}; out.count = static_cast<unsigned>(selected.size()); float* dest = out.intervals; for (unsigned s = 0u; s < out.count; ++s) { IntervalND* Tt = H[static_cast<size_t>(selected[s])]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; } const size_t iterations = comm_levels; bool active = true; const bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_pendingMulti.emplace_back(g_world, partner, out); if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) { g_pendingMulti.front().req.wait(); g_pendingMulti.pop_front(); } active = false; } } ++ii2; } ++exchange_counter_T; } } const bool indexedBestImprovedSinceLastSend = !bestQIndexed.empty() && better_indexed(bestIndexFound, bestIndexValue, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF); if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) { last_send_best = it; progress_outgoing(); const size_t iterations = comm_levels; bool active = true; const bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best); if (am_sender) { BestSolutionMsg outMsg; InitBestSolutionMsg(outMsg); if (FillBestSolutionMsg( outMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed) && better_indexed( outMsg.bestIndex, outMsg.bestF, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) { g_pendingBest.emplace_back(g_world, partner, outMsg); lastSentBestMsg = outMsg; } if (g_pendingBest.size() > MAX_INFLIGHT_BEST) { g_pendingBest.front().req.wait(); g_pendingBest.pop_front(); } active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); for (unsigned ii = 0u; ii < in.count; ++ii) { const float d = &in.intervals[ii * 5u]; const float sx = d[0]; const float ex = d[2]; alignas(16) float ql[AGP_MAX_STATE_DIM]; float tx = 0.0f; float ty = 0.0f; unsigned v1i = 0u; unsigned v2i = 0u; float y1i = 0.0f; float y2i = 0.0f; map.map01ToPoint(sx, ql); cost.evaluate_indexed(ql, tx, ty, v1i, y1i); map.map01ToPoint(ex, ql); cost.evaluate_indexed(ql, tx, ty, v2i, y2i); IntervalND inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i); finalize_interval_geometry(inj); const float prevMmax = Mmax; const float candidateMmax = fmaxf(Mmax, inj->M); const float m_inj = fmaf(r_eff, candidateMmax, 0.0f); inj->ChangeCharacteristic(m_inj); if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) { update_pockets_and_Mmax(inj); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f)) recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f)); } else { delete inj; } } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const float interval_len = H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam); if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter - 1) { for (auto& s : g_pendingMulti) s.req.wait(); for (auto& s : g_pendingBest) s.req.wait(); g_pendingMulti.clear(); g_pendingBest.clear(); while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } g_world->barrier(); BestSolutionMsg bestMsg; InitBestSolutionMsg(bestMsg); if (!bestQIndexed.empty()) { FillBestSolutionMsg( bestMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed); } const size_t iterations = comm_levels; bool active = true; size_t itx = 0u; while (itx < iterations && active) { const size_t step = 1ULL << itx; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = (rank & static_cast<int>(step)) != 0; if (am_sender) { g_world->send(partner, 3, bestMsg); active = false; } else { BestSolutionMsg inBest; InitBestSolutionMsg(inBest); g_world->recv(partner, 3, inBest); if (better_indexed(inBest.bestIndex, inBest.bestF, bestMsg.bestIndex, bestMsg.bestF)) bestMsg = inBest; } } ++itx; } if (rank == 0) { UpdateIndexedBestFromMessage( bestMsg, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const bool haveFeasibleBest = (bestIndexFound == fullConstraintIndex) && !bestQ.empty(); if (haveFeasibleBest && cost.solveMode != ManipSolveMode::Transition) { alignas(16) float q_local[AGP_MAX_STATE_DIM]; std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float)); float x_final = bestX; float y_final = bestY; float f_final = bestF; const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f_final; float saved = q_local[last]; float delta = 0.05f; while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2_loc = 0.0f; float y2_loc = 0.0f; unsigned idx2 = 0u; float val2 = 0.0f; const bool feasible2 = cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2); if (feasible2 && val2 < bestLocF) { bestLocF = val2; x_final = x2_loc; y_final = y2_loc; saved = cand; } q_local[last] = backup; sgn += 2; } delta = 0.5f; } if (bestLocF < f_final) { q_local[last] = saved; bestF = bestLocF; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } } else if (haveFeasibleBest) { bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } else { bestQ = bestQIndexed; bestF = bestIndexValue; bestX = bestIndexedX; bestY = bestIndexedY; } out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } ++it; } }", код оптимизированной версии: "static __declspec(noalias) void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior)noexcept {
struct TrialPoint { alignas(16)float q[64]; float t, f, x, y; unsigned idx; bool feasible; };
using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>;
constexpr float T_EPS = 1e-7f;
int n = cost.n, dim = n + (cost.variableLen ? n : 0);
float dim_f = static_cast<float>(dim);
unsigned fullConstraintIndex = cost.feasible_index();
const bool hasHardConstraints = (fullConstraintIndex != 0u);
unsigned exchange_counter = 0u, exchange_counter_T = 0u;
alignas(16)float M_by_span[12];
for (int i = 0; i < 12; ++i)M_by_span[i] = M_prior;
float Mmax = M_prior;
alignas(16)float phi[64], s_arr[64], c_arr[64], sum_s[64], sum_c[64], q_try[64];
bestQ.reserve(static_cast<size_t>(dim));
AlignedFloatVec bestQIndexed; bestQIndexed.reserve(static_cast<size_t>(dim));
AlignedFloatVec transitionReferenceSeed;
float transitionRefF = FLT_MAX; unsigned transitionRefIdx = 0u; float transitionRefX = 0.0f, transitionRefY = 0.0f; bool haveTransitionRefSeed = false;
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) {
int refDim = cost.full_state_dim();
transitionReferenceSeed.resize(static_cast<size_t>(refDim));
std::memcpy(transitionReferenceSeed.data(), cost.referenceState, static_cast<size_t>(refDim) * sizeof(float));
float xr = 0.0f, yr = 0.0f, fr = 0.0f; unsigned idxr = 0u;
cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr);
transitionRefF = fr; transitionRefIdx = idxr; transitionRefX = xr; transitionRefY = yr; haveTransitionRefSeed = true;
}
unsigned bestIndexFound = 0u; float bestIndexValue = FLT_MAX, bestIndexedX = 0.0f, bestIndexedY = 0.0f;
BestSolutionMsg lastSentBestMsg; InitBestSolutionMsg(lastSentBestMsg);
int no_improve = 0;
float a = 0.0f, b = 1.0f, p = 0.0f, dmax = b - a, initial_len = dmax;
float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f), A_dim_ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f), B_dim = fmaf(A_dim, 0.7f, 0.0f), B_dim__ = fmaf(A_dim__, 4.325f, 0.0f), log_argument = A_dim - 2.03f, log_argument__ = A_dim__ - 2.0f;
float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__;
float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition, adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;
int it = 0, stag_boost_remaining = 0; float stag_r_multiplier = 0.0f;
int last_send_T = 0, send_interval_T = 7, last_send_best = 0, send_interval_best = 2, n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)), noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f));
const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params;
auto clamp01 = [](float v)noexcept->float {return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); };
auto lerp_hj = [](float a0, float a1, float t)noexcept->float {return fmaf(t, a1 - a0, a0); };
auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out)noexcept {
period_out = hj_cfg.iteration_period; phase_out = hj_cfg.iteration_phase;
int count = hj_cfg.iteration_schedule_count; if (count <= 0)return;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
for (int i = 0; i < count; ++i) {
const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count) { period_out = rule.period; phase_out = rule.phase; break; }
}
if (period_out < 1)period_out = 1; phase_out %= period_out; if (phase_out < 0)phase_out += period_out;
};
auto select_hj_coord_profile = [&](float progress)noexcept->AGPHJCoordScaleProfile {
int count = hj_cfg.coord_profile_schedule_count;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
if (count > 0) {
for (int i = 0; i < count; ++i) {
const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count)return rule.profile;
}
}
AGPHJCoordScaleProfile fallback{};
fallback.angle_scale_proximal = hj_cfg.angle_scale_first;
fallback.angle_scale_distal = hj_cfg.angle_scale_last;
fallback.length_scale_proximal = hj_cfg.length_scale_first;
fallback.length_scale_distal = hj_cfg.length_scale_last;
return fallback;
};
auto select_hj_order_mode = [&](float progress)noexcept->int {
int order_mode = hj_cfg.order_mode;
int count = hj_cfg.order_schedule_count;
if (count <= 0)return order_mode;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
for (int i = 0; i < count; ++i) {
const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count) { order_mode = rule.order_mode; break; }
}
return order_mode;
};
auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile)noexcept->float {
if (d < n) {
float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f;
return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01);
}
if (cost.variableLen) {
int j = d - n;
float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f;
return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01);
}
return 1.0f;
};
alignas(16)float hj_delta[64], hj_profile_scale_applied[64];
alignas(64)int hj_order_natural[64], hj_order_reverse[64], hj_order_interleaved_proximal[64], hj_order_interleaved_distal[64], hj_order_lengths_first[64];
{
int pos = 0; while (pos < dim) { hj_order_natural[pos] = pos; ++pos; }
pos = 0; int d = dim - 1; while (d >= 0) { hj_order_reverse[pos] = d; ++pos; --d; }
pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; hj_order_interleaved_proximal[pos] = n + j; ++pos; ++j; } }
else { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; ++j; } }
pos = 0; if (cost.variableLen) { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; hj_order_interleaved_distal[pos] = n + j; ++pos; --j; } }
else { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; --j; } }
pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_lengths_first[pos] = n + j; ++pos; ++j; } }int j = 0; while (j < n) { hj_order_lengths_first[pos] = j; ++pos; ++j; }
}
auto select_hj_order_ptr = [&](int order_mode)noexcept->const int* {
switch (order_mode) {
case AGP_HJ_ORDER_NATURAL:return hj_order_natural;
case AGP_HJ_ORDER_REVERSE:return hj_order_reverse;
case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL:return hj_order_interleaved_proximal;
case AGP_HJ_ORDER_LENGTHS_FIRST:return hj_order_lengths_first;
default:return hj_order_interleaved_distal;
}
};
int hj_profile_cache_key = INT_MIN;
auto apply_hj_coord_profile = [&](float progress)noexcept {
AGPHJCoordScaleProfile profile{};
int key = -2;
int count = hj_cfg.coord_profile_schedule_count;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
if (count > 0) {
int i = 0; while (i < count) {
const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
if (progress < rule.progress_lt || i + 1 == count) { profile = rule.profile; key = i; break; }
++i;
}
}
else {
profile.angle_scale_proximal = hj_cfg.angle_scale_first;
profile.angle_scale_distal = hj_cfg.angle_scale_last;
profile.length_scale_proximal = hj_cfg.length_scale_first;
profile.length_scale_distal = hj_cfg.length_scale_last;
}
if (key == hj_profile_cache_key)return;
int d = 0; while (d < dim) {
float new_scale = fmaxf(coord_profile_scale(d, profile), 1e-12f), old_scale = fmaxf(hj_profile_scale_applied[d], 1e-12f);
if (fabsf(new_scale - old_scale) > 1e-12f) { hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta); hj_profile_scale_applied[d] = new_scale; }
++d;
}
hj_profile_cache_key = key;
};
{
const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f);
for (int d = 0; d < dim; ++d) {
float span = fmaf(map.high[d], 1.0f, -map.low[d]), scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1e-12f);
hj_profile_scale_applied[d] = scale;
hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs);
}
}
auto t_to_idx = [&](float t)->unsigned long long {return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); };
auto finalize_interval_geometry = [&](IntervalND* I)noexcept {I->i1 = t_to_idx(I->x1); I->i2 = t_to_idx(I->x2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); };
auto update_pockets_and_Mmax = [&](IntervalND* I)noexcept {int k = I->span_level; if (I->M > M_by_span[k])M_by_span[k] = I->M; if (M_by_span[k] > Mmax)Mmax = M_by_span[k]; };
auto recompute_dmax = &noexcept {float new_dmax = 0.0f; for (auto* pI : H) { float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > new_dmax)new_dmax = Ls; }dmax = (new_dmax > 0.0f) ? new_dmax : initial_len; };
auto recompute_heap_constM = [&](float m_cur)noexcept {size_t sz = H.size(); if (!hasHardConstraints)RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur); else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur); std::make_heap(H.begin(), H.end(), ComparePtrND); };
auto update_best_from_trial = [&](const TrialPoint& tr)noexcept {
if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) { bestIndexFound = tr.idx; bestIndexValue = tr.f; agp_store_q(bestQIndexed, tr.q, dim); bestIndexedX = tr.x; bestIndexedY = tr.y; }
if (tr.feasible) { if (tr.f < bestF) { bestF = tr.f; agp_store_q(bestQ, tr.q, dim); bestX = tr.x; bestY = tr.y; no_improve = 0; } else { ++no_improve; } }};
auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io, float t_lo, float t_hi, bool enforce_t_bounds)noexcept {
if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f)))return;
float f_start = f_io, c1 = 1e-4f, tau = 0.5f;
int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p)), max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p));
float lbfgs_trigger = 0.6f; int m_lbfgs = 9, max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p));
float eps_lbfgs_curv = 1e-6f, eps_descent = 1e-12f, eta = 2.0f / sqrtf(dim_f);
auto computeGrad = [&](const float* q_in, float x_in, float y_in, float* grad_out, float& grad_norm2_out)noexcept {
float phi_acc_local = 0.0f; int ii = 0; while (ii < n) { phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local); phi[ii] = phi_acc_local; ++ii; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
float acc_s = 0.0f, acc_c = 0.0f; int kk = n - 1; while (kk >= 0) { float Lk = cost.link_length(q_in, kk); acc_s = fmaf(Lk, s_arr[kk], acc_s); acc_c = fmaf(Lk, c_arr[kk], acc_c); sum_s[kk] = acc_s; sum_c[kk] = acc_c; --kk; }
alignas(16)float prefix_tail_grad[64] = { 0.0f };
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) {
alignas(16)float prefix_delta[64] = { 0.0f }; float phi_prefix = 0.0f; int jj = 0;
while (jj < cost.referenceAngleCount) { phi_prefix += q_in[jj]; prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]); ++jj; }
float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta[jj]; prefix_tail_grad[jj] = tail; --jj; }
}
float dx = fmaf(x_in, 1.0f, -cost.targetX), dy = fmaf(y_in, 1.0f, -cost.targetY), dist2 = fmaf(dx, dx, dy * dy), dist = sqrtf(dist2), inv_dist = (dist > 1e-20f) ? (1.0f / dist) : 0.0f;
grad_norm2_out = 0.0f; int i = 0; while (i < n) {
float gpen = 0.0f, g_main = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float dtheta = (i < cost.referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f;
gpen = 2.0f * cost.transitionEnergyWeight * dtheta;
gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen);
g_main = 2.0f * cost.transitionCaptureWeight * fmaf(dx, -sum_s[i], dy * sum_c[i]);
}
else {
float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f), exp_arg = -tsg, exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), sig = 1.0f / (exp_val + 1.0f);
gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen);
g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist;
}
float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i;
}
if (cost.variableLen) {
int j = 0; while (j < n) {
float gi = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float gpenL = 0.0f; if (cost.has_transition_reference_lengths() && j < cost.referenceLengthCount)gpenL = 2.0f * cost.transitionLengthEnergyWeight * (q_in[n + j] - cost.referenceState[n + j]);
gi = 2.0f * cost.transitionCaptureWeight * fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL;
}
else { gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist; }
grad_out[n + j] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++j;
}
}
};
auto armijoLineSearch = [&](const float* q_base, float f_base, const float* dir, float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, unsigned& idx_out)noexcept->bool {
float alpha = alpha_io; int backtrack = 0;
while (backtrack < max_backtrack) {
int i = 0; while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; }
float x2 = 0.0f, y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try);
float t_try = clamp01(map.pointToT(q_out));
if (enforce_t_bounds) {
if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha = tau; ++backtrack; continue; } }
else { if (fabsf(t_try - t_lo) > T_EPS) { alpha = tau; ++backtrack; continue; } }
}
if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; }
alpha = tau; ++backtrack;
}
return false;
};
bool lbfgs_already_tried = false; int outer = 0;
while (outer < max_outer_iters) {
float grad[64], grad_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, grad, grad_norm2);
if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f)break;
float dir_gd[64]; int i = 0; while (i < dim) { dir_gd[i] = -grad[i]; ++i; }
float gtd_gd = -grad_norm2, eta_trial = eta, f_new = f_io, x_new = x_io, y_new = y_io; unsigned idx_new = idx_io;
bool found = armijoLineSearch(q_inout, f_io, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, idx_new);
if (!found)break;
std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
f_io = f_new; x_io = x_new; y_io = y_new; idx_io = idx_new; eta = eta_trial;
float rel_impr = (f_start > 1e-20f) ? ((f_start - f_io) / f_start) : 0.0f;
if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) {
lbfgs_already_tried = true;
float q_resume[64]; std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float));
float f_resume = f_io, x_resume = x_io, y_resume = y_io; unsigned idx_resume = idx_io; float eta_resume = eta;
float q_best_lbfgs[64]; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float));
float f_best_lbfgs = f_io, x_best_lbfgs = x_io, y_best_lbfgs = y_io; unsigned idx_best_lbfgs = idx_io;
float s_hist[9][64], y_hist[9][64], rho_hist[9], alpha_hist[9]; int hist_size = 0;
float gk[64], gk_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, gk, gk_norm2);
bool lbfgs_ok = true; float alpha_k = 2.0f / sqrtf(dim_f); int it_lbfgs = 0;
while (it_lbfgs < max_lbfgs_iters) {
if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f)break;
float dir[64];
if (hist_size == 0) { int d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; } }
else {
float q_vec[64]; int d = 0; while (d < dim) { q_vec[d] = gk[d]; ++d; }
for (int jj = hist_size - 1; jj >= 0; --jj) {
float dot_sq = 0.0f; int d = 0; while (d < dim) { dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq); ++d; }
float a_coeff = dot_sq * rho_hist[jj]; alpha_hist[jj] = a_coeff;
d = 0; while (d < dim) { q_vec[d] = fmaf(-a_coeff, y_hist[jj][d], q_vec[d]); ++d; }
}
float gamma = 1.0f; { int last = hist_size - 1; float yy = 0.0f; int d = 0; while (d < dim) { yy = fmaf(y_hist[last][d], y_hist[last][d], yy); ++d; }float ys = 1.0f / rho_hist[last]; if (yy > 0.0f)gamma = ys / yy; }
float r_vec[64]; d = 0; while (d < dim) { r_vec[d] = gamma * q_vec[d]; ++d; }
for (int jj = 0; jj < hist_size; ++jj) {
float dot_yr = 0.0f; int d = 0; while (d < dim) { dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr); ++d; }
float b_coeff = dot_yr * rho_hist[jj], coeff = alpha_hist[jj] - b_coeff;
d = 0; while (d < dim) { r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]); ++d; }
}
d = 0; while (d < dim) { dir[d] = -r_vec[d]; ++d; }
}
float gtd = 0.0f; int d = 0; while (d < dim) { gtd = fmaf(gk[d], dir[d], gtd); ++d; }
if (!(gtd == gtd) || gtd >= -eps_descent) { d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; }gtd = -gk_norm2; if (gtd >= -eps_descent) { lbfgs_ok = false; break; } }
float q_old[64]; std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float));
float g_old[64]; std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float));
float alpha_try = alpha_k, f_try = f_io, x_try = x_io, y_try = y_io; unsigned idx_try = idx_io;
bool step_ok = armijoLineSearch(q_inout, f_io, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, idx_try);
if (!step_ok) { lbfgs_ok = false; break; }
std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
f_io = f_try; x_io = x_try; y_io = y_try; idx_io = idx_try; alpha_k = alpha_try;
computeGrad(q_inout, x_io, y_io, gk, gk_norm2);
if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; }
if (f_io < f_best_lbfgs) { f_best_lbfgs = f_io; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x_io; y_best_lbfgs = y_io; idx_best_lbfgs = idx_io; }
float ys = 0.0f, s_new[64], y_new[64]; d = 0; while (d < dim) { float sd = q_inout[d] - q_old[d], yd = gk[d] - g_old[d]; s_new[d] = sd; y_new[d] = yd; ys = fmaf(yd, sd, ys); ++d; }
if (!(ys == ys) || ys <= eps_lbfgs_curv) { hist_size = 0; }
else {
if (hist_size < m_lbfgs) { int idx_hist = hist_size; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; }++hist_size; }
else { for (int jj = 0; jj < m_lbfgs - 1; ++jj) { rho_hist[jj] = rho_hist[jj + 1]; d = 0; while (d < dim) { s_hist[jj][d] = s_hist[jj + 1][d]; y_hist[jj][d] = y_hist[jj + 1][d]; ++d; } }int idx_hist = m_lbfgs - 1; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; }hist_size = m_lbfgs; }
}
if (gk_norm2 < 1e-12f)break; ++it_lbfgs;
}
if (lbfgs_ok) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; break; }
else { if (f_best_lbfgs < f_resume) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; } else { std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float)); f_io = f_resume; x_io = x_resume; y_io = y_resume; idx_io = idx_resume; }eta = eta_resume; }
}
++outer;
}
};
auto evaluate_trial_from_q = [&](const float q_seed, float t_lo, float t_hi, bool enforce_t_bounds)noexcept->TrialPoint {
TrialPoint tr{}; std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float));
tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f);
if (tr.feasible)refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds);
tr.t = clamp01(map.pointToT(tr.q));
if (enforce_t_bounds) { if (tr.t < t_lo)tr.t = t_lo; if (tr.t > t_hi)tr.t = t_hi; }
update_best_from_trial(tr);
return tr;
};
auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds)noexcept->TrialPoint {
alignas(16)float q_seed[64]; map.map01ToPoint(t_base, q_seed);
return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds);
};
auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp)noexcept->IntervalND {
if (!(Lp.t + T_EPS < Rp.t))return nullptr;
IntervalND I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx);
finalize_interval_geometry(I);
return I;
};
auto find_container_index = [&](float t)noexcept->int {
int best_pos = -1; float best_len = FLT_MAX;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
if (best_pos >= 0)return best_pos;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
return best_pos;
};
auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff)noexcept {
int pos = find_container_index(tr.t); if (pos < 0)return;
IntervalND* src = H[static_cast<size_t>(pos)];
if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS))return;
float src_len = fmaf(src->x2, 1.0f, -src->x1);
IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float prevMmax = Mmax;
H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src;
if (!H.empty())std::make_heap(H.begin(), H.end(), ComparePtrND);
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur);
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(m_cur);
if (src_len + T_EPS >= dmax)recompute_dmax();
};
auto find_container_index_bounds = [&](float t, float& lo, float& hi)noexcept->int {
int best_pos = -1; float best_len = FLT_MAX;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
if (best_pos < 0) { for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } }
if (best_pos >= 0) { const IntervalND* I = H[static_cast<size_t>(best_pos)]; lo = I->x1; hi = I->x2; }
else { lo = 0.0f; hi = 1.0f; }
return best_pos;
};
auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, int pos, float r_eff_cur, float adaptive_coeff)noexcept {
if (pos < 0 || static_cast<size_t>(pos) >= H.size()) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff); return; }
IntervalND* src = H[static_cast<size_t>(pos)];
if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff); return; }
float src_len = fmaf(src->x2, 1.0f, -src->x1);
IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float prevMmax = Mmax;
H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src;
if (!H.empty())std::make_heap(H.begin(), H.end(), ComparePtrND);
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur);
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(m_cur);
if (src_len + T_EPS >= dmax)recompute_dmax();
};
auto progress_outgoing = &noexcept {
for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) { if (itReq->req.test())itReq = g_pendingMulti.erase(itReq); else ++itReq; }
for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) { if (itReq->req.test())itReq = g_pendingBest.erase(itReq); else ++itReq; }
};
auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y)noexcept->bool {
auto take_indexed = &noexcept->bool {if (bestQIndexed.empty())return false; seed_vec = &bestQIndexed; seed_f = bestIndexValue; seed_idx = bestIndexFound; seed_x = bestIndexedX; seed_y = bestIndexedY; return true; };
auto take_feasible = &noexcept->bool {if (bestQ.empty())return false; seed_vec = &bestQ; seed_f = bestF; seed_idx = fullConstraintIndex; seed_x = bestX; seed_y = bestY; return true; };
auto take_reference = &noexcept->bool {if (!haveTransitionRefSeed)return false; seed_vec = &transitionReferenceSeed; seed_f = transitionRefF; seed_idx = transitionRefIdx; seed_x = transitionRefX; seed_y = transitionRefY; return true; };
seed_vec = nullptr; seed_f = FLT_MAX; seed_idx = 0u; seed_x = seed_y = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition)return take_feasible() || take_reference() || take_indexed();
return take_indexed() || take_feasible();
};
auto should_run_hj_this_iter = [&](float progress)noexcept->bool {
if (!hj_cfg.enabled)return false;
if (hj_cfg.require_adaptive_flag && !adaptive)return false;
int period = 1, phase = 0; select_hj_iteration_schedule(progress, period, phase);
if (period <= 1)return true;
return(it % period) == phase;
};
auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff, float progress)noexcept {
if (H.empty())return;
const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX, seedX = 0.0f, seedY = 0.0f; unsigned seedIdx = 0u;
if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY))return;
TrialPoint base{}; std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float));
base.t = clamp01(map.pointToT(base.q)); base.f = seedF; base.idx = seedIdx; base.x = seedX; base.y = seedY; base.feasible = (base.idx == fullConstraintIndex);
auto evaluate_hj_candidate = [&](const float* q_candidate)noexcept->TrialPoint {
float t_guess = clamp01(map.pointToT(q_candidate)); float lo = 0.0f, hi = 1.0f;
int pos = find_container_index_bounds(t_guess, lo, hi);
TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true);
inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff);
return tr;
};
apply_hj_coord_profile(progress);
int hj_order_mode = select_hj_order_mode(progress);
const int* hj_order = select_hj_order_ptr(select_hj_order_mode(progress));
TrialPoint current = base, base_before = base; bool improved_any = false;
int ord = 0; while (ord < dim) {
int d = hj_order[ord]; float step = hj_delta[d];
TrialPoint axis_base = current;
auto try_direction = [&](int sgn, TrialPoint& accepted_tp)noexcept->bool {
float q_candidate[64]; std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float));
q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]);
if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1e-12f)return false;
TrialPoint tp = evaluate_hj_candidate(q_candidate);
if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) { accepted_tp = tp; return true; }
return false;
};
int first_sign = hj_cfg.plus_first ? +1 : -1, second_sign = -first_sign;
bool improved_this_axis = false; TrialPoint best_axis_tp{}; bool have_axis_tp = false; TrialPoint tp_try{};
if (try_direction(first_sign, tp_try)) { best_axis_tp = tp_try; have_axis_tp = true; if (hj_cfg.opportunistic_axis) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } }
if (!hj_cfg.opportunistic_axis || !have_axis_tp) {
if (try_direction(second_sign, tp_try)) { if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) { best_axis_tp = tp_try; have_axis_tp = true; } }
if (have_axis_tp) { current = best_axis_tp; improved_any = true; improved_this_axis = true; }
}
if (improved_this_axis && hj_cfg.expand_delta_on_success) { hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor); }
++ord;
}
if (improved_any) {
if (hj_cfg.use_pattern) {
float q_pattern[64]; for (int d = 0; d < dim; ++d)q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]);
TrialPoint pat_tp = evaluate_hj_candidate(q_pattern);
if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f))current = pat_tp;
}
}
else { for (int d = 0; d < dim; ++d)hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta); }
};
auto ensure_queue_nonempty = [&](float r_eff_cur)noexcept->bool {
if (!H.empty())return true;
const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX, seedX = 0.0f, seedY = 0.0f; unsigned seedIdx = 0u;
if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY))return false;
float t_center = clamp01(map.pointToT(seedQ->data())), half = fmaxf(1e-4f * dim_f, 1e-6f), t1b = clamp01(t_center - half), t2b = clamp01(t_center + half), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)return false;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f));
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
recompute_dmax(); return true;
};
while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg dummy; g_world->recv(boost::mpi::any_source, 2, dummy); }
int seedStride = 64;
AlignedFloatVec seeds(static_cast<size_t>(1024) * static_cast<size_t>(seedStride), 0.0f);
int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds.data(), seedStride, static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()), 7919.0f, static_cast<float>(seed))));
int K_raw = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)), K = (K_raw > 2) ? K_raw : 2;
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u);
int rank = g_world->rank(), world = g_world->size();
size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1));
for (int i = 0; i < seedCnt; ++i) {
const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride);
float t_seed = clamp01(map.pointToT(s));
float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed)), t2b = clamp01(fmaf(interval_size, 1.0f, t_seed)), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) { I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); }
else { float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f), end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f), mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); }
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
std::vector<float> grid_base; grid_base.reserve(static_cast<size_t>(K) + 2u); grid_base.emplace_back(a);
for (int k = 1; k <= K; ++k) { float t = clamp01(fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)))); grid_base.emplace_back(t); }
grid_base.emplace_back(b);
std::vector<TrialPoint> grid_trials(grid_base.size());
for (size_t i = 0u; i < grid_base.size(); ++i) {
float lo = grid_base[i], hi = grid_base[i];
if (i == 0u) { lo = hi = a; }
else if (i + 1u == grid_base.size()) { lo = hi = b; }
else { lo = 0.5f * (grid_base[i - 1u] + grid_base[i]); hi = 0.5f * (grid_base[i] + grid_base[i + 1u]); }
grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true);
}
for (size_t i = 1u; i < grid_trials.size(); ++i) {
IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
recompute_dmax();
if (haveTransitionRefSeed) {
TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false);
if (!H.empty()) { inject_trial_into_queue(ref_tp, r, 1.0f); recompute_dmax(); }
else { ensure_queue_nonempty(r); }
}
g_world->barrier();
while (true) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
stag_r_multiplier = 1.4f - 1.1f * fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f);
float p_arg = fmaf(p, 2.3f, -2.9775f);
float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; --stag_boost_remaining; }
float exp_arg = fmaf(B_dim, p, 0.0f), exp_arg__ = fmaf(B_dim__, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)), second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)), third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)), fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)), rr = sqrtf(fmaf(-p, 1.0f, 1.0f)), xx = p * p, tt = fmaf(500.0f, p, -486.95472f);
float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f);
adaptive_coeff__ = fmaf(fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__);
float grad_norm2_best = 0.0f;
if (no_improve > 0 && !bestQ.empty()) {
float acc_best = 0.0f, phi_best[64]; int ii_best = 0;
while (ii_best < n) { acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best); phi_best[ii_best] = acc_best; ++ii_best; }
float s_best[64], c_best[64]; FABE13_SINCOS(phi_best, s_best, c_best, n);
float as_best = 0.0f, ac_best = 0.0f, sum_s_best[64], sum_c_best[64]; int k_best = n - 1;
while (k_best >= 0) { float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; }
alignas(16)float prefix_tail_best[64] = { 0.0f };
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) {
alignas(16)float prefix_delta_best[64] = { 0.0f }; float phi_pref = 0.0f; int jj = 0;
while (jj < cost.referenceAngleCount) { phi_pref += bestQ[jj]; prefix_delta_best[jj] = ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]); ++jj; }
float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta_best[jj]; prefix_tail_best[jj] = tail; --jj; }
}
float dx_best = fmaf(bestX, 1.0f, -cost.targetX), dy_best = fmaf(bestY, 1.0f, -cost.targetY), dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)), inv_dist_best = (dist_best > 1e-20f) ? (1.0f / dist_best) : 0.0f;
int i_best = 0; while (i_best < n) {
float gpen_best = 0.0f, g_main_best = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float dtheta_best = (i_best < cost.referenceAngleCount) ? cost.wrapped_angle_delta(bestQ.data(), i_best) : 0.0f;
gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best;
gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_best[i_best], gpen_best);
g_main_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]);
}
else {
float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f), exp_arg_g = -tsg, exp_val = fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), sig = 1.0f / (exp_val + 1.0f);
gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best);
g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best;
}
float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best;
}
if (cost.variableLen) {
int j_best = 0; while (j_best < n) {
float gi_best = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float gpenL_best = 0.0f; if (cost.has_transition_reference_lengths() && j_best < cost.referenceLengthCount)gpenL_best = 2.0f * cost.transitionLengthEnergyWeight * (bestQ[n + j_best] - cost.referenceState[n + j_best]);
gi_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) + gpenL_best;
}
else { gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best; }
grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++j_best;
}
}
}
bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f);
float r_eff = (dim > 2) ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f);
if (stagnation) {
stag_boost_remaining = n_stag_iters;
float t_seeds[32]; int seed_count = 0;
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) {
alignas(16)float q_center[64]; std::memcpy(q_center, cost.referenceState, static_cast<size_t>(dim) * sizeof(float));
t_seeds[seed_count++] = clamp01(map.pointToT(q_center));
if (!bestQ.empty() && seed_count < 32)t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data()));
unsigned st_local = seed + static_cast<unsigned>(it) + 911u; int local_variants = 2 + static_cast<int>(sqrtf(dim_f));
for (int v = 0; v < local_variants && seed_count < 32; ++v) {
float q_temp[64]; std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float));
st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; float rnd0 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;
int aidx = v % n; q_temp[aidx] = fmaf((2.0f * rnd0 - 1.0f), 0.25f * cost.transitionMaxAngleStep, q_temp[aidx]);
if (cost.variableLen && cost.has_transition_reference_lengths()) {
st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; float rnd1 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;
int lidx = v % n; q_temp[n + lidx] = fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]);
}
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
else {
int num_ik = 1 + static_cast<int>(sqrtf(dim_f));
float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY), max_reach = 0.0f;
if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)max_reach += map.high[n + ii]; }
else { max_reach = static_cast<float>(n) * cost.fixedLength; }
float ratio = dist_to_target / fmaxf(max_reach, 1e-20f); bool prefer_extended = (ratio > 0.7f), prefer_compact = (ratio < 0.4f), use_ik = !(ratio > 0.4f && ratio < 0.7f);
if (!use_ik) {
AlignedFloatVec temp_S(static_cast<size_t>(1024) * static_cast<size_t>(64), 0.0f);
int sobol_gen = generate_sobol_seeds_limited(map, dim, temp_S.data(), 64, seed + static_cast<unsigned>(it), 1024);
int num_sobol = num_ik; for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) { const float* s = temp_S.data() + static_cast<size_t>(kk) * 64; t_seeds[seed_count++] = clamp01(map.pointToT(s)); }
}
else {
float angles_ccd[32] = { 0.0f }, lengths_ccd[32];
if (cost.variableLen) { float len_low = map.low[n], len_high = map.high[n], avg_len = 0.5f * (len_low + len_high); for (int ii = 0; ii < n; ++ii)lengths_ccd[ii] = avg_len; }
else { for (int ii = 0; ii < n; ++ii)lengths_ccd[ii] = cost.fixedLength; }
ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10);
float angles_fabrik[32] = { 0.0f }, lengths_fabrik[32];
if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)lengths_fabrik[ii] = lengths_ccd[ii]; }
else { for (int ii = 0; ii < n; ++ii)lengths_fabrik[ii] = cost.fixedLength; }
float targetX_fab = cost.targetX, targetY_fab = cost.targetY;
for (int iter_fab = 0; iter_fab < 3; ++iter_fab) {
float prevX = targetX_fab, prevY = targetY_fab;
for (int j = n - 1; j >= 0; --j) { float len = lengths_fabrik[j], angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val, c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; }
}
if (prefer_extended) {
{ float q_ccd[64]; for (int ii = 0; ii < n; ++ii)q_ccd[ii] = angles_ccd[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_ccd[n + ii] = lengths_ccd[ii]; }t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd)); }
unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; int remaining = num_ik - 1;
for (int v = 0; v < remaining && seed_count < 32; ++v) {
float noisy_angles[32], noisy_lengths[32];
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f;
float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f, hi = 2.6179938779914944f;
if (noisy_angles[ii] < lo)noisy_angles[ii] = lo; if (noisy_angles[ii] > hi)noisy_angles[ii] = hi;
}
if (cost.variableLen) {
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f;
if (noisy_lengths[ii] < 0.5f)noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f)noisy_lengths[ii] = 2.0f;
}
}
float q_temp[64]; for (int ii = 0; ii < n; ++ii)q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_temp[n + ii] = noisy_lengths[ii]; }
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
else if (prefer_compact) {
{ float q_fabrik[64]; for (int ii = 0; ii < n; ++ii)q_fabrik[ii] = angles_fabrik[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_fabrik[n + ii] = lengths_fabrik[ii]; }t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik)); }
unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; int remaining = num_ik - 1;
for (int v = 0; v < remaining && seed_count < 32; ++v) {
float noisy_angles[32], noisy_lengths[32];
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f;
float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f, hi = 2.6179938779914944f;
if (noisy_angles[ii] < lo)noisy_angles[ii] = lo; if (noisy_angles[ii] > hi)noisy_angles[ii] = hi;
}
if (cost.variableLen) {
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f;
if (noisy_lengths[ii] < 0.5f)noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f)noisy_lengths[ii] = 2.0f;
}
}
float q_temp[64]; for (int ii = 0; ii < n; ++ii)q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_temp[n + ii] = noisy_lengths[ii]; }
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
}
}
for (int s = 0; s < seed_count; ++s) {
float t_center = t_seeds[s], interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f), t1b = clamp01(t_center - 0.5f * interval_size), t2b = clamp01(t_center + 0.5f * interval_size), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f);
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
recompute_dmax(); no_improve = 0;
}
float bestFOld = bestF;
bool local_idle = !ensure_queue_nonempty(r_eff);
bool do_hj_this_iter = should_run_hj_this_iter(p);
if (!local_idle) {
if (do_hj_this_iter) {
run_hooke_jeeves_iteration(r_eff, adaptive_coeff, p);
}
else {
std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back();
float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f), tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2);
TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true);
IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx);
IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float Mloc = fmaxf(L->M, Rv->M), prevMmax = Mmax;
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
m = fmaf(r_eff, Mmax, 0.0f);
float len1 = fmaf(trNew.t, 1.0f, -x1), len2 = fmaf(x2, 1.0f, -trNew.t);
if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) { dmax = fmaxf(len1, len2); for (auto* pI : H) { float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax)dmax = Ls; } }
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
float alpha = p * p, beta = fmaf(-alpha, 1.0f, 2.0f), MULT = (1.0f / fmaxf(dmax, 1e-20f)) * Mmax, global_coeff = fmaf(MULT, r_eff, -MULT), GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
size_t sz = H.size(); if (sz > 0u) { if (!hasHardConstraints)RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); }
}
else {
L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f))recompute_heap_constM(m);
}
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
delete cur;
}
}
if (!H.empty()) { _mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0); if (H.size() > 1u)_mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0); }
if (!H.empty() && (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff, 0.0f)) && it - last_send_T >= send_interval_T) {
last_send_T = it; progress_outgoing();
size_t scan_count = H.size(); unsigned intervals_to_send = (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u;
if (intervals_to_send > scan_count)intervals_to_send = static_cast<unsigned>(scan_count);
if (intervals_to_send > 0u) {
float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f)), beta = 1.0f - alpha;
float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f), w_size = 5.085f - w_pos;
int num_bins = (p < 1.0f - 1e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1;
std::vector<float> pos_metric(scan_count), size_metric(scan_count), R_val(scan_count);
float R_max = -FLT_MAX, R_min = FLT_MAX, size_max = -FLT_MAX, size_min = FLT_MAX, pos_max = -FLT_MAX, pos_min = FLT_MAX;
for (size_t idx = 0u; idx < scan_count; ++idx) {
IntervalND* I = H[idx]; float center = 0.5f * (I->x1 + I->x2), len = fmaf(I->x2, 1.0f, -I->x1), size = len * static_cast<float>(1 << I->span_level);
if (p < 0.95f && num_bins>1) { float bin = floorf(center * static_cast<float>(num_bins)); if (bin >= static_cast<float>(num_bins))bin = static_cast<float>(num_bins - 1); pos_metric[idx] = bin / static_cast<float>(num_bins - 1); }
else { pos_metric[idx] = center; }
size_metric[idx] = size; R_val[idx] = I->R;
if (pos_metric[idx] > pos_max)pos_max = pos_metric[idx]; if (pos_metric[idx] < pos_min)pos_min = pos_metric[idx];
if (size > size_max)size_max = size; if (size < size_min)size_min = size;
if (I->R > R_max)R_max = I->R; if (I->R < R_min)R_min = I->R;
}
auto norm = [&](float val, float minv, float maxv)noexcept->float {float denom = maxv - minv; if (fabsf(denom) <= 1e-20f)return 0.0f; return(val - minv) / denom; };
std::vector<float> R_norm(scan_count), pos_norm(scan_count), size_norm(scan_count);
for (size_t idx = 0u; idx < scan_count; ++idx) { R_norm[idx] = norm(R_val[idx], R_min, R_max); pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max); size_norm[idx] = norm(size_metric[idx], size_min, size_max); }
std::vector<int> selected; selected.reserve(intervals_to_send);
std::vector<unsigned char> used(scan_count, 0u); selected.push_back(0); used[0] = true;
while (selected.size() < intervals_to_send) {
int best_idx = -1; float best_score = -FLT_MAX;
for (size_t idx = 0u; idx < scan_count; ++idx) {
if (used[idx] != 0u)continue;
float min_dist = FLT_MAX; for (int s : selected) { float d2 = 0.0f, d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)]; d2 += w_pos * d * d; d = size_norm[idx] - size_norm[static_cast<size_t>(s)]; d2 += w_size * d * d; if (d2 < min_dist)min_dist = d2; }
float novelty = sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1e-20f)), score = alpha * R_norm[idx] + beta * novelty;
if (score > best_score) { best_score = score; best_idx = static_cast<int>(idx); }
}
if (best_idx < 0)break; selected.push_back(best_idx); used[static_cast<size_t>(best_idx)] = 1u;
}
MultiCrossMsg out{}; out.count = static_cast<unsigned>(selected.size()); float* dest = out.intervals;
for (unsigned s = 0u; s < out.count; ++s) { IntervalND* Tt = H[static_cast<size_t>(selected[s])]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; }
size_t iterations = comm_levels; bool active = true; bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0; size_t ii2 = 0u;
while (ii2 < iterations && active) { size_t step = 1ULL << ii2; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_pendingMulti.emplace_back(g_world, partner, out); if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) { g_pendingMulti.front().req.wait(); g_pendingMulti.pop_front(); }active = false; } }++ii2; }
++exchange_counter_T;
}
}
bool indexedBestImprovedSinceLastSend = !bestQIndexed.empty() && better_indexed(bestIndexFound, bestIndexValue, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF);
if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) {
last_send_best = it; progress_outgoing();
size_t iterations = comm_levels; bool active = true; bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0; size_t ii2 = 0u;
while (ii2 < iterations && active) { size_t step = 1ULL << ii2; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best); if (am_sender) { BestSolutionMsg outMsg; InitBestSolutionMsg(outMsg); if (FillBestSolutionMsg(outMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed) && better_indexed(outMsg.bestIndex, outMsg.bestF, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) { g_pendingBest.emplace_back(g_world, partner, outMsg); lastSentBestMsg = outMsg; }if (g_pendingBest.size() > MAX_INFLIGHT_BEST) { g_pendingBest.front().req.wait(); g_pendingBest.pop_front(); }active = false; } }++ii2; }
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in);
for (unsigned ii = 0u; ii < in.count; ++ii) {
const float d = &in.intervals[ii * 5u]; float sx = d[0], ex = d[2];
alignas(16)float ql[64]; float tx = 0.0f, ty = 0.0f; unsigned v1i = 0u, v2i = 0u; float y1i = 0.0f, y2i = 0.0f;
map.map01ToPoint(sx, ql); cost.evaluate_indexed(ql, tx, ty, v1i, y1i);
map.map01ToPoint(ex, ql); cost.evaluate_indexed(ql, tx, ty, v2i, y2i);
IntervalND inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i);
finalize_interval_geometry(inj);
float prevMmax = Mmax, candidateMmax = fmaxf(Mmax, inj->M), m_inj = fmaf(r_eff, candidateMmax, 0.0f);
inj->ChangeCharacteristic(m_inj);
if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) {
update_pockets_and_Mmax(inj); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f));
}
else { delete inj; }
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming);
UpdateIndexedBestFromMessage(incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY);
}
float interval_len = H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam);
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter - 1) {
for (auto& s : g_pendingMulti)s.req.wait(); for (auto& s : g_pendingBest)s.req.wait();
g_pendingMulti.clear(); g_pendingBest.clear();
while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage(incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); }
g_world->barrier();
BestSolutionMsg bestMsg; InitBestSolutionMsg(bestMsg);
if (!bestQIndexed.empty()) { FillBestSolutionMsg(bestMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed); }
size_t iterations = comm_levels; bool active = true; size_t itx = 0u;
while (itx < iterations && active) { size_t step = 1ULL << itx; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = (rank & static_cast<int>(step)) != 0; if (am_sender) { g_world->send(partner, 3, bestMsg); active = false; } else { BestSolutionMsg inBest; InitBestSolutionMsg(inBest); g_world->recv(partner, 3, inBest); if (better_indexed(inBest.bestIndex, inBest.bestF, bestMsg.bestIndex, bestMsg.bestF))bestMsg = inBest; } }++itx; }
if (rank == 0) { UpdateIndexedBestFromMessage(bestMsg, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); }
bool haveFeasibleBest = (bestIndexFound == fullConstraintIndex) && !bestQ.empty();
if (haveFeasibleBest && cost.solveMode != ManipSolveMode::Transition) {
alignas(16)float q_local[64]; std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));
float x_final = bestX, y_final = bestY, f_final = bestF; int last = n - 1;
float lo = (last == 0) ? -1.0471975511965977f : -2.6179938779914944f, hi = 2.6179938779914944f;
float bestLocF = f_final, saved = q_local[last], delta = 0.05f;
while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo)cand = lo; else if (cand > hi)cand = hi; float backup = q_local[last]; q_local[last] = cand; float x2_loc = 0.0f, y2_loc = 0.0f; unsigned idx2 = 0u; float val2 = 0.0f; cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2); if (val2 < bestLocF) { bestLocF = val2; x_final = x2_loc; y_final = y2_loc; saved = cand; }q_local[last] = backup; sgn += 2; }delta *= 0.5f; }
if (bestLocF < f_final) { q_local[last] = saved; bestF = bestLocF; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; }
}
else if (haveFeasibleBest) { bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; }
else { bestQ = bestQIndexed; bestF = bestIndexValue; bestX = bestIndexedX; bestY = bestIndexedY; }
out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len;
return;
}
++it;
}
}"

Answer 1

Ниже — цельный метод. Я сохранил быстрый fast-path для обычного режима, а для Transition вернул только те вещи, без которых он ломается: проекцию кандидатов в допустимую область, корректную обработку ref_ok для transition-reference seed и отбрасывание невалидных шагов в line search. При этом extra-логика вынесена в отдельный projected-path, чтобы не замедлять обычный режим лишними ветками внутри горячих циклов.

cpp
static __declspec(noalias) void agp_run_branch_mpi(
    const MortonND& map,
    const ManipCost& cost,
    int maxIter,
    float r,
    bool adaptive,
    float eps,
    unsigned seed,
    std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
    std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
    float& bestF,
    float& bestX,
    float& bestY,
    size_t& out_iterations,
    float& out_achieved_epsilon,
    float M_prior) noexcept
{
    constexpr int FAST_STATE_DIM = AGP_MAX_STATE_DIM;
    constexpr int FAST_SEED_STRIDE = AGP_MAX_SEED_STRIDE;
    constexpr int FAST_MAX_SEEDS = AGP_MAX_GENERATED_SEEDS;

    struct TrialPoint {
        alignas(16) float q[FAST_STATE_DIM];
        float t = 0.0f;
        float f = FLT_MAX;
        float x = 0.0f;
        float y = 0.0f;
        unsigned idx = 0u;
        bool feasible = false;
    };

    using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>;
    constexpr float T_EPS = 1.0e-7f;

    const int n = cost.n;
    const int dim = n + (cost.variableLen ? n : 0);
    const float dim_f = static_cast<float>(dim);
    const unsigned fullConstraintIndex = cost.feasible_index();
    const bool hasHardConstraints = (fullConstraintIndex != 0u);

    if (n > 32 || dim > FAST_STATE_DIM) {
        bestQ.clear();
        bestF = FLT_MAX;
        bestX = 0.0f;
        bestY = 0.0f;
        out_iterations = 0u;
        out_achieved_epsilon = 0.0f;
        return;
    }

    const bool transitionMode = (cost.solveMode == ManipSolveMode::Transition);
    const bool hasTransitionRefAngles = transitionMode && cost.has_transition_reference_angles();
    const bool hasTransitionRefLengths = transitionMode && cost.has_transition_reference_lengths();
    const bool useJointAngleLimitConstraints = cost.use_joint_angle_limit_constraints();
    const bool needsProjectedCandidates = transitionMode || useJointAngleLimitConstraints;
    const bool preserveTransitionJointMagnitudeFloor =
        transitionMode && cost.transitionPreserveJointMagnitudeFloor && hasTransitionRefAngles;

    const float* const ref_state = cost.referenceState;
    const int referenceAngleCount = hasTransitionRefAngles ? cost.referenceAngleCount : 0;
    const int referenceLengthCount = hasTransitionRefLengths ? cost.referenceLengthCount : 0;
    const float transitionMaxAngleStep = transitionMode ? cost.transitionMaxAngleStep : 0.0f;
    const float transitionJointSignDeadband = transitionMode ? cost.transitionJointSignDeadband : 0.0f;
    const float jointAngleHard = useJointAngleLimitConstraints ? fabsf(cost.minTheta) : 0.0f;
    const float baseLengthLo = cost.variableLen ? (cost.fixedLength / cost.stretchFactor) : 0.0f;
    const float baseLengthHi = cost.variableLen ? (cost.fixedLength * cost.stretchFactor) : 0.0f;

    alignas(16) float angle_lo_bound[FAST_STATE_DIM];
    alignas(16) float angle_hi_bound[FAST_STATE_DIM];
    alignas(16) float length_lo_bound[FAST_STATE_DIM];
    alignas(16) float length_hi_bound[FAST_STATE_DIM];
    alignas(16) float joint_min_abs_bound[FAST_STATE_DIM];

    if (needsProjectedCandidates) {
        int i = 0;
        while (i < n) {
            float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
            float hi =  2.6179938779914943653855361527329f;

            if (hasTransitionRefAngles && i < referenceAngleCount) {
                const float ref = ref_state[i];
                const float lo_ref = ref - transitionMaxAngleStep;
                const float hi_ref = ref + transitionMaxAngleStep;
                if (lo_ref > lo) lo = lo_ref;
                if (hi_ref < hi) hi = hi_ref;
            }

            if (useJointAngleLimitConstraints) {
                const float lo_hard = -jointAngleHard;
                const float hi_hard =  jointAngleHard;
                if (lo_hard > lo) lo = lo_hard;
                if (hi_hard < hi) hi = hi_hard;
            }

            angle_lo_bound[i] = lo;
            angle_hi_bound[i] = hi;

            if (preserveTransitionJointMagnitudeFloor &&
                i < referenceAngleCount &&
                fabsf(ref_state[i]) > transitionJointSignDeadband) {
                joint_min_abs_bound[i] = cost.min_joint_abs_angle_bound(i);
            } else {
                joint_min_abs_bound[i] = 0.0f;
            }

            ++i;
        }

        if (transitionMode && cost.variableLen) {
            i = 0;
            while (i < n) {
                float lo = baseLengthLo;
                float hi = baseLengthHi;

                if (hasTransitionRefLengths && i < referenceLengthCount) {
                    const float refL = ref_state[n + i];
                    const float dL = cost.max_length_step(i);
                    const float lo_ref = refL - dL;
                    const float hi_ref = refL + dL;
                    if (lo_ref > lo) lo = lo_ref;
                    if (hi_ref < hi) hi = hi_ref;
                }

                length_lo_bound[i] = lo;
                length_hi_bound[i] = hi;
                ++i;
            }
        }
    }

    auto clampPointProjected = [&](float* q) noexcept {
        if (!needsProjectedCandidates) return;

        int i = 0;
        while (i < n) {
            float qi = q[i];
            const float lo = angle_lo_bound[i];
            const float hi = angle_hi_bound[i];

            if (qi < lo) qi = lo;
            else if (qi > hi) qi = hi;

            const float min_abs = joint_min_abs_bound[i];
            if (min_abs > 0.0f) {
                if (fabsf(qi) < min_abs) {
                    qi = copysignf(min_abs, ref_state[i]);
                    if (qi < lo) qi = lo;
                    else if (qi > hi) qi = hi;
                }
            }

            q[i] = qi;
            ++i;
        }

        if (transitionMode && cost.variableLen) {
            i = 0;
            while (i < n) {
                float qi = q[n + i];
                const float lo = length_lo_bound[i];
                const float hi = length_hi_bound[i];
                if (qi < lo) qi = lo;
                else if (qi > hi) qi = hi;
                q[n + i] = qi;
                ++i;
            }
        }
    };

    unsigned exchange_counter = 0u;
    unsigned exchange_counter_T = 0u;

    alignas(16) float M_by_span[12];
    for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior;
    float Mmax = M_prior;

    alignas(16) float phi[FAST_STATE_DIM];
    alignas(16) float s_arr[FAST_STATE_DIM];
    alignas(16) float c_arr[FAST_STATE_DIM];
    alignas(16) float sum_s[FAST_STATE_DIM];
    alignas(16) float sum_c[FAST_STATE_DIM];
    alignas(16) float q_try[FAST_STATE_DIM];

    bestQ.reserve(static_cast<size_t>(dim));
    AlignedFloatVec bestQIndexed;
    bestQIndexed.reserve(static_cast<size_t>(dim));

    AlignedFloatVec transitionReferenceSeed;
    float transitionRefF = FLT_MAX;
    unsigned transitionRefIdx = 0u;
    float transitionRefX = 0.0f;
    float transitionRefY = 0.0f;
    bool haveTransitionRefSeed = false;

    if (hasTransitionRefAngles) {
        const int refDim = cost.full_state_dim();
        transitionReferenceSeed.resize(static_cast<size_t>(refDim));
        std::memcpy(
            transitionReferenceSeed.data(),
            ref_state,
            static_cast<size_t>(refDim) * sizeof(float));

        float xr = 0.0f;
        float yr = 0.0f;
        unsigned idxr = 0u;
        float fr = 0.0f;
        const bool ref_ok = cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr);

        transitionRefF = fr;
        transitionRefIdx = idxr;
        transitionRefX = xr;
        transitionRefY = yr;
        haveTransitionRefSeed = ref_ok;
    }

    unsigned bestIndexFound = 0u;
    float bestIndexValue = FLT_MAX;
    float bestIndexedX = 0.0f;
    float bestIndexedY = 0.0f;

    BestSolutionMsg lastSentBestMsg;
    InitBestSolutionMsg(lastSentBestMsg);

    int no_improve = 0;
    const float a = 0.0f;
    const float b = 1.0f;
    float p = 0.0f;
    float dmax = b - a;
    const float initial_len = dmax;

    const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f);
    const float A_dim__ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f);
    const float B_dim = fmaf(A_dim, 0.7f, 0.0f);
    const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f);
    const float log_argument = A_dim - 2.03f;
    const float log_argument__ = A_dim__ - 2.0f;

    const float C_dim =
        fmaf(log_argument,
             fmaf(log_argument,
                  fmaf(log_argument,
                       fmaf(log_argument,
                            fmaf(log_argument, 0.164056f, -0.098462f),
                            0.240884f),
                       -0.351834f),
                  0.999996f),
             log_argument) - B_dim;

    const float C_dim__ =
        fmaf(log_argument__,
             fmaf(log_argument__,
                  fmaf(log_argument__,
                       fmaf(log_argument__,
                            fmaf(log_argument__, 0.164056f, -0.098462f),
                            0.240884f),
                       -0.351834f),
                  0.999996f),
             log_argument__) - B_dim__;

    const float adaptive_coeff_addition =
        fmaf(C_dim,
             fmaf(C_dim,
                  fmaf(C_dim,
                       fmaf(C_dim, 0.00833333377f, 0.0416666679f),
                       0.16666667f),
                  0.5f),
             1.0f);

    const float adaptive_coeff_addition__ =
        fmaf(C_dim__,
             fmaf(C_dim__,
                  fmaf(C_dim__,
                       fmaf(C_dim__, 0.00833333377f, 0.0416666679f),
                       0.16666667f),
                  0.5f),
             1.0f);

    float adaptive_coeff = A_dim - adaptive_coeff_addition;
    float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;

    int it = 0;
    int stag_boost_remaining = 0;
    float stag_r_multiplier = 0.0f;

    int last_send_T = 0;
    const int send_interval_T = 7;
    int last_send_best = 0;
    const int send_interval_best = 2;

    const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f));
    const int noImproveThrDim =
        static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f));

    const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params;

    auto clamp01 = [](float v) noexcept -> float {
        return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
    };

    auto lerp_hj = [](float a0, float a1, float t) noexcept -> float {
        return fmaf(t, a1 - a0, a0);
    };

    auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out) noexcept {
        period_out = hj_cfg.iteration_period;
        phase_out = hj_cfg.iteration_phase;

        int count = hj_cfg.iteration_schedule_count;
        if (count <= 0) return;
        if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES;

        const float p01 = clamp01(progress);
        for (int i = 0; i < count; ++i) {
            const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i];
            if (p01 < rule.progress_lt || i + 1 == count) {
                period_out = rule.period;
                phase_out = rule.phase;
                break;
            }
        }

        if (period_out < 1) period_out = 1;
        phase_out %= period_out;
        if (phase_out < 0) phase_out += period_out;
    };

    auto select_hj_coord_profile = [&](float progress) noexcept -> AGPHJCoordScaleProfile {
        int count = hj_cfg.coord_profile_schedule_count;
        if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES;

        const float p01 = clamp01(progress);
        if (count > 0) {
            for (int i = 0; i < count; ++i) {
                const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
                if (p01 < rule.progress_lt || i + 1 == count) {
                    return rule.profile;
                }
            }
        }

        AGPHJCoordScaleProfile fallback{};
        fallback.angle_scale_proximal = hj_cfg.angle_scale_first;
        fallback.angle_scale_distal = hj_cfg.angle_scale_last;
        fallback.length_scale_proximal = hj_cfg.length_scale_first;
        fallback.length_scale_distal = hj_cfg.length_scale_last;
        return fallback;
    };

    auto select_hj_order_mode = [&](float progress) noexcept -> int {
        int order_mode = hj_cfg.order_mode;
        int count = hj_cfg.order_schedule_count;
        if (count <= 0) return order_mode;
        if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES;

        const float p01 = clamp01(progress);
        for (int i = 0; i < count; ++i) {
            const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i];
            if (p01 < rule.progress_lt || i + 1 == count) {
                order_mode = rule.order_mode;
                break;
            }
        }
        return order_mode;
    };

    auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile) noexcept -> float {
        if (d < n) {
            const float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f;
            return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01);
        }
        if (cost.variableLen) {
            const int j = d - n;
            const float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f;
            return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01);
        }
        return 1.0f;
    };

    alignas(16) float hj_delta[FAST_STATE_DIM];
    alignas(16) float hj_profile_scale_applied[FAST_STATE_DIM];
    alignas(64) int hj_order_natural[FAST_STATE_DIM];
    alignas(64) int hj_order_reverse[FAST_STATE_DIM];
    alignas(64) int hj_order_interleaved_proximal[FAST_STATE_DIM];
    alignas(64) int hj_order_interleaved_distal[FAST_STATE_DIM];
    alignas(64) int hj_order_lengths_first[FAST_STATE_DIM];

    {
        int pos = 0;
        while (pos < dim) {
            hj_order_natural[pos] = pos;
            ++pos;
        }

        pos = 0;
        int d = dim - 1;
        while (d >= 0) {
            hj_order_reverse[pos] = d;
            ++pos;
            --d;
        }

        pos = 0;
        if (cost.variableLen) {
            int j = 0;
            while (j < n) {
                hj_order_interleaved_proximal[pos] = j;
                ++pos;
                hj_order_interleaved_proximal[pos] = n + j;
                ++pos;
                ++j;
            }
        } else {
            int j = 0;
            while (j < n) {
                hj_order_interleaved_proximal[pos] = j;
                ++pos;
                ++j;
            }
        }

        pos = 0;
        if (cost.variableLen) {
            int j = n - 1;
            while (j >= 0) {
                hj_order_interleaved_distal[pos] = j;
                ++pos;
                hj_order_interleaved_distal[pos] = n + j;
                ++pos;
                --j;
            }
        } else {
            int j = n - 1;
            while (j >= 0) {
                hj_order_interleaved_distal[pos] = j;
                ++pos;
                --j;
            }
        }

        pos = 0;
        if (cost.variableLen) {
            int j = 0;
            while (j < n) {
                hj_order_lengths_first[pos] = n + j;
                ++pos;
                ++j;
            }
        }
        int j = 0;
        while (j < n) {
            hj_order_lengths_first[pos] = j;
            ++pos;
            ++j;
        }
    }

    auto select_hj_order_ptr = [&](int order_mode) noexcept -> const int* {
        switch (order_mode) {
        case AGP_HJ_ORDER_NATURAL: return hj_order_natural;
        case AGP_HJ_ORDER_REVERSE: return hj_order_reverse;
        case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL: return hj_order_interleaved_proximal;
        case AGP_HJ_ORDER_LENGTHS_FIRST: return hj_order_lengths_first;
        default: return hj_order_interleaved_distal;
        }
    };

    int hj_profile_cache_key = INT_MIN;
    auto apply_hj_coord_profile = [&](float progress) noexcept {
        AGPHJCoordScaleProfile profile{};
        int key = -2;

        int count = hj_cfg.coord_profile_schedule_count;
        if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES;

        if (count > 0) {
            int i = 0;
            while (i < count) {
                const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
                if (progress < rule.progress_lt || i + 1 == count) {
                    profile = rule.profile;
                    key = i;
                    break;
                }
                ++i;
            }
        } else {
            profile.angle_scale_proximal = hj_cfg.angle_scale_first;
            profile.angle_scale_distal = hj_cfg.angle_scale_last;
            profile.length_scale_proximal = hj_cfg.length_scale_first;
            profile.length_scale_distal = hj_cfg.length_scale_last;
        }

        if (key == hj_profile_cache_key) return;

        int d = 0;
        while (d < dim) {
            const float new_scale = fmaxf(coord_profile_scale(d, profile), 1.0e-12f);
            const float old_scale = fmaxf(hj_profile_scale_applied[d], 1.0e-12f);
            if (fabsf(new_scale - old_scale) > 1.0e-12f) {
                hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta);
                hj_profile_scale_applied[d] = new_scale;
            }
            ++d;
        }

        hj_profile_cache_key = key;
    };

    {
        const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f);
        for (int d = 0; d < dim; ++d) {
            const float span = fmaf(map.high[d], 1.0f, -map.low[d]);
            const float scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1.0e-12f);
            hj_profile_scale_applied[d] = scale;
            hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs);
        }
    }

    auto t_to_idx = [&](float t) -> unsigned long long {
        return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
    };

    auto finalize_interval_geometry = [&](IntervalND* I) noexcept {
        I->i1 = t_to_idx(I->x1);
        I->i2 = t_to_idx(I->x2);
        I->diam = map.block_diameter(I->i1, I->i2);
        I->compute_span_level(map);
        I->set_metric(I->diam);
    };

    auto update_pockets_and_Mmax = [&](IntervalND* I) noexcept {
        const int k = I->span_level;
        if (I->M > M_by_span[k]) M_by_span[k] = I->M;
        if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
    };

    auto recompute_dmax = [&]() noexcept {
        float new_dmax = 0.0f;
        for (auto* pI : H) {
            const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
            if (Ls > new_dmax) new_dmax = Ls;
        }
        dmax = (new_dmax > 0.0f) ? new_dmax : initial_len;
    };

    auto recompute_heap_constM = [&](float m_cur) noexcept {
        const size_t sz = H.size();
        if (sz == 0u) return;
        if (!hasHardConstraints) RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur);
        else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur);
        std::make_heap(H.begin(), H.end(), ComparePtrND);
    };

    auto update_best_from_trial = [&](const TrialPoint& tr) noexcept {
        if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) {
            bestIndexFound = tr.idx;
            bestIndexValue = tr.f;
            agp_store_q(bestQIndexed, tr.q, dim);
            bestIndexedX = tr.x;
            bestIndexedY = tr.y;
        }
        if (tr.feasible) {
            if (tr.f < bestF) {
                bestF = tr.f;
                agp_store_q(bestQ, tr.q, dim);
                bestX = tr.x;
                bestY = tr.y;
                no_improve = 0;
            } else {
                ++no_improve;
            }
        }
    };

    auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io,
                            float t_lo, float t_hi, bool enforce_t_bounds) noexcept {
        if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f))) return;

        const float f_start = f_io;
        const float c1 = 1e-4f;
        const float tau = 0.5f;
        const int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p));
        const int max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p));
        const float lbfgs_trigger = 0.6f;
        const int m_lbfgs = 9;
        const int max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p));
        const float eps_lbfgs_curv = 1e-6f;
        const float eps_descent = 1e-12f;
        float eta = 2.0f / sqrtf(dim_f);

        auto computeGrad = [&](const float* q_in, const float x_in, const float y_in,
                               float* grad_out, float& grad_norm2_out) noexcept {
            float phi_acc_local = 0.0f;
            int ii = 0;
            while (ii < n) {
                phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local);
                phi[ii] = phi_acc_local;
                ++ii;
            }

            FABE13_SINCOS(phi, s_arr, c_arr, n);

            float acc_s = 0.0f;
            float acc_c = 0.0f;
            int kk = n - 1;
            while (kk >= 0) {
                const float Lk = cost.link_length(q_in, kk);
                acc_s = fmaf(Lk, s_arr[kk], acc_s);
                acc_c = fmaf(Lk, c_arr[kk], acc_c);
                sum_s[kk] = acc_s;
                sum_c[kk] = acc_c;
                --kk;
            }

            alignas(16) float prefix_tail_grad[FAST_STATE_DIM] = { 0.0f };

            if (transitionMode && hasTransitionRefAngles && cost.transitionPrefixEnergyWeight > 0.0f) {
                alignas(16) float prefix_delta[FAST_STATE_DIM] = { 0.0f };
                float phi_prefix = 0.0f;
                int jj = 0;
                while (jj < referenceAngleCount) {
                    phi_prefix += q_in[jj];
                    prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]);
                    ++jj;
                }

                float tail = 0.0f;
                jj = referenceAngleCount - 1;
                while (jj >= 0) {
                    tail += prefix_delta[jj];
                    prefix_tail_grad[jj] = tail;
                    --jj;
                }
            }

            const float dx = fmaf(x_in, 1.0f, -cost.targetX);
            const float dy = fmaf(y_in, 1.0f, -cost.targetY);
            const float dist2 = fmaf(dx, dx, dy * dy);
            const float dist = sqrtf(dist2);
            const float inv_dist = (dist > 1.0e-20f) ? (1.0f / dist) : 0.0f;

            grad_norm2_out = 0.0f;
            int i = 0;
            while (i < n) {
                float gpen = 0.0f;
                float g_main = 0.0f;

                if (transitionMode) {
                    const float dtheta =
                        (i < referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f;
                    gpen = 2.0f * cost.transitionEnergyWeight * dtheta;
                    gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen);
                    g_main = 2.0f * cost.transitionCaptureWeight *
                             fmaf(dx, -sum_s[i], dy * sum_c[i]);
                } else {
                    const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f);
                    const float exp_arg = -tsg;
                    const float exp_val =
                        fmaf(exp_arg,
                             fmaf(exp_arg,
                                  fmaf(exp_arg,
                                       fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
                                       0.16666667f),
                                  0.5f),
                             1.0f);
                    const float sig = 1.0f / (exp_val + 1.0f);
                    gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen);
                    g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist;
                }

                const float gi = g_main + gpen;
                grad_out[i] = gi;
                grad_norm2_out = fmaf(gi, gi, grad_norm2_out);
                ++i;
            }

            if (cost.variableLen) {
                int j = 0;
                while (j < n) {
                    float gi = 0.0f;
                    if (transitionMode) {
                        float gpenL = 0.0f;
                        if (hasTransitionRefLengths && j < referenceLengthCount) {
                            gpenL = 2.0f * cost.transitionLengthEnergyWeight *
                                    (q_in[n + j] - ref_state[n + j]);
                        }
                        gi = 2.0f * cost.transitionCaptureWeight *
                             fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL;
                    } else {
                        gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist;
                    }
                    grad_out[n + j] = gi;
                    grad_norm2_out = fmaf(gi, gi, grad_norm2_out);
                    ++j;
                }
            }
        };

        auto armijoLineSearch =
            [&](const float* q_base, const float f_base, const float* dir, const float gtd,
                float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out,
                unsigned& idx_out) noexcept -> bool {
                float alpha = alpha_io;
                int backtrack = 0;

                if (needsProjectedCandidates) {
                    while (backtrack < max_backtrack) {
                        int i = 0;
                        while (i < dim) {
                            q_out[i] = fmaf(alpha, dir[i], q_base[i]);
                            ++i;
                        }

                        clampPointProjected(q_out);

                        float x2 = 0.0f;
                        float y2 = 0.0f;
                        unsigned idx_try = 0u;
                        float val_try = 0.0f;
                        const bool feasible_try = cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try);

                        if (!feasible_try || !(val_try == val_try)) {
                            alpha *= tau;
                            ++backtrack;
                            continue;
                        }

                        const float t_try = clamp01(map.pointToT(q_out));
                        if (enforce_t_bounds) {
                            if (t_hi - t_lo > 2.0f * T_EPS) {
                                if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) {
                                    alpha *= tau;
                                    ++backtrack;
                                    continue;
                                }
                            } else {
                                if (fabsf(t_try - t_lo) > T_EPS) {
                                    alpha *= tau;
                                    ++backtrack;
                                    continue;
                                }
                            }
                        }

                        if (val_try <= f_base + c1 * alpha * gtd) {
                            alpha_io = alpha;
                            f_out = val_try;
                            x_out = x2;
                            y_out = y2;
                            idx_out = idx_try;
                            return true;
                        }

                        alpha *= tau;
                        ++backtrack;
                    }
                } else {
                    while (backtrack < max_backtrack) {
                        int i = 0;
                        while (i < dim) {
                            q_out[i] = fmaf(alpha, dir[i], q_base[i]);
                            ++i;
                        }

                        float x2 = 0.0f;
                        float y2 = 0.0f;
                        unsigned idx_try = 0u;
                        float val_try = 0.0f;
                        cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try);

                        const float t_try = clamp01(map.pointToT(q_out));
                        if (enforce_t_bounds) {
                            if (t_hi - t_lo > 2.0f * T_EPS) {
                                if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) {
                                    alpha *= tau;
                                    ++backtrack;
                                    continue;
                                }
                            } else {
                                if (fabsf(t_try - t_lo) > T_EPS) {
                                    alpha *= tau;
                                    ++backtrack;
                                    continue;
                                }
                            }
                        }

                        if (val_try <= f_base + c1 * alpha * gtd) {
                            alpha_io = alpha;
                            f_out = val_try;
                            x_out = x2;
                            y_out = y2;
                            idx_out = idx_try;
                            return true;
                        }

                        alpha *= tau;
                        ++backtrack;
                    }
                }

                return false;
            };

        bool lbfgs_already_tried = false;
        int outer = 0;

        while (outer < max_outer_iters) {
            float grad[FAST_STATE_DIM];
            float grad_norm2 = 0.0f;
            computeGrad(q_inout, x_io, y_io, grad, grad_norm2);

            if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1.0e-12f) break;

            float dir_gd[FAST_STATE_DIM];
            int i = 0;
            while (i < dim) {
                dir_gd[i] = -grad[i];
                ++i;
            }

            const float gtd_gd = -grad_norm2;
            float eta_trial = eta;
            float f_new = f_io;
            float x_new = x_io;
            float y_new = y_io;
            unsigned idx_new = idx_io;

            const bool found =
                armijoLineSearch(q_inout, f_io, dir_gd, gtd_gd, eta_trial,
                                 q_try, f_new, x_new, y_new, idx_new);
            if (!found) break;

            std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
            f_io = f_new;
            x_io = x_new;
            y_io = y_new;
            idx_io = idx_new;
            eta = eta_trial;

            const float rel_impr = (f_start > 1.0e-20f) ? ((f_start - f_io) / f_start) : 0.0f;
            if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) {
                lbfgs_already_tried = true;

                float q_resume[FAST_STATE_DIM];
                std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float));
                const float f_resume = f_io;
                const float x_resume = x_io;
                const float y_resume = y_io;
                const unsigned idx_resume = idx_io;
                const float eta_resume = eta;

                float q_best_lbfgs[FAST_STATE_DIM];
                std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float));
                float f_best_lbfgs = f_io;
                float x_best_lbfgs = x_io;
                float y_best_lbfgs = y_io;
                unsigned idx_best_lbfgs = idx_io;

                float s_hist[9][FAST_STATE_DIM];
                float y_hist[9][FAST_STATE_DIM];
                float rho_hist[9];
                float alpha_hist[9];
                int hist_size = 0;

                float gk[FAST_STATE_DIM];
                float gk_norm2 = 0.0f;
                computeGrad(q_inout, x_io, y_io, gk, gk_norm2);

                bool lbfgs_ok = true;
                float alpha_k = 2.0f / sqrtf(dim_f);
                int it_lbfgs = 0;

                while (it_lbfgs < max_lbfgs_iters) {
                    if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1.0e-12f) break;

                    float dir[FAST_STATE_DIM];

                    if (hist_size == 0) {
                        int d = 0;
                        while (d < dim) {
                            dir[d] = -gk[d];
                            ++d;
                        }
                    } else {
                        float q_vec[FAST_STATE_DIM];
                        int d = 0;
                        while (d < dim) {
                            q_vec[d] = gk[d];
                            ++d;
                        }

                        for (int jj = hist_size - 1; jj >= 0; --jj) {
                            float dot_sq = 0.0f;
                            int d2 = 0;
                            while (d2 < dim) {
                                dot_sq = fmaf(s_hist[jj][d2], q_vec[d2], dot_sq);
                                ++d2;
                            }
                            const float a_coeff = dot_sq * rho_hist[jj];
                            alpha_hist[jj] = a_coeff;
                            d2 = 0;
                            while (d2 < dim) {
                                q_vec[d2] = fmaf(-a_coeff, y_hist[jj][d2], q_vec[d2]);
                                ++d2;
                            }
                        }

                        float gamma = 1.0f;
                        {
                            const int last = hist_size - 1;
                            float yy = 0.0f;
                            int d2 = 0;
                            while (d2 < dim) {
                                yy = fmaf(y_hist[last][d2], y_hist[last][d2], yy);
                                ++d2;
                            }
                            const float ys = 1.0f / rho_hist[last];
                            if (yy > 0.0f) gamma = ys / yy;
                        }

                        float r_vec[FAST_STATE_DIM];
                        d = 0;
                        while (d < dim) {
                            r_vec[d] = gamma * q_vec[d];
                            ++d;
                        }

                        for (int jj = 0; jj < hist_size; ++jj) {
                            float dot_yr = 0.0f;
                            int d2 = 0;
                            while (d2 < dim) {
                                dot_yr = fmaf(y_hist[jj][d2], r_vec[d2], dot_yr);
                                ++d2;
                            }
                            const float b_coeff = dot_yr * rho_hist[jj];
                            const float coeff = alpha_hist[jj] - b_coeff;
                            d2 = 0;
                            while (d2 < dim) {
                                r_vec[d2] = fmaf(coeff, s_hist[jj][d2], r_vec[d2]);
                                ++d2;
                            }
                        }

                        d = 0;
                        while (d < dim) {
                            dir[d] = -r_vec[d];
                            ++d;
                        }
                    }

                    float gtd = 0.0f;
                    int d = 0;
                    while (d < dim) {
                        gtd = fmaf(gk[d], dir[d], gtd);
                        ++d;
                    }

                    if (!(gtd == gtd) || gtd >= -eps_descent) {
                        d = 0;
                        while (d < dim) {
                            dir[d] = -gk[d];
                            ++d;
                        }
                        gtd = -gk_norm2;
                        if (gtd >= -eps_descent) {
                            lbfgs_ok = false;
                            break;
                        }
                    }

                    float q_old[FAST_STATE_DIM];
                    std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float));
                    float g_old[FAST_STATE_DIM];
                    std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float));

                    float alpha_try = alpha_k;
                    float f_try = f_io;
                    float x_try = x_io;
                    float y_try = y_io;
                    unsigned idx_try = idx_io;

                    const bool step_ok =
                        armijoLineSearch(q_inout, f_io, dir, gtd, alpha_try,
                                         q_try, f_try, x_try, y_try, idx_try);
                    if (!step_ok) {
                        lbfgs_ok = false;
                        break;
                    }

                    std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
                    f_io = f_try;
                    x_io = x_try;
                    y_io = y_try;
                    idx_io = idx_try;
                    alpha_k = alpha_try;

                    computeGrad(q_inout, x_io, y_io, gk, gk_norm2);
                    if (!(gk_norm2 == gk_norm2)) {
                        lbfgs_ok = false;
                        break;
                    }

                    if (f_io < f_best_lbfgs) {
                        f_best_lbfgs = f_io;
                        std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float));
                        x_best_lbfgs = x_io;
                        y_best_lbfgs = y_io;
                        idx_best_lbfgs = idx_io;
                    }

                    float ys = 0.0f;
                    float s_new[FAST_STATE_DIM];
                    float y_new[FAST_STATE_DIM];
                    d = 0;
                    while (d < dim) {
                        const float sd = q_inout[d] - q_old[d];
                        const float yd = gk[d] - g_old[d];
                        s_new[d] = sd;
                        y_new[d] = yd;
                        ys = fmaf(yd, sd, ys);
                        ++d;
                    }

                    if (!(ys == ys) || ys <= eps_lbfgs_curv) {
                        hist_size = 0;
                    } else {
                        if (hist_size < m_lbfgs) {
                            const int idx_hist = hist_size;
                            rho_hist[idx_hist] = 1.0f / ys;
                            d = 0;
                            while (d < dim) {
                                s_hist[idx_hist][d] = s_new[d];
                                y_hist[idx_hist][d] = y_new[d];
                                ++d;
                            }
                            ++hist_size;
                        } else {
                            for (int jj = 0; jj < m_lbfgs - 1; ++jj) {
                                rho_hist[jj] = rho_hist[jj + 1];
                                d = 0;
                                while (d < dim) {
                                    s_hist[jj][d] = s_hist[jj + 1][d];
                                    y_hist[jj][d] = y_hist[jj + 1][d];
                                    ++d;
                                }
                            }
                            const int idx_hist = m_lbfgs - 1;
                            rho_hist[idx_hist] = 1.0f / ys;
                            d = 0;
                            while (d < dim) {
                                s_hist[idx_hist][d] = s_new[d];
                                y_hist[idx_hist][d] = y_new[d];
                                ++d;
                            }
                            hist_size = m_lbfgs;
                        }
                    }

                    if (gk_norm2 < 1.0e-12f) break;
                    ++it_lbfgs;
                }

                if (lbfgs_ok) {
                    std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float));
                    f_io = f_best_lbfgs;
                    x_io = x_best_lbfgs;
                    y_io = y_best_lbfgs;
                    idx_io = idx_best_lbfgs;
                    break;
                } else {
                    if (f_best_lbfgs < f_resume) {
                        std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float));
                        f_io = f_best_lbfgs;
                        x_io = x_best_lbfgs;
                        y_io = y_best_lbfgs;
                        idx_io = idx_best_lbfgs;
                    } else {
                        std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float));
                        f_io = f_resume;
                        x_io = x_resume;
                        y_io = y_resume;
                        idx_io = idx_resume;
                    }
                    eta = eta_resume;
                }
            }

            ++outer;
        }
    };

    auto evaluate_trial_from_q = [&](const float* q_seed, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint {
        TrialPoint tr{};
        std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float));
        tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f);
        if (tr.feasible) {
            refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds);
        }
        tr.t = clamp01(map.pointToT(tr.q));
        if (enforce_t_bounds) {
            if (tr.t < t_lo) tr.t = t_lo;
            if (tr.t > t_hi) tr.t = t_hi;
        }
        update_best_from_trial(tr);
        return tr;
    };

    auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint {
        alignas(16) float q_seed[FAST_STATE_DIM];
        map.map01ToPoint(t_base, q_seed);
        return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds);
    };

    auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp) noexcept -> IntervalND* {
        if (!(Lp.t + T_EPS < Rp.t)) return nullptr;
        IntervalND* I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx);
        finalize_interval_geometry(I);
        return I;
    };

    auto find_container_index = [&](float t) noexcept -> int {
        int best_pos = -1;
        float best_len = FLT_MAX;
        for (size_t i = 0u; i < H.size(); ++i) {
            IntervalND* I = H[i];
            if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) {
                const float len = fmaf(I->x2, 1.0f, -I->x1);
                if (len < best_len) {
                    best_len = len;
                    best_pos = static_cast<int>(i);
                }
            }
        }
        if (best_pos >= 0) return best_pos;
        for (size_t i = 0u; i < H.size(); ++i) {
            IntervalND* I = H[i];
            if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) {
                const float len = fmaf(I->x2, 1.0f, -I->x1);
                if (len < best_len) {
                    best_len = len;
                    best_pos = static_cast<int>(i);
                }
            }
        }
        return best_pos;
    };

    auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff_local) noexcept {
        if (H.empty()) return;

        const int pos = find_container_index(tr.t);
        if (pos < 0) return;

        IntervalND* src = H[static_cast<size_t>(pos)];
        if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) return;

        const float src_len = fmaf(src->x2, 1.0f, -src->x1);

        IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
        IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
        finalize_interval_geometry(L);
        finalize_interval_geometry(Rv);

        const float prevMmax = Mmax;

        H[static_cast<size_t>(pos)] = H.back();
        H.pop_back();
        delete src;

        if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND);

        update_pockets_and_Mmax(L);
        update_pockets_and_Mmax(Rv);

        const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
        L->ChangeCharacteristic(m_cur);
        Rv->ChangeCharacteristic(m_cur);

        H.emplace_back(L);
        std::push_heap(H.begin(), H.end(), ComparePtrND);
        H.emplace_back(Rv);
        std::push_heap(H.begin(), H.end(), ComparePtrND);

        if (Mmax > fmaf(adaptive_coeff_local, prevMmax, 0.0f)) recompute_heap_constM(m_cur);
        if (src_len + T_EPS >= dmax) recompute_dmax();
    };

    auto find_container_index_bounds = [&](const float t, float& lo, float& hi) noexcept -> int {
        int best_pos = -1;
        float best_len = FLT_MAX;

        size_t i = 0u;
        while (i < H.size()) {
            IntervalND* const I = H[i];
            if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) {
                const float len = fmaf(I->x2, 1.0f, -I->x1);
                if (len < best_len) {
                    best_len = len;
                    best_pos = static_cast<int>(i);
                }
            }
            ++i;
        }

        if (best_pos < 0) {
            i = 0u;
            while (i < H.size()) {
                IntervalND* const I = H[i];
                if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) {
                    const float len = fmaf(I->x2, 1.0f, -I->x1);
                    if (len < best_len) {
                        best_len = len;
                        best_pos = static_cast<int>(i);
                    }
                }
                ++i;
            }
        }

        if (best_pos >= 0) {
            const IntervalND* const I = H[static_cast<size_t>(best_pos)];
            lo = I->x1;
            hi = I->x2;
        } else {
            lo = 0.0f;
            hi = 1.0f;
        }

        return best_pos;
    };

    auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, const int pos, float r_eff_cur, float adaptive_coeff_local) noexcept {
        if (pos < 0 || static_cast<size_t>(pos) >= H.size()) {
            inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_local);
            return;
        }

        IntervalND* const src = H[static_cast<size_t>(pos)];
        if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) {
            inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_local);
            return;
        }

        const float src_len = fmaf(src->x2, 1.0f, -src->x1);

        IntervalND* const L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
        IntervalND* const Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
        finalize_interval_geometry(L);
        finalize_interval_geometry(Rv);

        const float prevMmax = Mmax;

        H[static_cast<size_t>(pos)] = H.back();
        H.pop_back();
        delete src;

        if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND);

        update_pockets_and_Mmax(L);
        update_pockets_and_Mmax(Rv);

        const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
        L->ChangeCharacteristic(m_cur);
        Rv->ChangeCharacteristic(m_cur);

        H.emplace_back(L);
        std::push_heap(H.begin(), H.end(), ComparePtrND);
        H.emplace_back(Rv);
        std::push_heap(H.begin(), H.end(), ComparePtrND);

        if (Mmax > fmaf(adaptive_coeff_local, prevMmax, 0.0f)) recompute_heap_constM(m_cur);
        if (src_len + T_EPS >= dmax) recompute_dmax();
    };

    auto progress_outgoing = [&]() noexcept {
        for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) {
            if (itReq->req.test()) itReq = g_pendingMulti.erase(itReq);
            else ++itReq;
        }
        for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) {
            if (itReq->req.test()) itReq = g_pendingBest.erase(itReq);
            else ++itReq;
        }
    };

    auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y) noexcept -> bool {
        auto take_indexed = [&]() noexcept -> bool {
            if (bestQIndexed.empty()) return false;
            seed_vec = &bestQIndexed;
            seed_f = bestIndexValue;
            seed_idx = bestIndexFound;
            seed_x = bestIndexedX;
            seed_y = bestIndexedY;
            return true;
        };

        auto take_feasible = [&]() noexcept -> bool {
            if (bestQ.empty()) return false;
            seed_vec = &bestQ;
            seed_f = bestF;
            seed_idx = fullConstraintIndex;
            seed_x = bestX;
            seed_y = bestY;
            return true;
        };

        auto take_reference = [&]() noexcept -> bool {
            if (!haveTransitionRefSeed) return false;
            seed_vec = &transitionReferenceSeed;
            seed_f = transitionRefF;
            seed_idx = transitionRefIdx;
            seed_x = transitionRefX;
            seed_y = transitionRefY;
            return true;
        };

        seed_vec = nullptr;
        seed_f = FLT_MAX;
        seed_idx = 0u;
        seed_x = 0.0f;
        seed_y = 0.0f;

        if (transitionMode) return take_feasible() || take_reference() || take_indexed();
        return take_indexed() || take_feasible();
    };

    auto should_run_hj_this_iter = [&](float progress) noexcept -> bool {
        if (!hj_cfg.enabled) return false;
        if (hj_cfg.require_adaptive_flag && !adaptive) return false;
        int period = 1;
        int phase = 0;
        select_hj_iteration_schedule(progress, period, phase);
        if (period <= 1) return true;
        return (it % period) == phase;
    };

    auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff_local, float progress) noexcept {
        if (H.empty()) return;

        const AlignedFloatVec* seedQ = nullptr;
        float seedF = FLT_MAX;
        float seedX = 0.0f;
        float seedY = 0.0f;
        unsigned seedIdx = 0u;

        if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return;

        TrialPoint base{};
        std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float));
        base.t = clamp01(map.pointToT(base.q));
        base.f = seedF;
        base.idx = seedIdx;
        base.x = seedX;
        base.y = seedY;
        base.feasible = (base.idx == fullConstraintIndex);

        auto evaluate_hj_candidate = [&](const float* q_candidate) noexcept -> TrialPoint {
            const float t_guess = clamp01(map.pointToT(q_candidate));
            float lo = 0.0f;
            float hi = 1.0f;
            const int pos = find_container_index_bounds(t_guess, lo, hi);
            const TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true);
            inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff_local);
            return tr;
        };

        apply_hj_coord_profile(progress);

        const int hj_order_mode = select_hj_order_mode(progress);
        const int* const hj_order = select_hj_order_ptr(hj_order_mode);

        TrialPoint current = base;
        TrialPoint base_before = base;
        bool improved_any = false;

        int ord = 0;
        while (ord < dim) {
            const int d = hj_order[ord];
            const float step = hj_delta[d];
            const TrialPoint axis_base = current;

            auto try_direction = [&](int sgn, TrialPoint& accepted_tp) noexcept -> bool {
                float q_candidate[FAST_STATE_DIM];
                std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float));
                q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]);
                if (needsProjectedCandidates) clampPointProjected(q_candidate);
                if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1.0e-12f) return false;

                const TrialPoint tp = evaluate_hj_candidate(q_candidate);
                if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) {
                    accepted_tp = tp;
                    return true;
                }
                return false;
            };

            const int first_sign = hj_cfg.plus_first ? +1 : -1;
            const int second_sign = -first_sign;

            bool improved_this_axis = false;
            TrialPoint best_axis_tp{};
            bool have_axis_tp = false;
            TrialPoint tp_try{};

            if (try_direction(first_sign, tp_try)) {
                best_axis_tp = tp_try;
                have_axis_tp = true;
                if (hj_cfg.opportunistic_axis) {
                    current = best_axis_tp;
                    improved_any = true;
                    improved_this_axis = true;
                }
            }

            if (!hj_cfg.opportunistic_axis || !have_axis_tp) {
                if (try_direction(second_sign, tp_try)) {
                    if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) {
                        best_axis_tp = tp_try;
                        have_axis_tp = true;
                    }
                }
                if (have_axis_tp) {
                    current = best_axis_tp;
                    improved_any = true;
                    improved_this_axis = true;
                }
            }

            if (improved_this_axis && hj_cfg.expand_delta_on_success) {
                hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor);
            }

            ++ord;
        }

        if (improved_any) {
            if (hj_cfg.use_pattern) {
                float q_pattern[FAST_STATE_DIM];
                for (int d = 0; d < dim; ++d) {
                    q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]);
                }
                if (needsProjectedCandidates) clampPointProjected(q_pattern);
                TrialPoint pat_tp = evaluate_hj_candidate(q_pattern);
                if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f)) {
                    current = pat_tp;
                }
            }
        } else {
            for (int d = 0; d < dim; ++d) {
                hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta);
            }
        }
    };

    auto ensure_queue_nonempty = [&](float r_eff_cur) noexcept -> bool {
        if (!H.empty()) return true;

        const AlignedFloatVec* seedQ = nullptr;
        float seedF = FLT_MAX;
        float seedX = 0.0f;
        float seedY = 0.0f;
        unsigned seedIdx = 0u;
        if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return false;

        const float t_center = clamp01(map.pointToT(seedQ->data()));
        const float half = fmaxf(1.0e-4f * dim_f, 1.0e-6f);
        const float t1b = clamp01(t_center - half);
        const float t2b = clamp01(t_center + half);
        const float mid = 0.5f * (t1b + t2b);

        const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true);
        const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);

        IntervalND* I = make_interval_from_trials(left_tp, right_tp);
        if (I == nullptr) return false;

        update_pockets_and_Mmax(I);
        I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f));
        H.emplace_back(I);
        std::push_heap(H.begin(), H.end(), ComparePtrND);
        recompute_dmax();
        return true;
    };

    while (g_world->iprobe(boost::mpi::any_source, 0)) {
        MultiCrossMsg dummy;
        g_world->recv(boost::mpi::any_source, 0, dummy);
    }
    while (g_world->iprobe(boost::mpi::any_source, 2)) {
        BestSolutionMsg dummy;
        g_world->recv(boost::mpi::any_source, 2, dummy);
    }

    const int seedStride = FAST_SEED_STRIDE;
    AlignedFloatVec seeds(static_cast<size_t>(FAST_MAX_SEEDS) * static_cast<size_t>(seedStride), 0.0f);

    const int seedCnt =
        generate_heuristic_seeds(cost, map, dim, seeds.data(), seedStride,
                                 static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()),
                                                            7919.0f,
                                                            static_cast<float>(seed))));

    const int K_raw =
        static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
    const int K = (K_raw > 2) ? K_raw : 2;

    H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u);

    const int rank = g_world->rank();
    const int world = g_world->size();
    const size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1));

    for (int i = 0; i < seedCnt; ++i) {
        const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride);
        const float t_seed = clamp01(map.pointToT(s));

        const float interval_size =
            (i < 3)
            ? fmaf(0.0004f, static_cast<float>(dim), 0.0f)
            : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f),
                   exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) *
                         log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) *
                         static_cast<float>(i - 3)),
                   0.0f);

        const float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed));
        const float t2b = clamp01(fmaf(interval_size, 1.0f, t_seed));
        const float mid = 0.5f * (t1b + t2b);

        const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true);
        const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);

        IntervalND* I = make_interval_from_trials(left_tp, right_tp);
        if (I == nullptr) continue;

        update_pockets_and_Mmax(I);
        I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));

        if (i < 3) {
            I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
        } else {
            const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
            const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
            const float mult =
                fmaf(start_mult,
                     exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) *
                           log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) *
                           static_cast<float>(i - 3)),
                     0.0f);
            I->R = fmaf(I->R, mult, 0.0f);
        }

        H.emplace_back(I);
        std::push_heap(H.begin(), H.end(), ComparePtrND);
    }

    std::vector<float> grid_base;
    grid_base.reserve(static_cast<size_t>(K) + 2u);
    grid_base.emplace_back(a);

    for (int k = 1; k <= K; ++k) {
        const float t =
            clamp01(fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a),
                         1.0f,
                         static_cast<float>(rank) / static_cast<float>(world * (K + 1))));
        grid_base.emplace_back(t);
    }
    grid_base.emplace_back(b);

    std::vector<TrialPoint> grid_trials(grid_base.size());
    for (size_t i = 0u; i < grid_base.size(); ++i) {
        float lo = grid_base[i];
        float hi = grid_base[i];
        if (i == 0u) {
            lo = hi = a;
        } else if (i + 1u == grid_base.size()) {
            lo = hi = b;
        } else {
            lo = 0.5f * (grid_base[i - 1u] + grid_base[i]);
            hi = 0.5f * (grid_base[i] + grid_base[i + 1u]);
        }
        grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true);
    }

    for (size_t i = 1u; i < grid_trials.size(); ++i) {
        IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]);
        if (I == nullptr) continue;
        update_pockets_and_Mmax(I);
        I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
        H.emplace_back(I);
        std::push_heap(H.begin(), H.end(), ComparePtrND);
    }

    recompute_dmax();

    if (haveTransitionRefSeed) {
        const TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false);
        if (!H.empty()) {
            inject_trial_into_queue(ref_tp, r, 1.0f);
            recompute_dmax();
        } else {
            ensure_queue_nonempty(r);
        }
    }

    g_world->barrier();

    while (true) {
        p = fmaf(-1.0f / initial_len, dmax, 1.0f);

        stag_r_multiplier =
            1.4f - 1.1f *
            fmaf(0.65f * p - 0.45f,
                 fmaf(0.65f * p - 0.45f,
                      fmaf(0.65f * p - 0.45f,
                           fmaf(0.65f * p - 0.45f,
                                fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f),
                                0.240884f),
                           -0.351834f),
                      0.999996f),
                 0.65f * p - 0.45f);

        const float p_arg = fmaf(p, 2.3f, -2.9775f);

        float current_r = r;
        if (stag_boost_remaining > 0) {
            current_r = r * stag_r_multiplier;
            --stag_boost_remaining;
        }

        const float exp_arg = fmaf(B_dim, p, 0.0f);
        const float exp_arg__ = fmaf(B_dim__, p, 0.0f);

        adaptive_coeff =
            fmaf(-fmaf(exp_arg,
                       fmaf(exp_arg,
                            fmaf(exp_arg,
                                 fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
                                 0.16666667f),
                            0.5f),
                       1.0f),
                 adaptive_coeff_addition,
                 A_dim);

        const float first_sqrt  = sqrtf(fmaf(1.0f / dim_f,         2.0f, 0.0f));
        const float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f));
        const float third_sqrt  = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f));
        const float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f));
        const float rr = sqrtf(fmaf(-p, 1.0f, 1.0f));
        const float xx = p * p;
        const float tt = fmaf(500.0f, p, -486.95472f);

        const float adaptive_coeff_ =
            (p < 0.95f)
            ? fmaf(fmaf(first_sqrt, xx, 0.0f),
                   0.0130349902f,
                   fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f)))
            : (p < 0.97390944f)
              ? fmaf(second_sqrt, rr, 0.9396f)
              : (p < 0.97590944f)
                ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f),
                             fmaf(-2.0f, tt, 3.0f), 0.0f),
                       fmaf(0.25f, rr, -0.0396f),
                       fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f))
                : fmaf(fourth_sqrt, rr, 0.925f);

        adaptive_coeff__ =
            fmaf(fmaf(exp_arg__,
                      fmaf(exp_arg__,
                           fmaf(exp_arg__,
                                fmaf(exp_arg__, 0.00833333377f, 0.0416666679f),
                                0.16666667f),
                           0.5f),
                      1.0f),
                 adaptive_coeff_addition__,
                 2.0f - A_dim__);

        float grad_norm2_best = 0.0f;
        if (no_improve > 0 && !bestQ.empty()) {
            float acc_best = 0.0f;
            float phi_best[FAST_STATE_DIM];
            int ii_best = 0;
            while (ii_best < n) {
                acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best);
                phi_best[ii_best] = acc_best;
                ++ii_best;
            }

            float s_best[FAST_STATE_DIM];
            float c_best[FAST_STATE_DIM];
            FABE13_SINCOS(phi_best, s_best, c_best, n);

            float as_best = 0.0f;
            float ac_best = 0.0f;
            float sum_s_best[FAST_STATE_DIM];
            float sum_c_best[FAST_STATE_DIM];
            int k_best = n - 1;
            while (k_best >= 0) {
                const float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength;
                as_best = fmaf(Lk, s_best[k_best], as_best);
                ac_best = fmaf(Lk, c_best[k_best], ac_best);
                sum_s_best[k_best] = as_best;
                sum_c_best[k_best] = ac_best;
                --k_best;
            }

            alignas(16) float prefix_tail_best[FAST_STATE_DIM] = { 0.0f };
            if (transitionMode && hasTransitionRefAngles && cost.transitionPrefixEnergyWeight > 0.0f) {
                alignas(16) float prefix_delta_best[FAST_STATE_DIM] = { 0.0f };
                float phi_pref = 0.0f;
                int jj = 0;
                while (jj < referenceAngleCount) {
                    phi_pref += bestQ[jj];
                    prefix_delta_best[jj] =
                        ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]);
                    ++jj;
                }
                float tail = 0.0f;
                jj = referenceAngleCount - 1;
                while (jj >= 0) {
                    tail += prefix_delta_best[jj];
                    prefix_tail_best[jj] = tail;
                    --jj;
                }
            }

            const float dx_best = fmaf(bestX, 1.0f, -cost.targetX);
            const float dy_best = fmaf(bestY, 1.0f, -cost.targetY);
            const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best));
            const float inv_dist_best = (dist_best > 1.0e-20f) ? (1.0f / dist_best) : 0.0f;

            int i_best = 0;
            while (i_best < n) {
                float gpen_best = 0.0f;
                float g_main_best = 0.0f;

                if (transitionMode) {
                    const float dtheta_best =
                        (i_best < referenceAngleCount)
                        ? cost.wrapped_angle_delta(bestQ.data(), i_best)
                        : 0.0f;
                    gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best;
                    gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight,
                                     prefix_tail_best[i_best], gpen_best);
                    g_main_best = 2.0f * cost.transitionCaptureWeight *
                                  fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]);
                } else {
                    const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f);
                    const float exp_arg_g = -tsg;
                    const float exp_val =
                        fmaf(exp_arg_g,
                             fmaf(exp_arg_g,
                                  fmaf(exp_arg_g,
                                       fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f),
                                       0.16666667f),
                                  0.5f),
                             1.0f);
                    const float sig = 1.0f / (exp_val + 1.0f);
                    gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best);
                    g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best;
                }

                const float gi_best = g_main_best + gpen_best;
                grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best);
                ++i_best;
            }

            if (cost.variableLen) {
                int j_best = 0;
                while (j_best < n) {
                    float gi_best = 0.0f;
                    if (transitionMode) {
                        float gpenL_best = 0.0f;
                        if (hasTransitionRefLengths && j_best < referenceLengthCount) {
                            gpenL_best = 2.0f * cost.transitionLengthEnergyWeight *
                                         (bestQ[n + j_best] - ref_state[n + j_best]);
                        }
                        gi_best = 2.0f * cost.transitionCaptureWeight *
                                  fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) +
                                  gpenL_best;
                    } else {
                        gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best;
                    }
                    grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best);
                    ++j_best;
                }
            }
        }

        const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f);

        const float r_eff =
            (dim > 2)
            ? fmaf(-fmaf(p_arg,
                         fmaf(p_arg,
                              fmaf(p_arg,
                                   fmaf(p_arg, 0.00833333377f, 0.0416666679f),
                                   0.16666667f),
                              0.5f),
                         1.0f) + 1.05f,
                   fmaf(sqrtf(dim_f - 1), current_r, 0.0f),
                   0.0f)
            : fmaf(-fmaf(p_arg,
                         fmaf(p_arg,
                              fmaf(p_arg,
                                   fmaf(p_arg, 0.00833333377f, 0.0416666679f),
                                   0.16666667f),
                              0.5f),
                         1.0f) + 1.05f,
                   current_r,
                   0.0f);

        if (stagnation) {
            stag_boost_remaining = n_stag_iters;

            float t_seeds[32];
            int seed_count = 0;

            if (hasTransitionRefAngles) {
                alignas(16) float q_center[FAST_STATE_DIM];
                std::memcpy(q_center, ref_state, static_cast<size_t>(dim) * sizeof(float));
                t_seeds[seed_count++] = clamp01(map.pointToT(q_center));

                if (!bestQ.empty() && seed_count < 32) {
                    t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data()));
                }

                unsigned st_local = seed + static_cast<unsigned>(it) + 911u;
                const int local_variants = 2 + static_cast<int>(sqrtf(dim_f));

                for (int v = 0; v < local_variants && seed_count < 32; ++v) {
                    float q_temp[FAST_STATE_DIM];
                    std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float));

                    st_local ^= st_local << 13;
                    st_local ^= st_local >> 17;
                    st_local ^= st_local << 5;
                    const float rnd0 =
                        static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;

                    const int aidx = v % n;
                    q_temp[aidx] =
                        fmaf((2.0f * rnd0 - 1.0f), 0.25f * transitionMaxAngleStep, q_temp[aidx]);

                    if (cost.variableLen && hasTransitionRefLengths) {
                        st_local ^= st_local << 13;
                        st_local ^= st_local >> 17;
                        st_local ^= st_local << 5;
                        const float rnd1 =
                            static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;
                        const int lidx = v % n;
                        q_temp[n + lidx] =
                            fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]);
                    }

                    clampPointProjected(q_temp);
                    t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
                }
            } else {
                const int num_ik = 1 + static_cast<int>(sqrtf(dim_f));
                const float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY);

                float max_reach = 0.0f;
                if (cost.variableLen) {
                    for (int ii = 0; ii < n; ++ii) max_reach += map.high[n + ii];
                } else {
                    max_reach = static_cast<float>(n) * cost.fixedLength;
                }

                const float ratio = dist_to_target / fmaxf(max_reach, 1.0e-20f);
                const bool prefer_extended = (ratio > 0.7f);
                const bool prefer_compact = (ratio < 0.4f);
                const bool use_ik = !(ratio > 0.4f && ratio < 0.7f);

                if (!use_ik) {
                    AlignedFloatVec temp_S(
                        static_cast<size_t>(FAST_MAX_SEEDS) * static_cast<size_t>(FAST_SEED_STRIDE),
                        0.0f);

                    const int sobol_gen =
                        generate_sobol_seeds_limited(
                            map, dim, temp_S.data(), FAST_SEED_STRIDE,
                            seed + static_cast<unsigned>(it), FAST_MAX_SEEDS);

                    const int num_sobol = num_ik;
                    for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) {
                        const float* s =
                            temp_S.data() + static_cast<size_t>(kk) * static_cast<size_t>(FAST_SEED_STRIDE);
                        t_seeds[seed_count++] = clamp01(map.pointToT(s));
                    }
                } else {
                    float angles_ccd[32] = { 0.0f };
                    float lengths_ccd[32];

                    if (cost.variableLen) {
                        const float len_low = map.low[n];
                        const float len_high = map.high[n];
                        const float avg_len = 0.5f * (len_low + len_high);
                        for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = avg_len;
                    } else {
                        for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = cost.fixedLength;
                    }

                    ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10);

                    float angles_fabrik[32] = { 0.0f };
                    float lengths_fabrik[32];

                    if (cost.variableLen) {
                        for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = lengths_ccd[ii];
                    } else {
                        for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = cost.fixedLength;
                    }

                    float targetX_fab = cost.targetX;
                    float targetY_fab = cost.targetY;
                    for (int iter_fab = 0; iter_fab < 3; ++iter_fab) {
                        float prevX = targetX_fab;
                        float prevY = targetY_fab;
                        for (int j = n - 1; j >= 0; --j) {
                            const float len = lengths_fabrik[j];
                            const float angle_to_target = atan2f(prevY, prevX);
                            angles_fabrik[j] = angle_to_target;
                            float s_val, c_val;
                            FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1);
                            prevX = prevX - len * c_val;
                            prevY = prevY - len * s_val;
                        }
                    }

                    if (prefer_extended) {
                        {
                            float q_ccd[FAST_STATE_DIM];
                            for (int ii = 0; ii < n; ++ii) q_ccd[ii] = angles_ccd[ii];
                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) q_ccd[n + ii] = lengths_ccd[ii];
                            }
                            t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd));
                        }

                        unsigned st_ik = seed + static_cast<unsigned>(it) + 222u;
                        const int remaining = num_ik - 1;
                        for (int v = 0; v < remaining && seed_count < 32; ++v) {
                            float noisy_angles[32];
                            float noisy_lengths[32];

                            for (int ii = 0; ii < n; ++ii) {
                                st_ik ^= st_ik << 13;
                                st_ik ^= st_ik >> 17;
                                st_ik ^= st_ik << 5;
                                const float rnd =
                                    static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
                                noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f;
                                const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f;
                                const float hi =  2.6179938779914944f;
                                if (noisy_angles[ii] < lo) noisy_angles[ii] = lo;
                                if (noisy_angles[ii] > hi) noisy_angles[ii] = hi;
                            }

                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) {
                                    st_ik ^= st_ik << 13;
                                    st_ik ^= st_ik >> 17;
                                    st_ik ^= st_ik << 5;
                                    const float rnd =
                                        static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
                                    noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f;
                                    if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f;
                                    if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f;
                                }
                            }

                            float q_temp[FAST_STATE_DIM];
                            for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii];
                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii];
                            }
                            t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
                        }
                    } else if (prefer_compact) {
                        {
                            float q_fabrik[FAST_STATE_DIM];
                            for (int ii = 0; ii < n; ++ii) q_fabrik[ii] = angles_fabrik[ii];
                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) q_fabrik[n + ii] = lengths_fabrik[ii];
                            }
                            t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik));
                        }

                        unsigned st_ik = seed + static_cast<unsigned>(it) + 222u;
                        const int remaining = num_ik - 1;
                        for (int v = 0; v < remaining && seed_count < 32; ++v) {
                            float noisy_angles[32];
                            float noisy_lengths[32];

                            for (int ii = 0; ii < n; ++ii) {
                                st_ik ^= st_ik << 13;
                                st_ik ^= st_ik >> 17;
                                st_ik ^= st_ik << 5;
                                const float rnd =
                                    static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
                                noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f;
                                const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f;
                                const float hi =  2.6179938779914944f;
                                if (noisy_angles[ii] < lo) noisy_angles[ii] = lo;
                                if (noisy_angles[ii] > hi) noisy_angles[ii] = hi;
                            }

                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) {
                                    st_ik ^= st_ik << 13;
                                    st_ik ^= st_ik >> 17;
                                    st_ik ^= st_ik << 5;
                                    const float rnd =
                                        static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
                                    noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f;
                                    if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f;
                                    if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f;
                                }
                            }

                            float q_temp[FAST_STATE_DIM];
                            for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii];
                            if (cost.variableLen) {
                                for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii];
                            }
                            t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
                        }
                    }
                }
            }

            for (int s = 0; s < seed_count; ++s) {
                const float t_center = t_seeds[s];
                const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f);
                const float t1b = clamp01(t_center - 0.5f * interval_size);
                const float t2b = clamp01(t_center + 0.5f * interval_size);
                const float mid = 0.5f * (t1b + t2b);

                const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true);
                const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);

                IntervalND* I = make_interval_from_trials(left_tp, right_tp);
                if (I == nullptr) continue;

                update_pockets_and_Mmax(I);
                I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
                I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f);
                H.emplace_back(I);
                std::push_heap(H.begin(), H.end(), ComparePtrND);
            }

            recompute_dmax();
            no_improve = 0;
        }

        const float bestFOld = bestF;
        const bool local_idle = !ensure_queue_nonempty(r_eff);
        const bool do_hj_this_iter = should_run_hj_this_iter(p);

        if (!local_idle) {
            if (do_hj_this_iter) {
                run_hooke_jeeves_iteration(r_eff, adaptive_coeff_, p);
            } else {
                std::pop_heap(H.begin(), H.end(), ComparePtrND);
                IntervalND* cur = H.back();
                H.pop_back();

                const float x1 = cur->x1;
                const float x2 = cur->x2;
                const float y1 = cur->y1;
                const float y2 = cur->y2;

                float m = fmaf(r_eff, Mmax, 0.0f);
                const float tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2);
                const TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true);

                IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx);
                IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2);
                finalize_interval_geometry(L);
                finalize_interval_geometry(Rv);

                const float Mloc = fmaxf(L->M, Rv->M);
                const float prevMmax = Mmax;
                update_pockets_and_Mmax(L);
                update_pockets_and_Mmax(Rv);

                m = fmaf(r_eff, Mmax, 0.0f);

                const float len1 = fmaf(trNew.t, 1.0f, -x1);
                const float len2 = fmaf(x2, 1.0f, -trNew.t);

                if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) {
                    dmax = fmaxf(len1, len2);
                    for (auto* pI : H) {
                        const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
                        if (Ls > dmax) dmax = Ls;
                    }
                }

                if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
                    const float alpha = p * p;
                    const float beta = fmaf(-alpha, 1.0f, 2.0f);
                    const float MULT = (1.0f / fmaxf(dmax, 1.0e-20f)) * Mmax;
                    const float global_coeff = fmaf(MULT, r_eff, -MULT);
                    const float GF = beta * global_coeff;

                    L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
                    Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));

                    const size_t sz = H.size();
                    if (sz > 0u) {
                        if (!hasHardConstraints) RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
                        else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha);
                        std::make_heap(H.begin(), H.end(), ComparePtrND);
                    }
                } else {
                    L->ChangeCharacteristic(m);
                    Rv->ChangeCharacteristic(m);
                    if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
                        recompute_heap_constM(m);
                    }
                }

                H.emplace_back(L);
                std::push_heap(H.begin(), H.end(), ComparePtrND);
                H.emplace_back(Rv);
                std::push_heap(H.begin(), H.end(), ComparePtrND);
                delete cur;
            }
        }

        if (!H.empty()) {
            _mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0);
            if (H.size() > 1u) _mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0);
        }

        if (!H.empty() &&
            (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff__, 0.0f)) &&
            it - last_send_T >= send_interval_T) {
            last_send_T = it;
            progress_outgoing();

            const size_t scan_count = H.size();
            unsigned intervals_to_send =
                (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u;
            if (intervals_to_send > scan_count) intervals_to_send = static_cast<unsigned>(scan_count);

            if (intervals_to_send > 0u) {
                const float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f));
                const float beta = 1.0f - alpha;
                const float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f);
                const float w_size = 5.085f - w_pos;
                const int num_bins = (p < 1.0f - 1.0e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1;

                std::vector<float> pos_metric(scan_count);
                std::vector<float> size_metric(scan_count);
                std::vector<float> R_val(scan_count);

                float R_max = -FLT_MAX;
                float R_min = FLT_MAX;
                float size_max = -FLT_MAX;
                float size_min = FLT_MAX;
                float pos_max = -FLT_MAX;
                float pos_min = FLT_MAX;

                for (size_t idx = 0u; idx < scan_count; ++idx) {
                    IntervalND* I = H[idx];
                    const float center = 0.5f * (I->x1 + I->x2);
                    const float len = fmaf(I->x2, 1.0f, -I->x1);
                    const float size = len * static_cast<float>(1 << I->span_level);

                    if (p < 0.95f && num_bins > 1) {
                        float bin = floorf(center * static_cast<float>(num_bins));
                        if (bin >= static_cast<float>(num_bins)) bin = static_cast<float>(num_bins - 1);
                        pos_metric[idx] = bin / static_cast<float>(num_bins - 1);
                    } else {
                        pos_metric[idx] = center;
                    }

                    size_metric[idx] = size;
                    R_val[idx] = I->R;

                    if (pos_metric[idx] > pos_max) pos_max = pos_metric[idx];
                    if (pos_metric[idx] < pos_min) pos_min = pos_metric[idx];
                    if (size > size_max) size_max = size;
                    if (size < size_min) size_min = size;
                    if (I->R > R_max) R_max = I->R;
                    if (I->R < R_min) R_min = I->R;
                }

                auto norm = [](float val, float minv, float maxv) noexcept -> float {
                    const float denom = maxv - minv;
                    if (fabsf(denom) <= 1.0e-20f) return 0.0f;
                    return (val - minv) / denom;
                };

                std::vector<float> R_norm(scan_count);
                std::vector<float> pos_norm(scan_count);
                std::vector<float> size_norm(scan_count);

                for (size_t idx = 0u; idx < scan_count; ++idx) {
                    R_norm[idx] = norm(R_val[idx], R_min, R_max);
                    pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max);
                    size_norm[idx] = norm(size_metric[idx], size_min, size_max);
                }

                std::vector<int> selected;
                selected.reserve(intervals_to_send);
                std::vector<unsigned char> used(scan_count, 0u);

                selected.push_back(0);
                used[0] = true;

                while (selected.size() < intervals_to_send) {
                    int best_idx = -1;
                    float best_score = -FLT_MAX;

                    for (size_t idx = 0u; idx < scan_count; ++idx) {
                        if (used[idx] != 0u) continue;

                        float min_dist = FLT_MAX;
                        for (int s : selected) {
                            float d2 = 0.0f;
                            float d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)];
                            d2 += w_pos * d * d;
                            d = size_norm[idx] - size_norm[static_cast<size_t>(s)];
                            d2 += w_size * d * d;
                            if (d2 < min_dist) min_dist = d2;
                        }

                        const float novelty =
                            sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1.0e-20f));
                        const float score = alpha * R_norm[idx] + beta * novelty;

                        if (score > best_score) {
                            best_score = score;
                            best_idx = static_cast<int>(idx);
                        }
                    }

                    if (best_idx < 0) break;
                    selected.push_back(best_idx);
                    used[static_cast<size_t>(best_idx)] = 1u;
                }

                MultiCrossMsg out{};
                out.count = static_cast<unsigned>(selected.size());
                float* dest = out.intervals;
                for (unsigned s = 0u; s < out.count; ++s) {
                    IntervalND* Tt = H[static_cast<size_t>(selected[s])];
                    dest[0] = Tt->x1;
                    dest[1] = 0.0f;
                    dest[2] = Tt->x2;
                    dest[3] = 0.0f;
                    dest[4] = Tt->R;
                    dest += 5;
                }

                const size_t iterations = comm_levels;
                bool active = true;
                const bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0;
                size_t ii2 = 0u;
                while (ii2 < iterations && active) {
                    const size_t step = 1ULL << ii2;
                    const int partner = rank ^ static_cast<int>(step);
                    if (partner < world) {
                        const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
                        if (am_sender) {
                            g_pendingMulti.emplace_back(*g_world, partner, out);
                            if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) {
                                g_pendingMulti.front().req.wait();
                                g_pendingMulti.pop_front();
                            }
                            active = false;
                        }
                    }
                    ++ii2;
                }

                ++exchange_counter_T;
            }
        }

        const bool indexedBestImprovedSinceLastSend =
            !bestQIndexed.empty() &&
            better_indexed(bestIndexFound, bestIndexValue,
                           lastSentBestMsg.bestIndex, lastSentBestMsg.bestF);

        if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) {
            last_send_best = it;
            progress_outgoing();

            const size_t iterations = comm_levels;
            bool active = true;
            const bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0;

            size_t ii2 = 0u;
            while (ii2 < iterations && active) {
                const size_t step = 1ULL << ii2;
                const int partner = rank ^ static_cast<int>(step);
                if (partner < world) {
                    const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best);
                    if (am_sender) {
                        BestSolutionMsg outMsg;
                        InitBestSolutionMsg(outMsg);
                        if (FillBestSolutionMsg(outMsg,
                                                bestIndexFound, bestIndexValue,
                                                bestIndexedX, bestIndexedY,
                                                bestQIndexed) &&
                            better_indexed(outMsg.bestIndex, outMsg.bestF,
                                           lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) {
                            g_pendingBest.emplace_back(*g_world, partner, outMsg);
                            lastSentBestMsg = outMsg;
                        }
                        if (g_pendingBest.size() > MAX_INFLIGHT_BEST) {
                            g_pendingBest.front().req.wait();
                            g_pendingBest.pop_front();
                        }
                        active = false;
                    }
                }
                ++ii2;
            }

            ++exchange_counter;
        }

        while (g_world->iprobe(boost::mpi::any_source, 0)) {
            MultiCrossMsg in;
            g_world->recv(boost::mpi::any_source, 0, in);

            for (unsigned ii = 0u; ii < in.count; ++ii) {
                const float* d = &in.intervals[ii * 5u];
                const float sx = d[0];
                const float ex = d[2];

                alignas(16) float ql[FAST_STATE_DIM];
                float tx = 0.0f;
                float ty = 0.0f;
                unsigned v1i = 0u;
                unsigned v2i = 0u;
                float y1i = 0.0f;
                float y2i = 0.0f;

                map.map01ToPoint(sx, ql);
                cost.evaluate_indexed(ql, tx, ty, v1i, y1i);
                map.map01ToPoint(ex, ql);
                cost.evaluate_indexed(ql, tx, ty, v2i, y2i);

                IntervalND* inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i);
                finalize_interval_geometry(inj);

                const float prevMmax = Mmax;
                const float candidateMmax = fmaxf(Mmax, inj->M);
                const float m_inj = fmaf(r_eff, candidateMmax, 0.0f);
                inj->ChangeCharacteristic(m_inj);

                if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) {
                    update_pockets_and_Mmax(inj);
                    H.emplace_back(inj);
                    std::push_heap(H.begin(), H.end(), ComparePtrND);
                    if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
                        recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f));
                    }
                } else {
                    delete inj;
                }
            }
        }

        while (g_world->iprobe(boost::mpi::any_source, 2)) {
            BestSolutionMsg incoming;
            g_world->recv(boost::mpi::any_source, 2, incoming);
            UpdateIndexedBestFromMessage(
                incoming, fullConstraintIndex,
                bestIndexFound, bestIndexValue,
                bestQIndexed, bestIndexedX, bestIndexedY,
                bestF, bestQ, bestX, bestY);
        }

        const float interval_len =
            H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam);

        if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps ||
            it == maxIter - 1) {
            for (auto& s : g_pendingMulti) s.req.wait();
            for (auto& s : g_pendingBest) s.req.wait();
            g_pendingMulti.clear();
            g_pendingBest.clear();

            while (g_world->iprobe(boost::mpi::any_source, 2)) {
                BestSolutionMsg incoming;
                g_world->recv(boost::mpi::any_source, 2, incoming);
                UpdateIndexedBestFromMessage(
                    incoming, fullConstraintIndex,
                    bestIndexFound, bestIndexValue,
                    bestQIndexed, bestIndexedX, bestIndexedY,
                    bestF, bestQ, bestX, bestY);
            }

            g_world->barrier();

            BestSolutionMsg bestMsg;
            InitBestSolutionMsg(bestMsg);
            if (!bestQIndexed.empty()) {
                FillBestSolutionMsg(bestMsg,
                                    bestIndexFound, bestIndexValue,
                                    bestIndexedX, bestIndexedY,
                                    bestQIndexed);
            }

            const size_t iterations = comm_levels;
            bool active = true;
            size_t itx = 0u;
            while (itx < iterations && active) {
                const size_t step = 1ULL << itx;
                const int partner = rank ^ static_cast<int>(step);
                if (partner < world) {
                    const bool am_sender = (rank & static_cast<int>(step)) != 0;
                    if (am_sender) {
                        g_world->send(partner, 3, bestMsg);
                        active = false;
                    } else {
                        BestSolutionMsg inBest;
                        InitBestSolutionMsg(inBest);
                        g_world->recv(partner, 3, inBest);
                        if (better_indexed(inBest.bestIndex, inBest.bestF,
                                           bestMsg.bestIndex, bestMsg.bestF)) {
                            bestMsg = inBest;
                        }
                    }
                }
                ++itx;
            }

            if (rank == 0) {
                UpdateIndexedBestFromMessage(
                    bestMsg, fullConstraintIndex,
                    bestIndexFound, bestIndexValue,
                    bestQIndexed, bestIndexedX, bestIndexedY,
                    bestF, bestQ, bestX, bestY);
            }

            const bool haveFeasibleBest =
                (bestIndexFound == fullConstraintIndex) && !bestQ.empty();

            if (haveFeasibleBest && !transitionMode) {
                alignas(16) float q_local[FAST_STATE_DIM];
                std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));

                float x_final = bestX;
                float y_final = bestY;
                const float f_final = bestF;

                const int last = n - 1;
                const float lo =
                    (last == 0)
                    ? -1.0471975511965977461542144610932f
                    : -2.6179938779914943653855361527329f;
                const float hi = 2.6179938779914943653855361527329f;

                float bestLocF = f_final;
                float saved = q_local[last];
                float delta = 0.05f;

                while (delta >= 0.00625f) {
                    int sgn = -1;
                    while (sgn < 2) {
                        float cand = fmaf(static_cast<float>(sgn), delta, saved);
                        if (cand < lo) cand = lo;
                        else if (cand > hi) cand = hi;

                        const float backup = q_local[last];
                        q_local[last] = cand;

                        float x2_loc = 0.0f;
                        float y2_loc = 0.0f;
                        unsigned idx2 = 0u;
                        float val2 = 0.0f;
                        const bool feasible2 = cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2);

                        if (feasible2 && val2 < bestLocF) {
                            bestLocF = val2;
                            x_final = x2_loc;
                            y_final = y2_loc;
                            saved = cand;
                        }

                        q_local[last] = backup;
                        sgn += 2;
                    }
                    delta *= 0.5f;
                }

                if (bestLocF < f_final) {
                    q_local[last] = saved;
                    bestF = bestLocF;
                    bestX = x_final;
                    bestY = y_final;
                    bestQ.assign(q_local, q_local + dim);
                    bestIndexFound = fullConstraintIndex;
                    bestIndexValue = bestF;
                    bestQIndexed = bestQ;
                    bestIndexedX = bestX;
                    bestIndexedY = bestY;
                }
            } else if (haveFeasibleBest) {
                bestIndexFound = fullConstraintIndex;
                bestIndexValue = bestF;
                bestQIndexed = bestQ;
                bestIndexedX = bestX;
                bestIndexedY = bestY;
            } else {
                bestQ = bestQIndexed;
                bestF = bestIndexValue;
                bestX = bestIndexedX;
                bestY = bestIndexedY;
            }

            out_iterations = static_cast<size_t>(it);
            out_achieved_epsilon = interval_len;
            return;
        }

        ++it;
    }
}

Если хочешь, следующим сообщением я могу отдельно очень коротко перечислить именно те 6–8 мест, где быстрая версия ломала Transition, чтобы ты мог точечно проверить это у себя в профилировщике и тестах.

сейчас я пришлю тебе две версии моего алгоритма, о...

Pergunta

Pensamento

Considering potential optimizations and guarding edge cases

Ensuring safety while maintaining performance

Resposta

Compartilhar esta Q&A