сейчас я пришлю тебе две версии моего алгоритма, о...

Criado em: 31 de março de 2026

Respondido usando GPT-5.4 Thinking por Chat01

Pergunta

сейчас я пришлю тебе две версии моего алгоритма, одну обычную, вторую оптимизированную по скорости, обычная версия работает и в обычном режиме и в режиме transition, оптимизированная версия прекрасно быстро работает в обычном режиме и не работает в режиме transition, твоя задача будет скопировать поведение оптимизированной версии везде где это касается обычного режима но сделать её рабочей и ЧТО ВАЖНО ОПТИМИЗИРОВАННОЙ ТАКЖЕ ДЛЯ TRANSITION режима - для этого надо будет внимательно проанализировать за счёт чего обычная версия остаётся рабочей в transition режиме, и прислать мне полный код целиком оптимизированной по скорости версии которая работает быстро в обоих режимах, причём безопасность/читаемость/требования по памяти/поддерживаемость кода не важны, важна лишь максимальная скорость, важно что ты должен прислать именно код максимального оптимизированного под скорость метода целиком а не просто набор правок, код обычной версии: "static declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { struct TrialPoint { alignas(16) float q[AGP_MAX_STATE_DIM]; float t = 0.0f; float f = FLT_MAX; float x = 0.0f; float y = 0.0f; unsigned idx = 0u; bool feasible = false; }; using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>; constexpr float T_EPS = 1.0e-7f; const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = static_cast<float>(dim); const unsigned fullConstraintIndex = cost.feasible_index(); const bool hasHardConstraints = (fullConstraintIndex != 0u); if (n > 32 || dim > AGP_MAX_STATE_DIM) { bestQ.clear(); bestF = FLT_MAX; bestX = 0.0f; bestY = 0.0f; out_iterations = 0u; out_achieved_epsilon = 0.0f; return; } unsigned exchange_counter = 0u; unsigned exchange_counter_T = 0u; alignas(16) float M_by_span[12]; for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior; float Mmax = M_prior; alignas(16) float phi[AGP_MAX_STATE_DIM]; alignas(16) float s_arr[AGP_MAX_STATE_DIM]; alignas(16) float c_arr[AGP_MAX_STATE_DIM]; alignas(16) float sum_s[AGP_MAX_STATE_DIM]; alignas(16) float sum_c[AGP_MAX_STATE_DIM]; alignas(16) float q_try[AGP_MAX_STATE_DIM]; bestQ.reserve(static_cast<size_t>(dim)); AlignedFloatVec bestQIndexed; bestQIndexed.reserve(static_cast<size_t>(dim)); AlignedFloatVec transitionReferenceSeed; float transitionRefF = FLT_MAX; unsigned transitionRefIdx = 0u; float transitionRefX = 0.0f; float transitionRefY = 0.0f; bool haveTransitionRefSeed = false; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) { const int refDim = cost.full_state_dim(); transitionReferenceSeed.resize(static_cast<size_t>(refDim)); std::memcpy( transitionReferenceSeed.data(), cost.referenceState, static_cast<size_t>(refDim) * sizeof(float)); float xr = 0.0f; float yr = 0.0f; unsigned idxr = 0u; float fr = 0.0f; const bool ref_ok = cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr); transitionRefF = fr; transitionRefIdx = idxr; transitionRefX = xr; transitionRefY = yr; haveTransitionRefSeed = ref_ok; } unsigned bestIndexFound = 0u; float bestIndexValue = FLT_MAX; float bestIndexedX = 0.0f; float bestIndexedY = 0.0f; BestSolutionMsg lastSentBestMsg; InitBestSolutionMsg(lastSentBestMsg); int no_improve = 0; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = b - a; const float initial_len = dmax; const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f); const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f); const float B_dim = fmaf(A_dim, 0.7f, 0.0f); const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f); const float log_argument = A_dim - 2.03f; const float log_argument__ = A_dim__ - 2.0f; const float C_dim = fmaf( log_argument, fmaf( log_argument, fmaf( log_argument, fmaf( log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float C_dim__ = fmaf( log_argument__, fmaf( log_argument__, fmaf( log_argument__, fmaf( log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__; const float adaptive_coeff_addition = fmaf( C_dim, fmaf( C_dim, fmaf( C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float adaptive_coeff_addition__ = fmaf( C_dim__, fmaf( C_dim__, fmaf( C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__; int it = 0; int stag_boost_remaining = 0; float stag_r_multiplier = 0.0f; int last_send_T = 0; const int send_interval_T = 7; int last_send_best = 0; const int send_interval_best = 2; const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)); const int noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f)); const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params; auto clamp01 = [](float v) noexcept -> float { if (v < 0.0f) return 0.0f; if (v > 1.0f) return 1.0f; return v; }; auto lerp_hj = [](float a0, float a1, float t) noexcept -> float { return fmaf(t, a1 - a0, a0); }; auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out) noexcept { period_out = hj_cfg.iteration_period; phase_out = hj_cfg.iteration_phase; int count = hj_cfg.iteration_schedule_count; if (count <= 0) return; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { period_out = rule.period; phase_out = rule.phase; break; } } if (period_out < 1) period_out = 1; phase_out %= period_out; if (phase_out < 0) phase_out += period_out; }; auto select_hj_coord_profile = [&](float progress) noexcept -> AGPHJCoordScaleProfile { int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); if (count > 0) { for (int i = 0; i < count; ++i) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) return rule.profile; } } AGPHJCoordScaleProfile fallback{}; fallback.angle_scale_proximal = hj_cfg.angle_scale_first; fallback.angle_scale_distal = hj_cfg.angle_scale_last; fallback.length_scale_proximal = hj_cfg.length_scale_first; fallback.length_scale_distal = hj_cfg.length_scale_last; return fallback; }; auto select_hj_order_mode = [&](float progress) noexcept -> int { int order_mode = hj_cfg.order_mode; int count = hj_cfg.order_schedule_count; if (count <= 0) return order_mode; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { order_mode = rule.order_mode; break; } } return order_mode; }; auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile) noexcept -> float { if (d < n) { const float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01); } if (cost.variableLen) { const int j = d - n; const float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01); } return 1.0f; }; alignas(16) float hj_delta[AGP_MAX_STATE_DIM]; alignas(16) float hj_profile_scale_applied[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_natural[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_reverse[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_interleaved_proximal[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_interleaved_distal[AGP_MAX_STATE_DIM]; alignas(64) int hj_order_lengths_first[AGP_MAX_STATE_DIM]; { int pos = 0; while (pos < dim) { hj_order_natural[pos] = pos; ++pos; } pos = 0; int d = dim - 1; while (d >= 0) { hj_order_reverse[pos] = d; ++pos; --d; } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; hj_order_interleaved_proximal[pos] = n + j; ++pos; ++j; } } else { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; ++j; } } pos = 0; if (cost.variableLen) { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; hj_order_interleaved_distal[pos] = n + j; ++pos; --j; } } else { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; --j; } } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_lengths_first[pos] = n + j; ++pos; ++j; } } int j = 0; while (j < n) { hj_order_lengths_first[pos] = j; ++pos; ++j; } } auto select_hj_order_ptr = [&](const int order_mode) noexcept -> const int* { switch (order_mode) { case AGP_HJ_ORDER_NATURAL: return hj_order_natural; case AGP_HJ_ORDER_REVERSE: return hj_order_reverse; case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL: return hj_order_interleaved_proximal; case AGP_HJ_ORDER_LENGTHS_FIRST: return hj_order_lengths_first; default: return hj_order_interleaved_distal; } }; int hj_profile_cache_key = INT_MIN; auto apply_hj_coord_profile = [&](const float progress) noexcept { AGPHJCoordScaleProfile profile{}; int key = -2; int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; if (count > 0) { int i = 0; while (i < count) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (progress < rule.progress_lt || i + 1 == count) { profile = rule.profile; key = i; break; } ++i; } } else { profile.angle_scale_proximal = hj_cfg.angle_scale_first; profile.angle_scale_distal = hj_cfg.angle_scale_last; profile.length_scale_proximal = hj_cfg.length_scale_first; profile.length_scale_distal = hj_cfg.length_scale_last; } if (key == hj_profile_cache_key) return; int d = 0; #pragma loop ivdep while (d < dim) { const float new_scale = fmaxf(coord_profile_scale(d, profile), 1.0e-12f); const float old_scale = fmaxf(hj_profile_scale_applied[d], 1.0e-12f); if (fabsf(new_scale - old_scale) > 1.0e-12f) { hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta); hj_profile_scale_applied[d] = new_scale; } ++d; } hj_profile_cache_key = key; }; { const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f); for (int d = 0; d < dim; ++d) { const float span = fmaf(map.high[d], 1.0f, -map.low[d]); const float scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1.0e-12f); hj_profile_scale_applied[d] = scale; hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs); } } auto clampPoint = [&](float* q) noexcept { int i = 0; #pragma loop ivdep while (i < n) { float lo = (i == 0) ? -1.0471975511965977f : -2.6179938779914944f; float hi = 2.6179938779914944f; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && i < cost.referenceAngleCount) { const float ref = cost.referenceState[i]; const float lo_ref = ref - cost.transitionMaxAngleStep; const float hi_ref = ref + cost.transitionMaxAngleStep; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } if (cost.use_joint_angle_limit_constraints()) { const float hard = fabsf(cost.minTheta); const float lo_hard = -hard; const float hi_hard = hard; if (lo_hard > lo) lo = lo_hard; if (hi_hard < hi) hi = hi_hard; } if (q[i] < lo) q[i] = lo; else if (q[i] > hi) q[i] = hi; if (cost.solveMode == ManipSolveMode::Transition && cost.transitionPreserveJointMagnitudeFloor && cost.has_transition_reference_angles() && i < cost.referenceAngleCount) { const float ref = cost.referenceState[i]; if (fabsf(ref) > cost.transitionJointSignDeadband) { const float min_abs = cost.min_joint_abs_angle_bound(i); if (fabsf(q[i]) < min_abs) { q[i] = copysignf(min_abs, ref); if (q[i] < lo) q[i] = lo; else if (q[i] > hi) q[i] = hi; } } } ++i; } if (cost.variableLen) { i = 0; #pragma loop ivdep while (i < n) { float lo = fmaf(1.0f / cost.stretchFactor, cost.fixedLength, 0.0f); float hi = fmaf(cost.stretchFactor, cost.fixedLength, 0.0f); if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_lengths() && i < cost.referenceLengthCount) { const float refL = cost.referenceState[n + i]; const float dL = cost.max_length_step(i); const float lo_ref = refL - dL; const float hi_ref = refL + dL; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } if (q[n + i] < lo) q[n + i] = lo; else if (q[n + i] > hi) q[n + i] = hi; ++i; } } }; auto t_to_idx = [&](float t) -> unsigned long long { return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); }; auto finalize_interval_geometry = [&](IntervalND* I) noexcept { I->i1 = t_to_idx(I->x1); I->i2 = t_to_idx(I->x2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); }; auto update_pockets_and_Mmax = [&](IntervalND* I) noexcept { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; auto recompute_dmax = & noexcept { float new_dmax = 0.0f; for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > new_dmax) new_dmax = Ls; } dmax = (new_dmax > 0.0f) ? new_dmax : initial_len; }; auto recompute_heap_constM = [&](float m_cur) noexcept { const size_t sz = H.size(); if (sz == 0u) return; if (!hasHardConstraints) RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur); else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur); std::make_heap(H.begin(), H.end(), ComparePtrND); }; auto update_best_from_trial = [&](const TrialPoint& tr) noexcept { if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) { bestIndexFound = tr.idx; bestIndexValue = tr.f; agp_store_q(bestQIndexed, tr.q, dim); bestIndexedX = tr.x; bestIndexedY = tr.y; } if (tr.feasible) { if (tr.f < bestF) { bestF = tr.f; agp_store_q(bestQ, tr.q, dim); bestX = tr.x; bestY = tr.y; no_improve = 0; } else { ++no_improve; } } }; auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io, float t_lo, float t_hi, bool enforce_t_bounds) noexcept { if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f))) return; const float f_start = f_io; const float c1 = 1e-4f; const float tau = 0.5f; const int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p)); const int max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p)); const float lbfgs_trigger = 0.6f; const int m_lbfgs = 9; const int max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p)); const float eps_lbfgs_curv = 1e-6f; const float eps_descent = 1e-12f; float eta = 2.0f / sqrtf(dim_f); auto computeGrad = [&](const float* q_in, const float x_in, const float y_in, float* grad_out, float& grad_norm2_out) noexcept { float phi_acc_local = 0.0f; int ii = 0; #pragma loop ivdep while (ii < n) { phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local); phi[ii] = phi_acc_local; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float acc_s = 0.0f; float acc_c = 0.0f; int kk = n - 1; while (kk >= 0) { const float Lk = cost.link_length(q_in, kk); acc_s = fmaf(Lk, s_arr[kk], acc_s); acc_c = fmaf(Lk, c_arr[kk], acc_c); sum_s[kk] = acc_s; sum_c[kk] = acc_c; --kk; } alignas(16) float prefix_tail_grad[AGP_MAX_STATE_DIM]{}; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta[AGP_MAX_STATE_DIM]{}; float phi_prefix = 0.0f; int jj = 0; #pragma loop ivdep while (jj < cost.referenceAngleCount) { phi_prefix += q_in[jj]; prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta[jj]; prefix_tail_grad[jj] = tail; --jj; } } const float dx = fmaf(x_in, 1.0f, -cost.targetX); const float dy = fmaf(y_in, 1.0f, -cost.targetY); const float dist2 = fmaf(dx, dx, dy * dy); const float dist = sqrtf(dist2); const float inv_dist = (dist > 1.0e-20f) ? (1.0f / dist) : 0.0f; grad_norm2_out = 0.0f; int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; float g_main = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { const float dtheta = (i < cost.referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f; gpen = 2.0f * cost.transitionEnergyWeight * dtheta; gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen); g_main = 2.0f * cost.transitionCaptureWeight * fmaf(dx, -sum_s[i], dy * sum_c[i]); } else { { const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf( exp_arg, fmaf( exp_arg, fmaf( exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen); } g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist; } const float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { float gi = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { float gpenL = 0.0f; if (cost.has_transition_reference_lengths() && j < cost.referenceLengthCount) { gpenL = 2.0f * cost.transitionLengthEnergyWeight * (q_in[n + j] - cost.referenceState[n + j]); } gi = 2.0f * cost.transitionCaptureWeight * fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL; } else { gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist; } grad_out[n + j] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++j; } } }; auto armijoLineSearch = [&](const float* q_base, const float f_base, const float* dir, const float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, unsigned& idx_out) noexcept -> bool { float alpha = alpha_io; int backtrack = 0; while (backtrack < max_backtrack) { int i = 0; #pragma loop ivdep while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; } clampPoint(q_out); float x2 = 0.0f; float y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; const bool feasible_try = cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try); if (!feasible_try || !(val_try == val_try)) { alpha = tau; ++backtrack; continue; } const float t_try = clamp01(map.pointToT(q_out)); if (enforce_t_bounds) { if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha = tau; ++backtrack; continue; } } else { if (fabsf(t_try - t_lo) > T_EPS) { alpha = tau; ++backtrack; continue; } } } if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; } alpha = tau; ++backtrack; } return false; }; bool lbfgs_already_tried = false; int outer = 0; while (outer < max_outer_iters) { float grad[AGP_MAX_STATE_DIM]; float grad_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, grad, grad_norm2); if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f) break; float dir_gd[AGP_MAX_STATE_DIM]; int i = 0; #pragma loop ivdep while (i < dim) { dir_gd[i] = -grad[i]; ++i; } const float gtd_gd = -grad_norm2; float eta_trial = eta; float f_new = f_io; float x_new = x_io; float y_new = y_io; unsigned idx_new = idx_io; const bool found = armijoLineSearch( q_inout, f_io, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, idx_new); if (!found) break; std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_new; x_io = x_new; y_io = y_new; idx_io = idx_new; eta = eta_trial; const float rel_impr = (f_start > 1.0e-20f) ? ((f_start - f_io) / f_start) : 0.0f; if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) { lbfgs_already_tried = true; float q_resume[AGP_MAX_STATE_DIM]; std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float)); const float f_resume = f_io; const float x_resume = x_io; const float y_resume = y_io; const unsigned idx_resume = idx_io; const float eta_resume = eta; float q_best_lbfgs[AGP_MAX_STATE_DIM]; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); float f_best_lbfgs = f_io; float x_best_lbfgs = x_io; float y_best_lbfgs = y_io; unsigned idx_best_lbfgs = idx_io; float s_hist[m_lbfgs][AGP_MAX_STATE_DIM]; float y_hist[m_lbfgs][AGP_MAX_STATE_DIM]; float rho_hist[m_lbfgs]; float alpha_hist[m_lbfgs]; int hist_size = 0; float gk[AGP_MAX_STATE_DIM]; float gk_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); bool lbfgs_ok = true; float alpha_k = 2.0f / sqrtf(dim_f); int it_lbfgs = 0; while (it_lbfgs < max_lbfgs_iters) { if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f) break; float dir[AGP_MAX_STATE_DIM]; if (hist_size == 0) { int d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -gk[d]; ++d; } } else { float q_vec[AGP_MAX_STATE_DIM]; int d = 0; #pragma loop ivdep while (d < dim) { q_vec[d] = gk[d]; ++d; } for (int jj = hist_size - 1; jj >= 0; --jj) { float dot_sq = 0.0f; d = 0; #pragma loop ivdep while (d < dim) { dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq); ++d; } const float a_coeff = dot_sq * rho_hist[jj]; alpha_hist[jj] = a_coeff; d = 0; #pragma loop ivdep while (d < dim) { q_vec[d] = fmaf(-a_coeff, y_hist[jj][d], q_vec[d]); ++d; } } float gamma = 1.0f; { const int last = hist_size - 1; float yy = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { yy = fmaf(y_hist[last][d], y_hist[last][d], yy); ++d; } const float ys = 1.0f / rho_hist[last]; if (yy > 0.0f) gamma = ys / yy; } float r_vec[AGP_MAX_STATE_DIM]; d = 0; #pragma loop ivdep while (d < dim) { r_vec[d] = gamma * q_vec[d]; ++d; } for (int jj = 0; jj < hist_size; ++jj) { float dot_yr = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr); ++d; } const float b_coeff = dot_yr * rho_hist[jj]; const float coeff = alpha_hist[jj] - b_coeff; d = 0; #pragma loop ivdep while (d < dim) { r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]); ++d; } } d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -r_vec[d]; ++d; } } float gtd = 0.0f; int d = 0; #pragma loop ivdep while (d < dim) { gtd = fmaf(gk[d], dir[d], gtd); ++d; } if (!(gtd == gtd) || gtd >= -eps_descent) { d = 0; #pragma loop ivdep while (d < dim) { dir[d] = -gk[d]; ++d; } gtd = -gk_norm2; if (gtd >= -eps_descent) { lbfgs_ok = false; break; } } float q_old[AGP_MAX_STATE_DIM]; std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float)); float g_old[AGP_MAX_STATE_DIM]; std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float)); float alpha_try = alpha_k; float f_try = f_io; float x_try = x_io; float y_try = y_io; unsigned idx_try = idx_io; const bool step_ok = armijoLineSearch( q_inout, f_io, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, idx_try); if (!step_ok) { lbfgs_ok = false; break; } std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_try; x_io = x_try; y_io = y_try; idx_io = idx_try; alpha_k = alpha_try; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; } if (f_io < f_best_lbfgs) { f_best_lbfgs = f_io; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x_io; y_best_lbfgs = y_io; idx_best_lbfgs = idx_io; } float ys = 0.0f; float s_new[AGP_MAX_STATE_DIM]; float y_new[AGP_MAX_STATE_DIM]; d = 0; #pragma loop ivdep while (d < dim) { const float sd = q_inout[d] - q_old[d]; const float yd = gk[d] - g_old[d]; s_new[d] = sd; y_new[d] = yd; ys = fmaf(yd, sd, ys); ++d; } if (!(ys == ys) || ys <= eps_lbfgs_curv) { hist_size = 0; } else { if (hist_size < m_lbfgs) { const int idx_hist = hist_size; rho_hist[idx_hist] = 1.0f / ys; d = 0; #pragma loop ivdep while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } ++hist_size; } else { for (int jj = 0; jj < m_lbfgs - 1; ++jj) { rho_hist[jj] = rho_hist[jj + 1]; d = 0; #pragma loop ivdep while (d < dim) { s_hist[jj][d] = s_hist[jj + 1][d]; y_hist[jj][d] = y_hist[jj + 1][d]; ++d; } } const int idx_hist = m_lbfgs - 1; rho_hist[idx_hist] = 1.0f / ys; d = 0; #pragma loop ivdep while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } hist_size = m_lbfgs; } } if (gk_norm2 < 1e-12f) break; ++it_lbfgs; } if (lbfgs_ok) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; break; } else { if (f_best_lbfgs < f_resume) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; } else { std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float)); f_io = f_resume; x_io = x_resume; y_io = y_resume; idx_io = idx_resume; } eta = eta_resume; } } ++outer; } }; auto evaluate_trial_from_q = [&](const float q_seed, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { TrialPoint tr{}; std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float)); tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f); if (tr.feasible) { refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds); } tr.t = clamp01(map.pointToT(tr.q)); if (enforce_t_bounds) { if (tr.t < t_lo) tr.t = t_lo; if (tr.t > t_hi) tr.t = t_hi; } update_best_from_trial(tr); return tr; }; auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { alignas(16) float q_seed[AGP_MAX_STATE_DIM]; map.map01ToPoint(t_base, q_seed); return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds); }; auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp) noexcept -> IntervalND { if (!(Lp.t + T_EPS < Rp.t)) return nullptr; IntervalND I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx); finalize_interval_geometry(I); return I; }; auto find_container_index = [&](float t) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; for (size_t i = 0u; i < H.size(); ++i) { IntervalND I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } if (best_pos >= 0) return best_pos; for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } return best_pos; }; auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff_) noexcept { if (H.empty()) return; const int pos = find_container_index(tr.t); if (pos < 0) return; IntervalND* src = H[static_cast<size_t>(pos)]; const float src_len = fmaf(src->x2, 1.0f, -src->x1); if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) return; IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto find_container_index_bounds = [&](const float t, float& lo, float& hi) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; size_t i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } if (best_pos < 0) { i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } } if (best_pos >= 0) { const IntervalND* const I = H[static_cast<size_t>(best_pos)]; lo = I->x1; hi = I->x2; } else { lo = 0.0f; hi = 1.0f; } return best_pos; }; auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, const int pos, float r_eff_cur, float adaptive_coeff_) noexcept { if (pos < 0 || static_cast<size_t>(pos) >= H.size()) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_); return; } IntervalND* const src = H[static_cast<size_t>(pos)]; if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_); return; } const float src_len = fmaf(src->x2, 1.0f, -src->x1); IntervalND* const L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* const Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto progress_outgoing = & noexcept { for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) { if (itReq->req.test()) itReq = g_pendingMulti.erase(itReq); else ++itReq; } for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) { if (itReq->req.test()) itReq = g_pendingBest.erase(itReq); else ++itReq; } }; auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y) noexcept -> bool { auto take_indexed = & noexcept -> bool { if (bestQIndexed.empty()) return false; seed_vec = &bestQIndexed; seed_f = bestIndexValue; seed_idx = bestIndexFound; seed_x = bestIndexedX; seed_y = bestIndexedY; return true; }; auto take_feasible = & noexcept -> bool { if (bestQ.empty()) return false; seed_vec = &bestQ; seed_f = bestF; seed_idx = fullConstraintIndex; seed_x = bestX; seed_y = bestY; return true; }; auto take_reference = & noexcept -> bool { if (!haveTransitionRefSeed) return false; seed_vec = &transitionReferenceSeed; seed_f = transitionRefF; seed_idx = transitionRefIdx; seed_x = transitionRefX; seed_y = transitionRefY; return true; }; seed_vec = nullptr; seed_f = FLT_MAX; seed_idx = 0u; seed_x = 0.0f; seed_y = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) return take_feasible() || take_reference() || take_indexed(); return take_indexed() || take_feasible(); }; auto should_run_hj_this_iter = [&](float progress) noexcept -> bool { if (!hj_cfg.enabled) return false; if (hj_cfg.require_adaptive_flag && !adaptive) return false; int period = 1; int phase = 0; select_hj_iteration_schedule(progress, period, phase); if (period <= 1) return true; return (it % period) == phase; }; auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff_, float progress) noexcept { if (H.empty()) return; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return; TrialPoint base{}; std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float)); base.t = clamp01(map.pointToT(base.q)); base.f = seedF; base.idx = seedIdx; base.x = seedX; base.y = seedY; base.feasible = (base.idx == fullConstraintIndex); auto evaluate_hj_candidate = [&](const float* q_candidate) noexcept -> TrialPoint { const float t_guess = clamp01(map.pointToT(q_candidate)); float lo = 0.0f; float hi = 1.0f; const int pos = find_container_index_bounds(t_guess, lo, hi); const TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true); inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff_); return tr; }; apply_hj_coord_profile(progress); const int hj_order_mode = select_hj_order_mode(progress); const int* const hj_order = select_hj_order_ptr(select_hj_order_mode(progress)); TrialPoint current = base; TrialPoint base_before = base; bool improved_any = false; int ord = 0; while (ord < dim) { const int d = hj_order[ord]; const float step = hj_delta[d]; const TrialPoint axis_base = current; auto try_direction = [&](int sgn, TrialPoint& accepted_tp) noexcept -> bool { float q_candidate[AGP_MAX_STATE_DIM]; std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float)); q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]); clampPoint(q_candidate); if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1.0e-12f) return false; const TrialPoint tp = evaluate_hj_candidate(q_candidate); if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) { accepted_tp = tp; return true; } return false; }; const int first_sign = hj_cfg.plus_first ? +1 : -1; const int second_sign = -first_sign; bool improved_this_axis = false; TrialPoint best_axis_tp{}; bool have_axis_tp = false; TrialPoint tp_try{}; if (try_direction(first_sign, tp_try)) { best_axis_tp = tp_try; have_axis_tp = true; if (hj_cfg.opportunistic_axis) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (!hj_cfg.opportunistic_axis || !have_axis_tp) { if (try_direction(second_sign, tp_try)) { if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) { best_axis_tp = tp_try; have_axis_tp = true; } } if (have_axis_tp) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (improved_this_axis && hj_cfg.expand_delta_on_success) { hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor); } ++ord; } if (improved_any) { if (hj_cfg.use_pattern) { float q_pattern[AGP_MAX_STATE_DIM]; for (int d = 0; d < dim; ++d) q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]); clampPoint(q_pattern); TrialPoint pat_tp = evaluate_hj_candidate(q_pattern); if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f)) current = pat_tp; } } else { for (int d = 0; d < dim; ++d) hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta); } }; auto ensure_queue_nonempty = [&](float r_eff_cur) noexcept -> bool { if (!H.empty()) return true; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return false; (void)seedF; (void)seedX; (void)seedY; (void)seedIdx; const float t_center = clamp01(map.pointToT(seedQ->data())); const float half = fmaxf(1.0e-4f * dim_f, 1.0e-6f); const float t1b = clamp01(t_center - half); const float t2b = clamp01(t_center + half); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) return false; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); recompute_dmax(); return true; }; while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg dummy; g_world->recv(boost::mpi::any_source, 2, dummy); } const int seedStride = AGP_MAX_SEED_STRIDE; AlignedFloatVec seeds(static_cast<size_t>(AGP_MAX_GENERATED_SEEDS) * static_cast<size_t>(seedStride), 0.0f); const int seedCnt = generate_heuristic_seeds( cost, map, dim, seeds.data(), seedStride, static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()), 7919.0f, static_cast<float>(seed)))); const int K_raw = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)); const int K = (K_raw > 2) ? K_raw : 2; H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u); const int rank = g_world->rank(); const int world = g_world->size(); const size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1)); for (int i = 0; i < seedCnt; ++i) { const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride); const float t_seed = clamp01(map.pointToT(s)); const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf( fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f( (1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed)); const float t2b = clamp01(fmaf(interval_size, 1.0f, t_seed)); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) { I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); } else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf( start_mult, exp2f( (1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } std::vector<float> grid_base; grid_base.reserve(static_cast<size_t>(K) + 2u); grid_base.emplace_back(a); for (int k = 1; k <= K; ++k) { const float t = clamp01( fmaf( fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)))); grid_base.emplace_back(t); } grid_base.emplace_back(b); std::vector<TrialPoint> grid_trials(grid_base.size()); for (size_t i = 0u; i < grid_base.size(); ++i) { float lo = grid_base[i]; float hi = grid_base[i]; if (i == 0u) { lo = hi = a; } else if (i + 1u == grid_base.size()) { lo = hi = b; } else { lo = 0.5f * (grid_base[i - 1u] + grid_base[i]); hi = 0.5f * (grid_base[i] + grid_base[i + 1u]); } grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true); } for (size_t i = 1u; i < grid_trials.size(); ++i) { IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); if (haveTransitionRefSeed) { TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false); if (!H.empty()) { inject_trial_into_queue(ref_tp, r, 1.0f); recompute_dmax(); } else { ensure_queue_nonempty(r); } } g_world->barrier(); while (true) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); stag_r_multiplier = 1.4f - 1.1f * fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf( 0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f); const float p_arg = fmaf(p, 2.3f, -2.9775f); float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; --stag_boost_remaining; } const float exp_arg = fmaf(B_dim, p, 0.0f); const float exp_arg__ = fmaf(B_dim__, p, 0.0f); adaptive_coeff = fmaf( -fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); const float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)); const float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)); const float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)); const float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)); const float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)); const float xx = p * p; const float tt = fmaf(500.0f, p, -486.95472f); const float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf( fmaf( fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f); adaptive_coeff__ = fmaf( fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__); float grad_norm2_best = 0.0f; if (no_improve > 0 && !bestQ.empty()) { float acc_best = 0.0f; float phi_best[AGP_MAX_STATE_DIM]; int ii_best = 0; #pragma loop ivdep while (ii_best < n) { acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best); phi_best[ii_best] = acc_best; ++ii_best; } float s_best[AGP_MAX_STATE_DIM]; float c_best[AGP_MAX_STATE_DIM]; FABE13_SINCOS(phi_best, s_best, c_best, n); float as_best = 0.0f; float ac_best = 0.0f; float sum_s_best[AGP_MAX_STATE_DIM]; float sum_c_best[AGP_MAX_STATE_DIM]; int k_best = n - 1; while (k_best >= 0) { const float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; } alignas(16) float prefix_tail_best[AGP_MAX_STATE_DIM]{}; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta_best[AGP_MAX_STATE_DIM]{}; float phi_pref = 0.0f; int jj = 0; #pragma loop ivdep while (jj < cost.referenceAngleCount) { phi_pref += bestQ[jj]; prefix_delta_best[jj] = ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta_best[jj]; prefix_tail_best[jj] = tail; --jj; } } const float dx_best = fmaf(bestX, 1.0f, -cost.targetX); const float dy_best = fmaf(bestY, 1.0f, -cost.targetY); const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)); const float inv_dist_best = (dist_best > 1.0e-20f) ? (1.0f / dist_best) : 0.0f; int i_best = 0; #pragma loop ivdep while (i_best < n) { float gpen_best = 0.0f; float g_main_best = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { const float dtheta_best = (i_best < cost.referenceAngleCount) ? cost.wrapped_angle_delta(bestQ.data(), i_best) : 0.0f; gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best; gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_best[i_best], gpen_best); g_main_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]); } else { { const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f); const float exp_arg_g = -tsg; const float exp_val = fmaf( exp_arg_g, fmaf( exp_arg_g, fmaf( exp_arg_g, fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best); } g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best; } const float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best; } if (cost.variableLen) { int j_best = 0; #pragma loop ivdep while (j_best < n) { float gi_best = 0.0f; if (cost.solveMode == ManipSolveMode::Transition) { float gpenL_best = 0.0f; if (cost.has_transition_reference_lengths() && j_best < cost.referenceLengthCount) { gpenL_best = 2.0f * cost.transitionLengthEnergyWeight * (bestQ[n + j_best] - cost.referenceState[n + j_best]); } gi_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) + gpenL_best; } else { gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best; } grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++j_best; } } } const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f); const float r_eff = (dim > 2) ? fmaf( -fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf( -fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f); if (stagnation) { stag_boost_remaining = n_stag_iters; float t_seeds[32]; int seed_count = 0; if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) { alignas(16) float q_center[AGP_MAX_STATE_DIM]; std::memcpy(q_center, cost.referenceState, static_cast<size_t>(dim) * sizeof(float)); t_seeds[seed_count++] = clamp01(map.pointToT(q_center)); if (!bestQ.empty() && seed_count < 32) t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data())); unsigned st_local = seed + static_cast<unsigned>(it) + 911u; const int local_variants = 2 + static_cast<int>(sqrtf(dim_f)); for (int v = 0; v < local_variants && seed_count < 32; ++v) { float q_temp[AGP_MAX_STATE_DIM]; std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float)); st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd0 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int aidx = v % n; q_temp[aidx] = fmaf((2.0f * rnd0 - 1.0f), 0.25f * cost.transitionMaxAngleStep, q_temp[aidx]); if (cost.variableLen && cost.has_transition_reference_lengths()) { st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd1 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int lidx = v % n; q_temp[n + lidx] = fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]); } clampPoint(q_temp); t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else { const int num_ik = 1 + static_cast<int>(sqrtf(dim_f)); float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY); float max_reach = 0.0f; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) max_reach += map.high[n + ii]; } else { max_reach = static_cast<float>(n) * cost.fixedLength; } const float ratio = dist_to_target / fmaxf(max_reach, 1.0e-20f); const bool prefer_extended = (ratio > 0.7f); const bool prefer_compact = (ratio < 0.4f); const bool use_ik = !(ratio > 0.4f && ratio < 0.7f); if (!use_ik) { AlignedFloatVec temp_S(static_cast<size_t>(AGP_MAX_GENERATED_SEEDS) * static_cast<size_t>(AGP_MAX_SEED_STRIDE), 0.0f); const int sobol_gen = generate_sobol_seeds_limited( map, dim, temp_S.data(), AGP_MAX_SEED_STRIDE, seed + static_cast<unsigned>(it), AGP_MAX_GENERATED_SEEDS); const int num_sobol = num_ik; for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) { const float* s = temp_S.data() + static_cast<size_t>(kk) * static_cast<size_t>(AGP_MAX_SEED_STRIDE); t_seeds[seed_count++] = clamp01(map.pointToT(s)); } } else { float angles_ccd[32] = { 0.0f }; float lengths_ccd[32]; if (cost.variableLen) { const float len_low = map.low[n]; const float len_high = map.high[n]; const float avg_len = 0.5f * (len_low + len_high); for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = avg_len; } else { for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = cost.fixedLength; } ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10); float angles_fabrik[32] = { 0.0f }; float lengths_fabrik[32]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = lengths_ccd[ii]; } else { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = cost.fixedLength; } float targetX_fab = cost.targetX; float targetY_fab = cost.targetY; for (int iter_fab = 0; iter_fab < 3; ++iter_fab) { float prevX = targetX_fab; float prevY = targetY_fab; for (int j = n - 1; j >= 0; --j) { const float len = lengths_fabrik[j]; const float angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val; float c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; } } if (prefer_extended) { { float q_ccd[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_ccd[ii] = angles_ccd[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_ccd[n + ii] = lengths_ccd[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else if (prefer_compact) { { float q_fabrik[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_fabrik[ii] = angles_fabrik[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_fabrik[n + ii] = lengths_fabrik[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[AGP_MAX_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } } } for (int s = 0; s < seed_count; ++s) { const float t_center = t_seeds[s]; const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f); const float t1b = clamp01(t_center - 0.5f * interval_size); const float t2b = clamp01(t_center + 0.5f * interval_size); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); no_improve = 0; } const float bestFOld = bestF; const bool local_idle = !ensure_queue_nonempty(r_eff); const bool do_hj_this_iter = should_run_hj_this_iter(p); if (!local_idle) { if (do_hj_this_iter) { run_hooke_jeeves_iteration(r_eff, adaptive_coeff_, p); } else { std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); const float tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2); const TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true); IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx); IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float Mloc = fmaxf(L->M, Rv->M); const float prevMmax = Mmax; update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); m = fmaf(r_eff, Mmax, 0.0f); const float len1 = fmaf(trNew.t, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -trNew.t); if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) { dmax = fmaxf(len1, len2); for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / fmaxf(dmax, 1.0e-20f)) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); if (sz > 0u) { if (!hasHardConstraints) RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) recompute_heap_constM(m); } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); delete cur; } } if (!H.empty()) { mm_prefetch(reinterpret_cast<const char*>(H[0]), MM_HINT_T0); if (H.size() > 1u) mm_prefetch(reinterpret_cast<const char*>(H[1]), MM_HINT_T0); } if (!H.empty() && (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff, 0.0f)) && it - last_send_T >= send_interval_T) { last_send_T = it; progress_outgoing(); const size_t scan_count = H.size(); unsigned intervals_to_send = (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u; if (intervals_to_send > scan_count) intervals_to_send = static_cast<unsigned>(scan_count); if (intervals_to_send > 0u) { const float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f)); const float beta = 1.0f - alpha; const float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f); const float w_size = 5.085f - w_pos; const int num_bins = (p < 1.0f - 1.0e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1; std::vector<float> pos_metric(scan_count); std::vector<float> size_metric(scan_count); std::vector<float> R_val(scan_count); float R_max = -FLT_MAX; float R_min = FLT_MAX; float size_max = -FLT_MAX; float size_min = FLT_MAX; float pos_max = -FLT_MAX; float pos_min = FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { IntervalND* I = H[idx]; const float center = 0.5f * (I->x1 + I->x2); const float len = fmaf(I->x2, 1.0f, -I->x1); const float size = len * static_cast<float>(1 << I->span_level); if (p < 0.95f && num_bins > 1) { float bin = floorf(center * static_cast<float>(num_bins)); if (bin >= static_cast<float>(num_bins)) bin = static_cast<float>(num_bins - 1); pos_metric[idx] = bin / static_cast<float>(num_bins - 1); } else { pos_metric[idx] = center; } size_metric[idx] = size; R_val[idx] = I->R; if (pos_metric[idx] > pos_max) pos_max = pos_metric[idx]; if (pos_metric[idx] < pos_min) pos_min = pos_metric[idx]; if (size > size_max) size_max = size; if (size < size_min) size_min = size; if (I->R > R_max) R_max = I->R; if (I->R < R_min) R_min = I->R; } auto norm = [](float val, float minv, float maxv) noexcept -> float { const float denom = maxv - minv; if (fabsf(denom) <= 1.0e-20f) return 0.0f; return (val - minv) / denom; }; std::vector<float> R_norm(scan_count); std::vector<float> pos_norm(scan_count); std::vector<float> size_norm(scan_count); for (size_t idx = 0u; idx < scan_count; ++idx) { R_norm[idx] = norm(R_val[idx], R_min, R_max); pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max); size_norm[idx] = norm(size_metric[idx], size_min, size_max); } std::vector<int> selected; selected.reserve(intervals_to_send); std::vector<unsigned char> used(scan_count, 0u); selected.push_back(0); used[0] = true; while (selected.size() < intervals_to_send) { int best_idx = -1; float best_score = -FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { if (used[idx] != 0u) continue; float min_dist = FLT_MAX; for (int s : selected) { float d2 = 0.0f; float d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)]; d2 += w_pos * d * d; d = size_norm[idx] - size_norm[static_cast<size_t>(s)]; d2 += w_size * d * d; if (d2 < min_dist) min_dist = d2; } const float novelty = sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1.0e-20f)); const float score = alpha * R_norm[idx] + beta * novelty; if (score > best_score) { best_score = score; best_idx = static_cast<int>(idx); } } if (best_idx < 0) break; selected.push_back(best_idx); used[static_cast<size_t>(best_idx)] = 1u; } MultiCrossMsg out{}; out.count = static_cast<unsigned>(selected.size()); float* dest = out.intervals; for (unsigned s = 0u; s < out.count; ++s) { IntervalND* Tt = H[static_cast<size_t>(selected[s])]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; } const size_t iterations = comm_levels; bool active = true; const bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_pendingMulti.emplace_back(g_world, partner, out); if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) { g_pendingMulti.front().req.wait(); g_pendingMulti.pop_front(); } active = false; } } ++ii2; } ++exchange_counter_T; } } const bool indexedBestImprovedSinceLastSend = !bestQIndexed.empty() && better_indexed(bestIndexFound, bestIndexValue, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF); if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) { last_send_best = it; progress_outgoing(); const size_t iterations = comm_levels; bool active = true; const bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best); if (am_sender) { BestSolutionMsg outMsg; InitBestSolutionMsg(outMsg); if (FillBestSolutionMsg( outMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed) && better_indexed( outMsg.bestIndex, outMsg.bestF, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) { g_pendingBest.emplace_back(g_world, partner, outMsg); lastSentBestMsg = outMsg; } if (g_pendingBest.size() > MAX_INFLIGHT_BEST) { g_pendingBest.front().req.wait(); g_pendingBest.pop_front(); } active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); for (unsigned ii = 0u; ii < in.count; ++ii) { const float d = &in.intervals[ii * 5u]; const float sx = d[0]; const float ex = d[2]; alignas(16) float ql[AGP_MAX_STATE_DIM]; float tx = 0.0f; float ty = 0.0f; unsigned v1i = 0u; unsigned v2i = 0u; float y1i = 0.0f; float y2i = 0.0f; map.map01ToPoint(sx, ql); cost.evaluate_indexed(ql, tx, ty, v1i, y1i); map.map01ToPoint(ex, ql); cost.evaluate_indexed(ql, tx, ty, v2i, y2i); IntervalND inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i); finalize_interval_geometry(inj); const float prevMmax = Mmax; const float candidateMmax = fmaxf(Mmax, inj->M); const float m_inj = fmaf(r_eff, candidateMmax, 0.0f); inj->ChangeCharacteristic(m_inj); if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) { update_pockets_and_Mmax(inj); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f)) recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f)); } else { delete inj; } } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const float interval_len = H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam); if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter - 1) { for (auto& s : g_pendingMulti) s.req.wait(); for (auto& s : g_pendingBest) s.req.wait(); g_pendingMulti.clear(); g_pendingBest.clear(); while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } g_world->barrier(); BestSolutionMsg bestMsg; InitBestSolutionMsg(bestMsg); if (!bestQIndexed.empty()) { FillBestSolutionMsg( bestMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed); } const size_t iterations = comm_levels; bool active = true; size_t itx = 0u; while (itx < iterations && active) { const size_t step = 1ULL << itx; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = (rank & static_cast<int>(step)) != 0; if (am_sender) { g_world->send(partner, 3, bestMsg); active = false; } else { BestSolutionMsg inBest; InitBestSolutionMsg(inBest); g_world->recv(partner, 3, inBest); if (better_indexed(inBest.bestIndex, inBest.bestF, bestMsg.bestIndex, bestMsg.bestF)) bestMsg = inBest; } } ++itx; } if (rank == 0) { UpdateIndexedBestFromMessage( bestMsg, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const bool haveFeasibleBest = (bestIndexFound == fullConstraintIndex) && !bestQ.empty(); if (haveFeasibleBest && cost.solveMode != ManipSolveMode::Transition) { alignas(16) float q_local[AGP_MAX_STATE_DIM]; std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float)); float x_final = bestX; float y_final = bestY; float f_final = bestF; const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f_final; float saved = q_local[last]; float delta = 0.05f; while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2_loc = 0.0f; float y2_loc = 0.0f; unsigned idx2 = 0u; float val2 = 0.0f; const bool feasible2 = cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2); if (feasible2 && val2 < bestLocF) { bestLocF = val2; x_final = x2_loc; y_final = y2_loc; saved = cand; } q_local[last] = backup; sgn += 2; } delta = 0.5f; } if (bestLocF < f_final) { q_local[last] = saved; bestF = bestLocF; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } } else if (haveFeasibleBest) { bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } else { bestQ = bestQIndexed; bestF = bestIndexValue; bestX = bestIndexedX; bestY = bestIndexedY; } out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } ++it; } }", код оптимизированной версии: "static __declspec(noalias) void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior)noexcept {
struct TrialPoint { alignas(16)float q[64]; float t, f, x, y; unsigned idx; bool feasible; };
using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>;
constexpr float T_EPS = 1e-7f;
int n = cost.n, dim = n + (cost.variableLen ? n : 0);
float dim_f = static_cast<float>(dim);
unsigned fullConstraintIndex = cost.feasible_index();
const bool hasHardConstraints = (fullConstraintIndex != 0u);
unsigned exchange_counter = 0u, exchange_counter_T = 0u;
alignas(16)float M_by_span[12];
for (int i = 0; i < 12; ++i)M_by_span[i] = M_prior;
float Mmax = M_prior;
alignas(16)float phi[64], s_arr[64], c_arr[64], sum_s[64], sum_c[64], q_try[64];
bestQ.reserve(static_cast<size_t>(dim));
AlignedFloatVec bestQIndexed; bestQIndexed.reserve(static_cast<size_t>(dim));
AlignedFloatVec transitionReferenceSeed;
float transitionRefF = FLT_MAX; unsigned transitionRefIdx = 0u; float transitionRefX = 0.0f, transitionRefY = 0.0f; bool haveTransitionRefSeed = false;
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) {
int refDim = cost.full_state_dim();
transitionReferenceSeed.resize(static_cast<size_t>(refDim));
std::memcpy(transitionReferenceSeed.data(), cost.referenceState, static_cast<size_t>(refDim) * sizeof(float));
float xr = 0.0f, yr = 0.0f, fr = 0.0f; unsigned idxr = 0u;
cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr);
transitionRefF = fr; transitionRefIdx = idxr; transitionRefX = xr; transitionRefY = yr; haveTransitionRefSeed = true;
}
unsigned bestIndexFound = 0u; float bestIndexValue = FLT_MAX, bestIndexedX = 0.0f, bestIndexedY = 0.0f;
BestSolutionMsg lastSentBestMsg; InitBestSolutionMsg(lastSentBestMsg);
int no_improve = 0;
float a = 0.0f, b = 1.0f, p = 0.0f, dmax = b - a, initial_len = dmax;
float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f), A_dim
_ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f), B_dim = fmaf(A_dim, 0.7f, 0.0f), B_dim__ = fmaf(A_dim__, 4.325f, 0.0f), log_argument = A_dim - 2.03f, log_argument__ = A_dim__ - 2.0f;
float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__;
float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition, adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;
int it = 0, stag_boost_remaining = 0; float stag_r_multiplier = 0.0f;
int last_send_T = 0, send_interval_T = 7, last_send_best = 0, send_interval_best = 2, n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)), noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f));
const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params;
auto clamp01 = [](float v)noexcept->float {return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); };
auto lerp_hj = [](float a0, float a1, float t)noexcept->float {return fmaf(t, a1 - a0, a0); };
auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out)noexcept {
period_out = hj_cfg.iteration_period; phase_out = hj_cfg.iteration_phase;
int count = hj_cfg.iteration_schedule_count; if (count <= 0)return;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
for (int i = 0; i < count; ++i) {
const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count) { period_out = rule.period; phase_out = rule.phase; break; }
}
if (period_out < 1)period_out = 1; phase_out %= period_out; if (phase_out < 0)phase_out += period_out;
};
auto select_hj_coord_profile = [&](float progress)noexcept->AGPHJCoordScaleProfile {
int count = hj_cfg.coord_profile_schedule_count;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
if (count > 0) {
for (int i = 0; i < count; ++i) {
const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count)return rule.profile;
}
}
AGPHJCoordScaleProfile fallback{};
fallback.angle_scale_proximal = hj_cfg.angle_scale_first;
fallback.angle_scale_distal = hj_cfg.angle_scale_last;
fallback.length_scale_proximal = hj_cfg.length_scale_first;
fallback.length_scale_distal = hj_cfg.length_scale_last;
return fallback;
};
auto select_hj_order_mode = [&](float progress)noexcept->int {
int order_mode = hj_cfg.order_mode;
int count = hj_cfg.order_schedule_count;
if (count <= 0)return order_mode;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
float p01 = clamp01(progress);
for (int i = 0; i < count; ++i) {
const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i];
if (p01 < rule.progress_lt || i + 1 == count) { order_mode = rule.order_mode; break; }
}
return order_mode;
};
auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile)noexcept->float {
if (d < n) {
float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f;
return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01);
}
if (cost.variableLen) {
int j = d - n;
float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f;
return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01);
}
return 1.0f;
};
alignas(16)float hj_delta[64], hj_profile_scale_applied[64];
alignas(64)int hj_order_natural[64], hj_order_reverse[64], hj_order_interleaved_proximal[64], hj_order_interleaved_distal[64], hj_order_lengths_first[64];
{
int pos = 0; while (pos < dim) { hj_order_natural[pos] = pos; ++pos; }
pos = 0; int d = dim - 1; while (d >= 0) { hj_order_reverse[pos] = d; ++pos; --d; }
pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; hj_order_interleaved_proximal[pos] = n + j; ++pos; ++j; } }
else { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; ++j; } }
pos = 0; if (cost.variableLen) { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; hj_order_interleaved_distal[pos] = n + j; ++pos; --j; } }
else { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; --j; } }
pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_lengths_first[pos] = n + j; ++pos; ++j; } }int j = 0; while (j < n) { hj_order_lengths_first[pos] = j; ++pos; ++j; }
}
auto select_hj_order_ptr = [&](int order_mode)noexcept->const int* {
switch (order_mode) {
case AGP_HJ_ORDER_NATURAL:return hj_order_natural;
case AGP_HJ_ORDER_REVERSE:return hj_order_reverse;
case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL:return hj_order_interleaved_proximal;
case AGP_HJ_ORDER_LENGTHS_FIRST:return hj_order_lengths_first;
default:return hj_order_interleaved_distal;
}
};
int hj_profile_cache_key = INT_MIN;
auto apply_hj_coord_profile = [&](float progress)noexcept {
AGPHJCoordScaleProfile profile{};
int key = -2;
int count = hj_cfg.coord_profile_schedule_count;
if (count > AGP_HJ_MAX_PROGRESS_RULES)count = AGP_HJ_MAX_PROGRESS_RULES;
if (count > 0) {
int i = 0; while (i < count) {
const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i];
if (progress < rule.progress_lt || i + 1 == count) { profile = rule.profile; key = i; break; }
++i;
}
}
else {
profile.angle_scale_proximal = hj_cfg.angle_scale_first;
profile.angle_scale_distal = hj_cfg.angle_scale_last;
profile.length_scale_proximal = hj_cfg.length_scale_first;
profile.length_scale_distal = hj_cfg.length_scale_last;
}
if (key == hj_profile_cache_key)return;
int d = 0; while (d < dim) {
float new_scale = fmaxf(coord_profile_scale(d, profile), 1e-12f), old_scale = fmaxf(hj_profile_scale_applied[d], 1e-12f);
if (fabsf(new_scale - old_scale) > 1e-12f) { hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta); hj_profile_scale_applied[d] = new_scale; }
++d;
}
hj_profile_cache_key = key;
};
{
const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f);
for (int d = 0; d < dim; ++d) {
float span = fmaf(map.high[d], 1.0f, -map.low[d]), scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1e-12f);
hj_profile_scale_applied[d] = scale;
hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs);
}
}
auto t_to_idx = [&](float t)->unsigned long long {return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); };
auto finalize_interval_geometry = [&](IntervalND* I)noexcept {I->i1 = t_to_idx(I->x1); I->i2 = t_to_idx(I->x2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); };
auto update_pockets_and_Mmax = [&](IntervalND* I)noexcept {int k = I->span_level; if (I->M > M_by_span[k])M_by_span[k] = I->M; if (M_by_span[k] > Mmax)Mmax = M_by_span[k]; };
auto recompute_dmax = &noexcept {float new_dmax = 0.0f; for (auto* pI : H) { float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > new_dmax)new_dmax = Ls; }dmax = (new_dmax > 0.0f) ? new_dmax : initial_len; };
auto recompute_heap_constM = [&](float m_cur)noexcept {size_t sz = H.size(); if (!hasHardConstraints)RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur); else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur); std::make_heap(H.begin(), H.end(), ComparePtrND); };
auto update_best_from_trial = [&](const TrialPoint& tr)noexcept {
if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) { bestIndexFound = tr.idx; bestIndexValue = tr.f; agp_store_q(bestQIndexed, tr.q, dim); bestIndexedX = tr.x; bestIndexedY = tr.y; }
if (tr.feasible) { if (tr.f < bestF) { bestF = tr.f; agp_store_q(bestQ, tr.q, dim); bestX = tr.x; bestY = tr.y; no_improve = 0; } else { ++no_improve; } }};
auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io, float t_lo, float t_hi, bool enforce_t_bounds)noexcept {
if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f)))return;
float f_start = f_io, c1 = 1e-4f, tau = 0.5f;
int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p)), max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p));
float lbfgs_trigger = 0.6f; int m_lbfgs = 9, max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p));
float eps_lbfgs_curv = 1e-6f, eps_descent = 1e-12f, eta = 2.0f / sqrtf(dim_f);
auto computeGrad = [&](const float* q_in, float x_in, float y_in, float* grad_out, float& grad_norm2_out)noexcept {
float phi_acc_local = 0.0f; int ii = 0; while (ii < n) { phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local); phi[ii] = phi_acc_local; ++ii; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
float acc_s = 0.0f, acc_c = 0.0f; int kk = n - 1; while (kk >= 0) { float Lk = cost.link_length(q_in, kk); acc_s = fmaf(Lk, s_arr[kk], acc_s); acc_c = fmaf(Lk, c_arr[kk], acc_c); sum_s[kk] = acc_s; sum_c[kk] = acc_c; --kk; }
alignas(16)float prefix_tail_grad[64] = { 0.0f };
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) {
alignas(16)float prefix_delta[64] = { 0.0f }; float phi_prefix = 0.0f; int jj = 0;
while (jj < cost.referenceAngleCount) { phi_prefix += q_in[jj]; prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]); ++jj; }
float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta[jj]; prefix_tail_grad[jj] = tail; --jj; }
}
float dx = fmaf(x_in, 1.0f, -cost.targetX), dy = fmaf(y_in, 1.0f, -cost.targetY), dist2 = fmaf(dx, dx, dy * dy), dist = sqrtf(dist2), inv_dist = (dist > 1e-20f) ? (1.0f / dist) : 0.0f;
grad_norm2_out = 0.0f; int i = 0; while (i < n) {
float gpen = 0.0f, g_main = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float dtheta = (i < cost.referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f;
gpen = 2.0f * cost.transitionEnergyWeight * dtheta;
gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen);
g_main = 2.0f * cost.transitionCaptureWeight * fmaf(dx, -sum_s[i], dy * sum_c[i]);
}
else {
float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f), exp_arg = -tsg, exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), sig = 1.0f / (exp_val + 1.0f);
gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen);
g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist;
}
float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i;
}
if (cost.variableLen) {
int j = 0; while (j < n) {
float gi = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float gpenL = 0.0f; if (cost.has_transition_reference_lengths() && j < cost.referenceLengthCount)gpenL = 2.0f * cost.transitionLengthEnergyWeight * (q_in[n + j] - cost.referenceState[n + j]);
gi = 2.0f * cost.transitionCaptureWeight * fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL;
}
else { gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist; }
grad_out[n + j] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++j;
}
}
};
auto armijoLineSearch = [&](const float* q_base, float f_base, const float* dir, float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, unsigned& idx_out)noexcept->bool {
float alpha = alpha_io; int backtrack = 0;
while (backtrack < max_backtrack) {
int i = 0; while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; }
float x2 = 0.0f, y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try);
float t_try = clamp01(map.pointToT(q_out));
if (enforce_t_bounds) {
if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha = tau; ++backtrack; continue; } }
else { if (fabsf(t_try - t_lo) > T_EPS) { alpha = tau; ++backtrack; continue; } }
}
if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; }
alpha = tau; ++backtrack;
}
return false;
};
bool lbfgs_already_tried = false; int outer = 0;
while (outer < max_outer_iters) {
float grad[64], grad_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, grad, grad_norm2);
if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f)break;
float dir_gd[64]; int i = 0; while (i < dim) { dir_gd[i] = -grad[i]; ++i; }
float gtd_gd = -grad_norm2, eta_trial = eta, f_new = f_io, x_new = x_io, y_new = y_io; unsigned idx_new = idx_io;
bool found = armijoLineSearch(q_inout, f_io, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, idx_new);
if (!found)break;
std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
f_io = f_new; x_io = x_new; y_io = y_new; idx_io = idx_new; eta = eta_trial;
float rel_impr = (f_start > 1e-20f) ? ((f_start - f_io) / f_start) : 0.0f;
if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) {
lbfgs_already_tried = true;
float q_resume[64]; std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float));
float f_resume = f_io, x_resume = x_io, y_resume = y_io; unsigned idx_resume = idx_io; float eta_resume = eta;
float q_best_lbfgs[64]; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float));
float f_best_lbfgs = f_io, x_best_lbfgs = x_io, y_best_lbfgs = y_io; unsigned idx_best_lbfgs = idx_io;
float s_hist[9][64], y_hist[9][64], rho_hist[9], alpha_hist[9]; int hist_size = 0;
float gk[64], gk_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, gk, gk_norm2);
bool lbfgs_ok = true; float alpha_k = 2.0f / sqrtf(dim_f); int it_lbfgs = 0;
while (it_lbfgs < max_lbfgs_iters) {
if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f)break;
float dir[64];
if (hist_size == 0) { int d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; } }
else {
float q_vec[64]; int d = 0; while (d < dim) { q_vec[d] = gk[d]; ++d; }
for (int jj = hist_size - 1; jj >= 0; --jj) {
float dot_sq = 0.0f; int d = 0; while (d < dim) { dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq); ++d; }
float a_coeff = dot_sq * rho_hist[jj]; alpha_hist[jj] = a_coeff;
d = 0; while (d < dim) { q_vec[d] = fmaf(-a_coeff, y_hist[jj][d], q_vec[d]); ++d; }
}
float gamma = 1.0f; { int last = hist_size - 1; float yy = 0.0f; int d = 0; while (d < dim) { yy = fmaf(y_hist[last][d], y_hist[last][d], yy); ++d; }float ys = 1.0f / rho_hist[last]; if (yy > 0.0f)gamma = ys / yy; }
float r_vec[64]; d = 0; while (d < dim) { r_vec[d] = gamma * q_vec[d]; ++d; }
for (int jj = 0; jj < hist_size; ++jj) {
float dot_yr = 0.0f; int d = 0; while (d < dim) { dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr); ++d; }
float b_coeff = dot_yr * rho_hist[jj], coeff = alpha_hist[jj] - b_coeff;
d = 0; while (d < dim) { r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]); ++d; }
}
d = 0; while (d < dim) { dir[d] = -r_vec[d]; ++d; }
}
float gtd = 0.0f; int d = 0; while (d < dim) { gtd = fmaf(gk[d], dir[d], gtd); ++d; }
if (!(gtd == gtd) || gtd >= -eps_descent) { d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; }gtd = -gk_norm2; if (gtd >= -eps_descent) { lbfgs_ok = false; break; } }
float q_old[64]; std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float));
float g_old[64]; std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float));
float alpha_try = alpha_k, f_try = f_io, x_try = x_io, y_try = y_io; unsigned idx_try = idx_io;
bool step_ok = armijoLineSearch(q_inout, f_io, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, idx_try);
if (!step_ok) { lbfgs_ok = false; break; }
std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float));
f_io = f_try; x_io = x_try; y_io = y_try; idx_io = idx_try; alpha_k = alpha_try;
computeGrad(q_inout, x_io, y_io, gk, gk_norm2);
if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; }
if (f_io < f_best_lbfgs) { f_best_lbfgs = f_io; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x_io; y_best_lbfgs = y_io; idx_best_lbfgs = idx_io; }
float ys = 0.0f, s_new[64], y_new[64]; d = 0; while (d < dim) { float sd = q_inout[d] - q_old[d], yd = gk[d] - g_old[d]; s_new[d] = sd; y_new[d] = yd; ys = fmaf(yd, sd, ys); ++d; }
if (!(ys == ys) || ys <= eps_lbfgs_curv) { hist_size = 0; }
else {
if (hist_size < m_lbfgs) { int idx_hist = hist_size; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; }++hist_size; }
else { for (int jj = 0; jj < m_lbfgs - 1; ++jj) { rho_hist[jj] = rho_hist[jj + 1]; d = 0; while (d < dim) { s_hist[jj][d] = s_hist[jj + 1][d]; y_hist[jj][d] = y_hist[jj + 1][d]; ++d; } }int idx_hist = m_lbfgs - 1; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; }hist_size = m_lbfgs; }
}
if (gk_norm2 < 1e-12f)break; ++it_lbfgs;
}
if (lbfgs_ok) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; break; }
else { if (f_best_lbfgs < f_resume) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; } else { std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float)); f_io = f_resume; x_io = x_resume; y_io = y_resume; idx_io = idx_resume; }eta = eta_resume; }
}
++outer;
}
};
auto evaluate_trial_from_q = [&](const float
q_seed, float t_lo, float t_hi, bool enforce_t_bounds)noexcept->TrialPoint {
TrialPoint tr{}; std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float));
tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f);
if (tr.feasible)refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds);
tr.t = clamp01(map.pointToT(tr.q));
if (enforce_t_bounds) { if (tr.t < t_lo)tr.t = t_lo; if (tr.t > t_hi)tr.t = t_hi; }
update_best_from_trial(tr);
return tr;
};
auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds)noexcept->TrialPoint {
alignas(16)float q_seed[64]; map.map01ToPoint(t_base, q_seed);
return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds);
};
auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp)noexcept->IntervalND
{
if (!(Lp.t + T_EPS < Rp.t))return nullptr;
IntervalND
I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx);
finalize_interval_geometry(I);
return I;
};
auto find_container_index = [&](float t)noexcept->int {
int best_pos = -1; float best_len = FLT_MAX;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
if (best_pos >= 0)return best_pos;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
return best_pos;
};
auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff)noexcept {
int pos = find_container_index(tr.t); if (pos < 0)return;
IntervalND* src = H[static_cast<size_t>(pos)];
if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS))return;
float src_len = fmaf(src->x2, 1.0f, -src->x1);
IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float prevMmax = Mmax;
H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src;
if (!H.empty())std::make_heap(H.begin(), H.end(), ComparePtrND);
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur);
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(m_cur);
if (src_len + T_EPS >= dmax)recompute_dmax();
};
auto find_container_index_bounds = [&](float t, float& lo, float& hi)noexcept->int {
int best_pos = -1; float best_len = FLT_MAX;
for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } }
if (best_pos < 0) { for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } }
if (best_pos >= 0) { const IntervalND* I = H[static_cast<size_t>(best_pos)]; lo = I->x1; hi = I->x2; }
else { lo = 0.0f; hi = 1.0f; }
return best_pos;
};
auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, int pos, float r_eff_cur, float adaptive_coeff)noexcept {
if (pos < 0 || static_cast<size_t>(pos) >= H.size()) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff); return; }
IntervalND* src = H[static_cast<size_t>(pos)];
if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff); return; }
float src_len = fmaf(src->x2, 1.0f, -src->x1);
IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx);
IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float prevMmax = Mmax;
H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src;
if (!H.empty())std::make_heap(H.begin(), H.end(), ComparePtrND);
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
float m_cur = fmaf(r_eff_cur, Mmax, 0.0f);
L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur);
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(m_cur);
if (src_len + T_EPS >= dmax)recompute_dmax();
};
auto progress_outgoing = &noexcept {
for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) { if (itReq->req.test())itReq = g_pendingMulti.erase(itReq); else ++itReq; }
for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) { if (itReq->req.test())itReq = g_pendingBest.erase(itReq); else ++itReq; }
};
auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y)noexcept->bool {
auto take_indexed = &noexcept->bool {if (bestQIndexed.empty())return false; seed_vec = &bestQIndexed; seed_f = bestIndexValue; seed_idx = bestIndexFound; seed_x = bestIndexedX; seed_y = bestIndexedY; return true; };
auto take_feasible = &noexcept->bool {if (bestQ.empty())return false; seed_vec = &bestQ; seed_f = bestF; seed_idx = fullConstraintIndex; seed_x = bestX; seed_y = bestY; return true; };
auto take_reference = &noexcept->bool {if (!haveTransitionRefSeed)return false; seed_vec = &transitionReferenceSeed; seed_f = transitionRefF; seed_idx = transitionRefIdx; seed_x = transitionRefX; seed_y = transitionRefY; return true; };
seed_vec = nullptr; seed_f = FLT_MAX; seed_idx = 0u; seed_x = seed_y = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition)return take_feasible() || take_reference() || take_indexed();
return take_indexed() || take_feasible();
};
auto should_run_hj_this_iter = [&](float progress)noexcept->bool {
if (!hj_cfg.enabled)return false;
if (hj_cfg.require_adaptive_flag && !adaptive)return false;
int period = 1, phase = 0; select_hj_iteration_schedule(progress, period, phase);
if (period <= 1)return true;
return(it % period) == phase;
};
auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff, float progress)noexcept {
if (H.empty())return;
const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX, seedX = 0.0f, seedY = 0.0f; unsigned seedIdx = 0u;
if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY))return;
TrialPoint base{}; std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float));
base.t = clamp01(map.pointToT(base.q)); base.f = seedF; base.idx = seedIdx; base.x = seedX; base.y = seedY; base.feasible = (base.idx == fullConstraintIndex);
auto evaluate_hj_candidate = [&](const float* q_candidate)noexcept->TrialPoint {
float t_guess = clamp01(map.pointToT(q_candidate)); float lo = 0.0f, hi = 1.0f;
int pos = find_container_index_bounds(t_guess, lo, hi);
TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true);
inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff);
return tr;
};
apply_hj_coord_profile(progress);
int hj_order_mode = select_hj_order_mode(progress);
const int* hj_order = select_hj_order_ptr(select_hj_order_mode(progress));
TrialPoint current = base, base_before = base; bool improved_any = false;
int ord = 0; while (ord < dim) {
int d = hj_order[ord]; float step = hj_delta[d];
TrialPoint axis_base = current;
auto try_direction = [&](int sgn, TrialPoint& accepted_tp)noexcept->bool {
float q_candidate[64]; std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float));
q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]);
if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1e-12f)return false;
TrialPoint tp = evaluate_hj_candidate(q_candidate);
if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) { accepted_tp = tp; return true; }
return false;
};
int first_sign = hj_cfg.plus_first ? +1 : -1, second_sign = -first_sign;
bool improved_this_axis = false; TrialPoint best_axis_tp{}; bool have_axis_tp = false; TrialPoint tp_try{};
if (try_direction(first_sign, tp_try)) { best_axis_tp = tp_try; have_axis_tp = true; if (hj_cfg.opportunistic_axis) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } }
if (!hj_cfg.opportunistic_axis || !have_axis_tp) {
if (try_direction(second_sign, tp_try)) { if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) { best_axis_tp = tp_try; have_axis_tp = true; } }
if (have_axis_tp) { current = best_axis_tp; improved_any = true; improved_this_axis = true; }
}
if (improved_this_axis && hj_cfg.expand_delta_on_success) { hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor); }
++ord;
}
if (improved_any) {
if (hj_cfg.use_pattern) {
float q_pattern[64]; for (int d = 0; d < dim; ++d)q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]);
TrialPoint pat_tp = evaluate_hj_candidate(q_pattern);
if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f))current = pat_tp;
}
}
else { for (int d = 0; d < dim; ++d)hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta); }
};
auto ensure_queue_nonempty = [&](float r_eff_cur)noexcept->bool {
if (!H.empty())return true;
const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX, seedX = 0.0f, seedY = 0.0f; unsigned seedIdx = 0u;
if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY))return false;
float t_center = clamp01(map.pointToT(seedQ->data())), half = fmaxf(1e-4f * dim_f, 1e-6f), t1b = clamp01(t_center - half), t2b = clamp01(t_center + half), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)return false;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f));
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
recompute_dmax(); return true;
};
while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg dummy; g_world->recv(boost::mpi::any_source, 2, dummy); }
int seedStride = 64;
AlignedFloatVec seeds(static_cast<size_t>(1024) * static_cast<size_t>(seedStride), 0.0f);
int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds.data(), seedStride, static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()), 7919.0f, static_cast<float>(seed))));
int K_raw = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)), K = (K_raw > 2) ? K_raw : 2;
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u);
int rank = g_world->rank(), world = g_world->size();
size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1));
for (int i = 0; i < seedCnt; ++i) {
const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride);
float t_seed = clamp01(map.pointToT(s));
float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed)), t2b = clamp01(fmaf(interval_size, 1.0f, t_seed)), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) { I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); }
else { float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f), end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f), mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); }
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
std::vector<float> grid_base; grid_base.reserve(static_cast<size_t>(K) + 2u); grid_base.emplace_back(a);
for (int k = 1; k <= K; ++k) { float t = clamp01(fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)))); grid_base.emplace_back(t); }
grid_base.emplace_back(b);
std::vector<TrialPoint> grid_trials(grid_base.size());
for (size_t i = 0u; i < grid_base.size(); ++i) {
float lo = grid_base[i], hi = grid_base[i];
if (i == 0u) { lo = hi = a; }
else if (i + 1u == grid_base.size()) { lo = hi = b; }
else { lo = 0.5f * (grid_base[i - 1u] + grid_base[i]); hi = 0.5f * (grid_base[i] + grid_base[i + 1u]); }
grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true);
}
for (size_t i = 1u; i < grid_trials.size(); ++i) {
IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
recompute_dmax();
if (haveTransitionRefSeed) {
TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false);
if (!H.empty()) { inject_trial_into_queue(ref_tp, r, 1.0f); recompute_dmax(); }
else { ensure_queue_nonempty(r); }
}
g_world->barrier();
while (true) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
stag_r_multiplier = 1.4f - 1.1f * fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f);
float p_arg = fmaf(p, 2.3f, -2.9775f);
float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; --stag_boost_remaining; }
float exp_arg = fmaf(B_dim, p, 0.0f), exp_arg__ = fmaf(B_dim__, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)), second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)), third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)), fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)), rr = sqrtf(fmaf(-p, 1.0f, 1.0f)), xx = p * p, tt = fmaf(500.0f, p, -486.95472f);
float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f);
adaptive_coeff__ = fmaf(fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__);
float grad_norm2_best = 0.0f;
if (no_improve > 0 && !bestQ.empty()) {
float acc_best = 0.0f, phi_best[64]; int ii_best = 0;
while (ii_best < n) { acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best); phi_best[ii_best] = acc_best; ++ii_best; }
float s_best[64], c_best[64]; FABE13_SINCOS(phi_best, s_best, c_best, n);
float as_best = 0.0f, ac_best = 0.0f, sum_s_best[64], sum_c_best[64]; int k_best = n - 1;
while (k_best >= 0) { float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; }
alignas(16)float prefix_tail_best[64] = { 0.0f };
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles() && cost.transitionPrefixEnergyWeight > 0.0f) {
alignas(16)float prefix_delta_best[64] = { 0.0f }; float phi_pref = 0.0f; int jj = 0;
while (jj < cost.referenceAngleCount) { phi_pref += bestQ[jj]; prefix_delta_best[jj] = ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]); ++jj; }
float tail = 0.0f; jj = cost.referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta_best[jj]; prefix_tail_best[jj] = tail; --jj; }
}
float dx_best = fmaf(bestX, 1.0f, -cost.targetX), dy_best = fmaf(bestY, 1.0f, -cost.targetY), dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)), inv_dist_best = (dist_best > 1e-20f) ? (1.0f / dist_best) : 0.0f;
int i_best = 0; while (i_best < n) {
float gpen_best = 0.0f, g_main_best = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float dtheta_best = (i_best < cost.referenceAngleCount) ? cost.wrapped_angle_delta(bestQ.data(), i_best) : 0.0f;
gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best;
gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_best[i_best], gpen_best);
g_main_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]);
}
else {
float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f), exp_arg_g = -tsg, exp_val = fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), sig = 1.0f / (exp_val + 1.0f);
gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best);
g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best;
}
float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best;
}
if (cost.variableLen) {
int j_best = 0; while (j_best < n) {
float gi_best = 0.0f;
if (cost.solveMode == ManipSolveMode::Transition) {
float gpenL_best = 0.0f; if (cost.has_transition_reference_lengths() && j_best < cost.referenceLengthCount)gpenL_best = 2.0f * cost.transitionLengthEnergyWeight * (bestQ[n + j_best] - cost.referenceState[n + j_best]);
gi_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) + gpenL_best;
}
else { gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best; }
grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++j_best;
}
}
}
bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f);
float r_eff = (dim > 2) ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f);
if (stagnation) {
stag_boost_remaining = n_stag_iters;
float t_seeds[32]; int seed_count = 0;
if (cost.solveMode == ManipSolveMode::Transition && cost.has_transition_reference_angles()) {
alignas(16)float q_center[64]; std::memcpy(q_center, cost.referenceState, static_cast<size_t>(dim) * sizeof(float));
t_seeds[seed_count++] = clamp01(map.pointToT(q_center));
if (!bestQ.empty() && seed_count < 32)t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data()));
unsigned st_local = seed + static_cast<unsigned>(it) + 911u; int local_variants = 2 + static_cast<int>(sqrtf(dim_f));
for (int v = 0; v < local_variants && seed_count < 32; ++v) {
float q_temp[64]; std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float));
st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; float rnd0 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;
int aidx = v % n; q_temp[aidx] = fmaf((2.0f * rnd0 - 1.0f), 0.25f * cost.transitionMaxAngleStep, q_temp[aidx]);
if (cost.variableLen && cost.has_transition_reference_lengths()) {
st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; float rnd1 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f;
int lidx = v % n; q_temp[n + lidx] = fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]);
}
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
else {
int num_ik = 1 + static_cast<int>(sqrtf(dim_f));
float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY), max_reach = 0.0f;
if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)max_reach += map.high[n + ii]; }
else { max_reach = static_cast<float>(n) * cost.fixedLength; }
float ratio = dist_to_target / fmaxf(max_reach, 1e-20f); bool prefer_extended = (ratio > 0.7f), prefer_compact = (ratio < 0.4f), use_ik = !(ratio > 0.4f && ratio < 0.7f);
if (!use_ik) {
AlignedFloatVec temp_S(static_cast<size_t>(1024) * static_cast<size_t>(64), 0.0f);
int sobol_gen = generate_sobol_seeds_limited(map, dim, temp_S.data(), 64, seed + static_cast<unsigned>(it), 1024);
int num_sobol = num_ik; for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) { const float* s = temp_S.data() + static_cast<size_t>(kk) * 64; t_seeds[seed_count++] = clamp01(map.pointToT(s)); }
}
else {
float angles_ccd[32] = { 0.0f }, lengths_ccd[32];
if (cost.variableLen) { float len_low = map.low[n], len_high = map.high[n], avg_len = 0.5f * (len_low + len_high); for (int ii = 0; ii < n; ++ii)lengths_ccd[ii] = avg_len; }
else { for (int ii = 0; ii < n; ++ii)lengths_ccd[ii] = cost.fixedLength; }
ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10);
float angles_fabrik[32] = { 0.0f }, lengths_fabrik[32];
if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)lengths_fabrik[ii] = lengths_ccd[ii]; }
else { for (int ii = 0; ii < n; ++ii)lengths_fabrik[ii] = cost.fixedLength; }
float targetX_fab = cost.targetX, targetY_fab = cost.targetY;
for (int iter_fab = 0; iter_fab < 3; ++iter_fab) {
float prevX = targetX_fab, prevY = targetY_fab;
for (int j = n - 1; j >= 0; --j) { float len = lengths_fabrik[j], angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val, c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; }
}
if (prefer_extended) {
{ float q_ccd[64]; for (int ii = 0; ii < n; ++ii)q_ccd[ii] = angles_ccd[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_ccd[n + ii] = lengths_ccd[ii]; }t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd)); }
unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; int remaining = num_ik - 1;
for (int v = 0; v < remaining && seed_count < 32; ++v) {
float noisy_angles[32], noisy_lengths[32];
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f;
float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f, hi = 2.6179938779914944f;
if (noisy_angles[ii] < lo)noisy_angles[ii] = lo; if (noisy_angles[ii] > hi)noisy_angles[ii] = hi;
}
if (cost.variableLen) {
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f;
if (noisy_lengths[ii] < 0.5f)noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f)noisy_lengths[ii] = 2.0f;
}
}
float q_temp[64]; for (int ii = 0; ii < n; ++ii)q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_temp[n + ii] = noisy_lengths[ii]; }
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
else if (prefer_compact) {
{ float q_fabrik[64]; for (int ii = 0; ii < n; ++ii)q_fabrik[ii] = angles_fabrik[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_fabrik[n + ii] = lengths_fabrik[ii]; }t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik)); }
unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; int remaining = num_ik - 1;
for (int v = 0; v < remaining && seed_count < 32; ++v) {
float noisy_angles[32], noisy_lengths[32];
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f;
float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f, hi = 2.6179938779914944f;
if (noisy_angles[ii] < lo)noisy_angles[ii] = lo; if (noisy_angles[ii] > hi)noisy_angles[ii] = hi;
}
if (cost.variableLen) {
for (int ii = 0; ii < n; ++ii) {
st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f;
noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f;
if (noisy_lengths[ii] < 0.5f)noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f)noisy_lengths[ii] = 2.0f;
}
}
float q_temp[64]; for (int ii = 0; ii < n; ++ii)q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii)q_temp[n + ii] = noisy_lengths[ii]; }
t_seeds[seed_count++] = clamp01(map.pointToT(q_temp));
}
}
}
}
for (int s = 0; s < seed_count; ++s) {
float t_center = t_seeds[s], interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f), t1b = clamp01(t_center - 0.5f * interval_size), t2b = clamp01(t_center + 0.5f * interval_size), mid = 0.5f * (t1b + t2b);
TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true), right_tp = evaluate_trial_from_t(t2b, mid, t2b, true);
IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr)continue;
update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f);
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
}
recompute_dmax(); no_improve = 0;
}
float bestFOld = bestF;
bool local_idle = !ensure_queue_nonempty(r_eff);
bool do_hj_this_iter = should_run_hj_this_iter(p);
if (!local_idle) {
if (do_hj_this_iter) {
run_hooke_jeeves_iteration(r_eff, adaptive_coeff, p);
}
else {
std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back();
float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f), tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2);
TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true);
IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx);
IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2);
finalize_interval_geometry(L); finalize_interval_geometry(Rv);
float Mloc = fmaxf(L->M, Rv->M), prevMmax = Mmax;
update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);
m = fmaf(r_eff, Mmax, 0.0f);
float len1 = fmaf(trNew.t, 1.0f, -x1), len2 = fmaf(x2, 1.0f, -trNew.t);
if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) { dmax = fmaxf(len1, len2); for (auto* pI : H) { float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax)dmax = Ls; } }
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
float alpha = p * p, beta = fmaf(-alpha, 1.0f, 2.0f), MULT = (1.0f / fmaxf(dmax, 1e-20f)) * Mmax, global_coeff = fmaf(MULT, r_eff, -MULT), GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
size_t sz = H.size(); if (sz > 0u) { if (!hasHardConstraints)RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); }
}
else {
L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f))recompute_heap_constM(m);
}
H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
delete cur;
}
}
if (!H.empty()) { _mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0); if (H.size() > 1u)_mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0); }
if (!H.empty() && (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff, 0.0f)) && it - last_send_T >= send_interval_T) {
last_send_T = it; progress_outgoing();
size_t scan_count = H.size(); unsigned intervals_to_send = (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u;
if (intervals_to_send > scan_count)intervals_to_send = static_cast<unsigned>(scan_count);
if (intervals_to_send > 0u) {
float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f)), beta = 1.0f - alpha;
float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f), w_size = 5.085f - w_pos;
int num_bins = (p < 1.0f - 1e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1;
std::vector<float> pos_metric(scan_count), size_metric(scan_count), R_val(scan_count);
float R_max = -FLT_MAX, R_min = FLT_MAX, size_max = -FLT_MAX, size_min = FLT_MAX, pos_max = -FLT_MAX, pos_min = FLT_MAX;
for (size_t idx = 0u; idx < scan_count; ++idx) {
IntervalND* I = H[idx]; float center = 0.5f * (I->x1 + I->x2), len = fmaf(I->x2, 1.0f, -I->x1), size = len * static_cast<float>(1 << I->span_level);
if (p < 0.95f && num_bins>1) { float bin = floorf(center * static_cast<float>(num_bins)); if (bin >= static_cast<float>(num_bins))bin = static_cast<float>(num_bins - 1); pos_metric[idx] = bin / static_cast<float>(num_bins - 1); }
else { pos_metric[idx] = center; }
size_metric[idx] = size; R_val[idx] = I->R;
if (pos_metric[idx] > pos_max)pos_max = pos_metric[idx]; if (pos_metric[idx] < pos_min)pos_min = pos_metric[idx];
if (size > size_max)size_max = size; if (size < size_min)size_min = size;
if (I->R > R_max)R_max = I->R; if (I->R < R_min)R_min = I->R;
}
auto norm = [&](float val, float minv, float maxv)noexcept->float {float denom = maxv - minv; if (fabsf(denom) <= 1e-20f)return 0.0f; return(val - minv) / denom; };
std::vector<float> R_norm(scan_count), pos_norm(scan_count), size_norm(scan_count);
for (size_t idx = 0u; idx < scan_count; ++idx) { R_norm[idx] = norm(R_val[idx], R_min, R_max); pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max); size_norm[idx] = norm(size_metric[idx], size_min, size_max); }
std::vector<int> selected; selected.reserve(intervals_to_send);
std::vector<unsigned char> used(scan_count, 0u); selected.push_back(0); used[0] = true;
while (selected.size() < intervals_to_send) {
int best_idx = -1; float best_score = -FLT_MAX;
for (size_t idx = 0u; idx < scan_count; ++idx) {
if (used[idx] != 0u)continue;
float min_dist = FLT_MAX; for (int s : selected) { float d2 = 0.0f, d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)]; d2 += w_pos * d * d; d = size_norm[idx] - size_norm[static_cast<size_t>(s)]; d2 += w_size * d * d; if (d2 < min_dist)min_dist = d2; }
float novelty = sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1e-20f)), score = alpha * R_norm[idx] + beta * novelty;
if (score > best_score) { best_score = score; best_idx = static_cast<int>(idx); }
}
if (best_idx < 0)break; selected.push_back(best_idx); used[static_cast<size_t>(best_idx)] = 1u;
}
MultiCrossMsg out{}; out.count = static_cast<unsigned>(selected.size()); float* dest = out.intervals;
for (unsigned s = 0u; s < out.count; ++s) { IntervalND* Tt = H[static_cast<size_t>(selected[s])]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; }
size_t iterations = comm_levels; bool active = true; bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0; size_t ii2 = 0u;
while (ii2 < iterations && active) { size_t step = 1ULL << ii2; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_pendingMulti.emplace_back(g_world, partner, out); if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) { g_pendingMulti.front().req.wait(); g_pendingMulti.pop_front(); }active = false; } }++ii2; }
++exchange_counter_T;
}
}
bool indexedBestImprovedSinceLastSend = !bestQIndexed.empty() && better_indexed(bestIndexFound, bestIndexValue, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF);
if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) {
last_send_best = it; progress_outgoing();
size_t iterations = comm_levels; bool active = true; bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0; size_t ii2 = 0u;
while (ii2 < iterations && active) { size_t step = 1ULL << ii2; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best); if (am_sender) { BestSolutionMsg outMsg; InitBestSolutionMsg(outMsg); if (FillBestSolutionMsg(outMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed) && better_indexed(outMsg.bestIndex, outMsg.bestF, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) { g_pendingBest.emplace_back(g_world, partner, outMsg); lastSentBestMsg = outMsg; }if (g_pendingBest.size() > MAX_INFLIGHT_BEST) { g_pendingBest.front().req.wait(); g_pendingBest.pop_front(); }active = false; } }++ii2; }
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in);
for (unsigned ii = 0u; ii < in.count; ++ii) {
const float
d = &in.intervals[ii * 5u]; float sx = d[0], ex = d[2];
alignas(16)float ql[64]; float tx = 0.0f, ty = 0.0f; unsigned v1i = 0u, v2i = 0u; float y1i = 0.0f, y2i = 0.0f;
map.map01ToPoint(sx, ql); cost.evaluate_indexed(ql, tx, ty, v1i, y1i);
map.map01ToPoint(ex, ql); cost.evaluate_indexed(ql, tx, ty, v2i, y2i);
IntervalND
inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i);
finalize_interval_geometry(inj);
float prevMmax = Mmax, candidateMmax = fmaxf(Mmax, inj->M), m_inj = fmaf(r_eff, candidateMmax, 0.0f);
inj->ChangeCharacteristic(m_inj);
if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) {
update_pockets_and_Mmax(inj); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
if (Mmax > fmaf(adaptive_coeff, prevMmax, 0.0f))recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f));
}
else { delete inj; }
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming);
UpdateIndexedBestFromMessage(incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY);
}
float interval_len = H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam);
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter - 1) {
for (auto& s : g_pendingMulti)s.req.wait(); for (auto& s : g_pendingBest)s.req.wait();
g_pendingMulti.clear(); g_pendingBest.clear();
while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage(incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); }
g_world->barrier();
BestSolutionMsg bestMsg; InitBestSolutionMsg(bestMsg);
if (!bestQIndexed.empty()) { FillBestSolutionMsg(bestMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed); }
size_t iterations = comm_levels; bool active = true; size_t itx = 0u;
while (itx < iterations && active) { size_t step = 1ULL << itx; int partner = rank ^ static_cast<int>(step); if (partner < world) { bool am_sender = (rank & static_cast<int>(step)) != 0; if (am_sender) { g_world->send(partner, 3, bestMsg); active = false; } else { BestSolutionMsg inBest; InitBestSolutionMsg(inBest); g_world->recv(partner, 3, inBest); if (better_indexed(inBest.bestIndex, inBest.bestF, bestMsg.bestIndex, bestMsg.bestF))bestMsg = inBest; } }++itx; }
if (rank == 0) { UpdateIndexedBestFromMessage(bestMsg, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); }
bool haveFeasibleBest = (bestIndexFound == fullConstraintIndex) && !bestQ.empty();
if (haveFeasibleBest && cost.solveMode != ManipSolveMode::Transition) {
alignas(16)float q_local[64]; std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));
float x_final = bestX, y_final = bestY, f_final = bestF; int last = n - 1;
float lo = (last == 0) ? -1.0471975511965977f : -2.6179938779914944f, hi = 2.6179938779914944f;
float bestLocF = f_final, saved = q_local[last], delta = 0.05f;
while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo)cand = lo; else if (cand > hi)cand = hi; float backup = q_local[last]; q_local[last] = cand; float x2_loc = 0.0f, y2_loc = 0.0f; unsigned idx2 = 0u; float val2 = 0.0f; cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2); if (val2 < bestLocF) { bestLocF = val2; x_final = x2_loc; y_final = y2_loc; saved = cand; }q_local[last] = backup; sgn += 2; }delta *= 0.5f; }
if (bestLocF < f_final) { q_local[last] = saved; bestF = bestLocF; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; }
}
else if (haveFeasibleBest) { bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; }
else { bestQ = bestQIndexed; bestF = bestIndexValue; bestX = bestIndexedX; bestY = bestIndexedY; }
out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len;
return;
}
++it;
}
}"

Pensamento

Considering potential optimizations and guarding edge cases

I think I need to ensure that certain values, like q_candidate and q_try, are properly clamped before their use in specific stages like stagnation and line search. Also, it seems I should precompute hasTransitionRefSeed only when the reference is okay, and guard against an issue where empty heaps could cause crashes with certain operations.

Ensuring safety while maintaining performance

I need to add a guard to prevent an issue when recompute is called with empty heaps. Although it's not transition-specific, it could happen in general. I've kept the fast path identical for non-transition cases, which is reassuring for performance. I'll provide an explanation before the code for clarity.

Thought for 5m 33s

Resposta

Ниже — цельный метод. Я сохранил быстрый fast-path для обычного режима, а для Transition вернул только те вещи, без которых он ломается: проекцию кандидатов в допустимую область, корректную обработку ref_ok для transition-reference seed и отбрасывание невалидных шагов в line search. При этом extra-логика вынесена в отдельный projected-path, чтобы не замедлять обычный режим лишними ветками внутри горячих циклов.

cpp
static __declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { constexpr int FAST_STATE_DIM = AGP_MAX_STATE_DIM; constexpr int FAST_SEED_STRIDE = AGP_MAX_SEED_STRIDE; constexpr int FAST_MAX_SEEDS = AGP_MAX_GENERATED_SEEDS; struct TrialPoint { alignas(16) float q[FAST_STATE_DIM]; float t = 0.0f; float f = FLT_MAX; float x = 0.0f; float y = 0.0f; unsigned idx = 0u; bool feasible = false; }; using AlignedFloatVec = std::vector<float, boost::alignment::aligned_allocator<float, 16u>>; constexpr float T_EPS = 1.0e-7f; const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = static_cast<float>(dim); const unsigned fullConstraintIndex = cost.feasible_index(); const bool hasHardConstraints = (fullConstraintIndex != 0u); if (n > 32 || dim > FAST_STATE_DIM) { bestQ.clear(); bestF = FLT_MAX; bestX = 0.0f; bestY = 0.0f; out_iterations = 0u; out_achieved_epsilon = 0.0f; return; } const bool transitionMode = (cost.solveMode == ManipSolveMode::Transition); const bool hasTransitionRefAngles = transitionMode && cost.has_transition_reference_angles(); const bool hasTransitionRefLengths = transitionMode && cost.has_transition_reference_lengths(); const bool useJointAngleLimitConstraints = cost.use_joint_angle_limit_constraints(); const bool needsProjectedCandidates = transitionMode || useJointAngleLimitConstraints; const bool preserveTransitionJointMagnitudeFloor = transitionMode && cost.transitionPreserveJointMagnitudeFloor && hasTransitionRefAngles; const float* const ref_state = cost.referenceState; const int referenceAngleCount = hasTransitionRefAngles ? cost.referenceAngleCount : 0; const int referenceLengthCount = hasTransitionRefLengths ? cost.referenceLengthCount : 0; const float transitionMaxAngleStep = transitionMode ? cost.transitionMaxAngleStep : 0.0f; const float transitionJointSignDeadband = transitionMode ? cost.transitionJointSignDeadband : 0.0f; const float jointAngleHard = useJointAngleLimitConstraints ? fabsf(cost.minTheta) : 0.0f; const float baseLengthLo = cost.variableLen ? (cost.fixedLength / cost.stretchFactor) : 0.0f; const float baseLengthHi = cost.variableLen ? (cost.fixedLength * cost.stretchFactor) : 0.0f; alignas(16) float angle_lo_bound[FAST_STATE_DIM]; alignas(16) float angle_hi_bound[FAST_STATE_DIM]; alignas(16) float length_lo_bound[FAST_STATE_DIM]; alignas(16) float length_hi_bound[FAST_STATE_DIM]; alignas(16) float joint_min_abs_bound[FAST_STATE_DIM]; if (needsProjectedCandidates) { int i = 0; while (i < n) { float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; float hi = 2.6179938779914943653855361527329f; if (hasTransitionRefAngles && i < referenceAngleCount) { const float ref = ref_state[i]; const float lo_ref = ref - transitionMaxAngleStep; const float hi_ref = ref + transitionMaxAngleStep; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } if (useJointAngleLimitConstraints) { const float lo_hard = -jointAngleHard; const float hi_hard = jointAngleHard; if (lo_hard > lo) lo = lo_hard; if (hi_hard < hi) hi = hi_hard; } angle_lo_bound[i] = lo; angle_hi_bound[i] = hi; if (preserveTransitionJointMagnitudeFloor && i < referenceAngleCount && fabsf(ref_state[i]) > transitionJointSignDeadband) { joint_min_abs_bound[i] = cost.min_joint_abs_angle_bound(i); } else { joint_min_abs_bound[i] = 0.0f; } ++i; } if (transitionMode && cost.variableLen) { i = 0; while (i < n) { float lo = baseLengthLo; float hi = baseLengthHi; if (hasTransitionRefLengths && i < referenceLengthCount) { const float refL = ref_state[n + i]; const float dL = cost.max_length_step(i); const float lo_ref = refL - dL; const float hi_ref = refL + dL; if (lo_ref > lo) lo = lo_ref; if (hi_ref < hi) hi = hi_ref; } length_lo_bound[i] = lo; length_hi_bound[i] = hi; ++i; } } } auto clampPointProjected = [&](float* q) noexcept { if (!needsProjectedCandidates) return; int i = 0; while (i < n) { float qi = q[i]; const float lo = angle_lo_bound[i]; const float hi = angle_hi_bound[i]; if (qi < lo) qi = lo; else if (qi > hi) qi = hi; const float min_abs = joint_min_abs_bound[i]; if (min_abs > 0.0f) { if (fabsf(qi) < min_abs) { qi = copysignf(min_abs, ref_state[i]); if (qi < lo) qi = lo; else if (qi > hi) qi = hi; } } q[i] = qi; ++i; } if (transitionMode && cost.variableLen) { i = 0; while (i < n) { float qi = q[n + i]; const float lo = length_lo_bound[i]; const float hi = length_hi_bound[i]; if (qi < lo) qi = lo; else if (qi > hi) qi = hi; q[n + i] = qi; ++i; } } }; unsigned exchange_counter = 0u; unsigned exchange_counter_T = 0u; alignas(16) float M_by_span[12]; for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior; float Mmax = M_prior; alignas(16) float phi[FAST_STATE_DIM]; alignas(16) float s_arr[FAST_STATE_DIM]; alignas(16) float c_arr[FAST_STATE_DIM]; alignas(16) float sum_s[FAST_STATE_DIM]; alignas(16) float sum_c[FAST_STATE_DIM]; alignas(16) float q_try[FAST_STATE_DIM]; bestQ.reserve(static_cast<size_t>(dim)); AlignedFloatVec bestQIndexed; bestQIndexed.reserve(static_cast<size_t>(dim)); AlignedFloatVec transitionReferenceSeed; float transitionRefF = FLT_MAX; unsigned transitionRefIdx = 0u; float transitionRefX = 0.0f; float transitionRefY = 0.0f; bool haveTransitionRefSeed = false; if (hasTransitionRefAngles) { const int refDim = cost.full_state_dim(); transitionReferenceSeed.resize(static_cast<size_t>(refDim)); std::memcpy( transitionReferenceSeed.data(), ref_state, static_cast<size_t>(refDim) * sizeof(float)); float xr = 0.0f; float yr = 0.0f; unsigned idxr = 0u; float fr = 0.0f; const bool ref_ok = cost.evaluate_indexed(transitionReferenceSeed.data(), xr, yr, idxr, fr); transitionRefF = fr; transitionRefIdx = idxr; transitionRefX = xr; transitionRefY = yr; haveTransitionRefSeed = ref_ok; } unsigned bestIndexFound = 0u; float bestIndexValue = FLT_MAX; float bestIndexedX = 0.0f; float bestIndexedY = 0.0f; BestSolutionMsg lastSentBestMsg; InitBestSolutionMsg(lastSentBestMsg); int no_improve = 0; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = b - a; const float initial_len = dmax; const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f); const float A_dim__ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f); const float B_dim = fmaf(A_dim, 0.7f, 0.0f); const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f); const float log_argument = A_dim - 2.03f; const float log_argument__ = A_dim__ - 2.0f; const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__; const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__; int it = 0; int stag_boost_remaining = 0; float stag_r_multiplier = 0.0f; int last_send_T = 0; const int send_interval_T = 7; int last_send_best = 0; const int send_interval_best = 2; const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)); const int noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f)); const AGPHookeJeevesParams& hj_cfg = g_agp_hj_params; auto clamp01 = [](float v) noexcept -> float { return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v); }; auto lerp_hj = [](float a0, float a1, float t) noexcept -> float { return fmaf(t, a1 - a0, a0); }; auto select_hj_iteration_schedule = [&](float progress, int& period_out, int& phase_out) noexcept { period_out = hj_cfg.iteration_period; phase_out = hj_cfg.iteration_phase; int count = hj_cfg.iteration_schedule_count; if (count <= 0) return; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJIterationScheduleRule& rule = hj_cfg.iteration_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { period_out = rule.period; phase_out = rule.phase; break; } } if (period_out < 1) period_out = 1; phase_out %= period_out; if (phase_out < 0) phase_out += period_out; }; auto select_hj_coord_profile = [&](float progress) noexcept -> AGPHJCoordScaleProfile { int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); if (count > 0) { for (int i = 0; i < count; ++i) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { return rule.profile; } } } AGPHJCoordScaleProfile fallback{}; fallback.angle_scale_proximal = hj_cfg.angle_scale_first; fallback.angle_scale_distal = hj_cfg.angle_scale_last; fallback.length_scale_proximal = hj_cfg.length_scale_first; fallback.length_scale_distal = hj_cfg.length_scale_last; return fallback; }; auto select_hj_order_mode = [&](float progress) noexcept -> int { int order_mode = hj_cfg.order_mode; int count = hj_cfg.order_schedule_count; if (count <= 0) return order_mode; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; const float p01 = clamp01(progress); for (int i = 0; i < count; ++i) { const AGPHJOrderScheduleRule& rule = hj_cfg.order_schedule[i]; if (p01 < rule.progress_lt || i + 1 == count) { order_mode = rule.order_mode; break; } } return order_mode; }; auto coord_profile_scale = [&](int d, const AGPHJCoordScaleProfile& profile) noexcept -> float { if (d < n) { const float t01 = (n > 1) ? (static_cast<float>(d) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.angle_scale_proximal, profile.angle_scale_distal, t01); } if (cost.variableLen) { const int j = d - n; const float t01 = (n > 1) ? (static_cast<float>(j) / static_cast<float>(n - 1)) : 0.0f; return lerp_hj(profile.length_scale_proximal, profile.length_scale_distal, t01); } return 1.0f; }; alignas(16) float hj_delta[FAST_STATE_DIM]; alignas(16) float hj_profile_scale_applied[FAST_STATE_DIM]; alignas(64) int hj_order_natural[FAST_STATE_DIM]; alignas(64) int hj_order_reverse[FAST_STATE_DIM]; alignas(64) int hj_order_interleaved_proximal[FAST_STATE_DIM]; alignas(64) int hj_order_interleaved_distal[FAST_STATE_DIM]; alignas(64) int hj_order_lengths_first[FAST_STATE_DIM]; { int pos = 0; while (pos < dim) { hj_order_natural[pos] = pos; ++pos; } pos = 0; int d = dim - 1; while (d >= 0) { hj_order_reverse[pos] = d; ++pos; --d; } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; hj_order_interleaved_proximal[pos] = n + j; ++pos; ++j; } } else { int j = 0; while (j < n) { hj_order_interleaved_proximal[pos] = j; ++pos; ++j; } } pos = 0; if (cost.variableLen) { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; hj_order_interleaved_distal[pos] = n + j; ++pos; --j; } } else { int j = n - 1; while (j >= 0) { hj_order_interleaved_distal[pos] = j; ++pos; --j; } } pos = 0; if (cost.variableLen) { int j = 0; while (j < n) { hj_order_lengths_first[pos] = n + j; ++pos; ++j; } } int j = 0; while (j < n) { hj_order_lengths_first[pos] = j; ++pos; ++j; } } auto select_hj_order_ptr = [&](int order_mode) noexcept -> const int* { switch (order_mode) { case AGP_HJ_ORDER_NATURAL: return hj_order_natural; case AGP_HJ_ORDER_REVERSE: return hj_order_reverse; case AGP_HJ_ORDER_INTERLEAVED_PROXIMAL: return hj_order_interleaved_proximal; case AGP_HJ_ORDER_LENGTHS_FIRST: return hj_order_lengths_first; default: return hj_order_interleaved_distal; } }; int hj_profile_cache_key = INT_MIN; auto apply_hj_coord_profile = [&](float progress) noexcept { AGPHJCoordScaleProfile profile{}; int key = -2; int count = hj_cfg.coord_profile_schedule_count; if (count > AGP_HJ_MAX_PROGRESS_RULES) count = AGP_HJ_MAX_PROGRESS_RULES; if (count > 0) { int i = 0; while (i < count) { const AGPHJCoordScaleRule& rule = hj_cfg.coord_profile_schedule[i]; if (progress < rule.progress_lt || i + 1 == count) { profile = rule.profile; key = i; break; } ++i; } } else { profile.angle_scale_proximal = hj_cfg.angle_scale_first; profile.angle_scale_distal = hj_cfg.angle_scale_last; profile.length_scale_proximal = hj_cfg.length_scale_first; profile.length_scale_distal = hj_cfg.length_scale_last; } if (key == hj_profile_cache_key) return; int d = 0; while (d < dim) { const float new_scale = fmaxf(coord_profile_scale(d, profile), 1.0e-12f); const float old_scale = fmaxf(hj_profile_scale_applied[d], 1.0e-12f); if (fabsf(new_scale - old_scale) > 1.0e-12f) { hj_delta[d] = fmaxf(hj_delta[d] * (new_scale / old_scale), hj_cfg.min_delta); hj_profile_scale_applied[d] = new_scale; } ++d; } hj_profile_cache_key = key; }; { const AGPHJCoordScaleProfile initial_hj_profile = select_hj_coord_profile(0.0f); for (int d = 0; d < dim; ++d) { const float span = fmaf(map.high[d], 1.0f, -map.low[d]); const float scale = fmaxf(coord_profile_scale(d, initial_hj_profile), 1.0e-12f); hj_profile_scale_applied[d] = scale; hj_delta[d] = fmaxf(span * hj_cfg.init_delta_frac * scale, hj_cfg.init_delta_abs); } } auto t_to_idx = [&](float t) -> unsigned long long { return static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); }; auto finalize_interval_geometry = [&](IntervalND* I) noexcept { I->i1 = t_to_idx(I->x1); I->i2 = t_to_idx(I->x2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); }; auto update_pockets_and_Mmax = [&](IntervalND* I) noexcept { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; auto recompute_dmax = [&]() noexcept { float new_dmax = 0.0f; for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > new_dmax) new_dmax = Ls; } dmax = (new_dmax > 0.0f) ? new_dmax : initial_len; }; auto recompute_heap_constM = [&](float m_cur) noexcept { const size_t sz = H.size(); if (sz == 0u) return; if (!hasHardConstraints) RecomputeR_ConstM_AVX2_ND(H.data(), sz, m_cur); else RecomputeR_ConstM_Mixed_ND(H.data(), sz, m_cur); std::make_heap(H.begin(), H.end(), ComparePtrND); }; auto update_best_from_trial = [&](const TrialPoint& tr) noexcept { if (better_indexed(tr.idx, tr.f, bestIndexFound, bestIndexValue)) { bestIndexFound = tr.idx; bestIndexValue = tr.f; agp_store_q(bestQIndexed, tr.q, dim); bestIndexedX = tr.x; bestIndexedY = tr.y; } if (tr.feasible) { if (tr.f < bestF) { bestF = tr.f; agp_store_q(bestQ, tr.q, dim); bestX = tr.x; bestY = tr.y; no_improve = 0; } else { ++no_improve; } } }; auto refine_trial = [&](float* q_inout, float& x_io, float& y_io, unsigned& idx_io, float& f_io, float t_lo, float t_hi, bool enforce_t_bounds) noexcept { if (!(f_io < fmaf(bestF, adaptive_coeff, 0.0f))) return; const float f_start = f_io; const float c1 = 1e-4f; const float tau = 0.5f; const int max_outer_iters = static_cast<int>(50.0f * (1.0f + 0.65f * p)); const int max_backtrack = static_cast<int>(20.0f * (1.0f + 0.65f * p)); const float lbfgs_trigger = 0.6f; const int m_lbfgs = 9; const int max_lbfgs_iters = static_cast<int>(25.0f * (1.0f + 0.65f * p)); const float eps_lbfgs_curv = 1e-6f; const float eps_descent = 1e-12f; float eta = 2.0f / sqrtf(dim_f); auto computeGrad = [&](const float* q_in, const float x_in, const float y_in, float* grad_out, float& grad_norm2_out) noexcept { float phi_acc_local = 0.0f; int ii = 0; while (ii < n) { phi_acc_local = fmaf(q_in[ii], 1.0f, phi_acc_local); phi[ii] = phi_acc_local; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float acc_s = 0.0f; float acc_c = 0.0f; int kk = n - 1; while (kk >= 0) { const float Lk = cost.link_length(q_in, kk); acc_s = fmaf(Lk, s_arr[kk], acc_s); acc_c = fmaf(Lk, c_arr[kk], acc_c); sum_s[kk] = acc_s; sum_c[kk] = acc_c; --kk; } alignas(16) float prefix_tail_grad[FAST_STATE_DIM] = { 0.0f }; if (transitionMode && hasTransitionRefAngles && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta[FAST_STATE_DIM] = { 0.0f }; float phi_prefix = 0.0f; int jj = 0; while (jj < referenceAngleCount) { phi_prefix += q_in[jj]; prefix_delta[jj] = ManipCost::wrap_pi(phi_prefix - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta[jj]; prefix_tail_grad[jj] = tail; --jj; } } const float dx = fmaf(x_in, 1.0f, -cost.targetX); const float dy = fmaf(y_in, 1.0f, -cost.targetY); const float dist2 = fmaf(dx, dx, dy * dy); const float dist = sqrtf(dist2); const float inv_dist = (dist > 1.0e-20f) ? (1.0f / dist) : 0.0f; grad_norm2_out = 0.0f; int i = 0; while (i < n) { float gpen = 0.0f; float g_main = 0.0f; if (transitionMode) { const float dtheta = (i < referenceAngleCount) ? cost.wrapped_angle_delta(q_in, i) : 0.0f; gpen = 2.0f * cost.transitionEnergyWeight * dtheta; gpen = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_grad[i], gpen); g_main = 2.0f * cost.transitionCaptureWeight * fmaf(dx, -sum_s[i], dy * sum_c[i]); } else { const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen); g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist; } const float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i; } if (cost.variableLen) { int j = 0; while (j < n) { float gi = 0.0f; if (transitionMode) { float gpenL = 0.0f; if (hasTransitionRefLengths && j < referenceLengthCount) { gpenL = 2.0f * cost.transitionLengthEnergyWeight * (q_in[n + j] - ref_state[n + j]); } gi = 2.0f * cost.transitionCaptureWeight * fmaf(dx, c_arr[j], dy * s_arr[j]) + gpenL; } else { gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist; } grad_out[n + j] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++j; } } }; auto armijoLineSearch = [&](const float* q_base, const float f_base, const float* dir, const float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, unsigned& idx_out) noexcept -> bool { float alpha = alpha_io; int backtrack = 0; if (needsProjectedCandidates) { while (backtrack < max_backtrack) { int i = 0; while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; } clampPointProjected(q_out); float x2 = 0.0f; float y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; const bool feasible_try = cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try); if (!feasible_try || !(val_try == val_try)) { alpha *= tau; ++backtrack; continue; } const float t_try = clamp01(map.pointToT(q_out)); if (enforce_t_bounds) { if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha *= tau; ++backtrack; continue; } } else { if (fabsf(t_try - t_lo) > T_EPS) { alpha *= tau; ++backtrack; continue; } } } if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; } alpha *= tau; ++backtrack; } } else { while (backtrack < max_backtrack) { int i = 0; while (i < dim) { q_out[i] = fmaf(alpha, dir[i], q_base[i]); ++i; } float x2 = 0.0f; float y2 = 0.0f; unsigned idx_try = 0u; float val_try = 0.0f; cost.evaluate_indexed(q_out, x2, y2, idx_try, val_try); const float t_try = clamp01(map.pointToT(q_out)); if (enforce_t_bounds) { if (t_hi - t_lo > 2.0f * T_EPS) { if (!(t_try > t_lo + T_EPS && t_try < t_hi - T_EPS)) { alpha *= tau; ++backtrack; continue; } } else { if (fabsf(t_try - t_lo) > T_EPS) { alpha *= tau; ++backtrack; continue; } } } if (val_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = val_try; x_out = x2; y_out = y2; idx_out = idx_try; return true; } alpha *= tau; ++backtrack; } } return false; }; bool lbfgs_already_tried = false; int outer = 0; while (outer < max_outer_iters) { float grad[FAST_STATE_DIM]; float grad_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, grad, grad_norm2); if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1.0e-12f) break; float dir_gd[FAST_STATE_DIM]; int i = 0; while (i < dim) { dir_gd[i] = -grad[i]; ++i; } const float gtd_gd = -grad_norm2; float eta_trial = eta; float f_new = f_io; float x_new = x_io; float y_new = y_io; unsigned idx_new = idx_io; const bool found = armijoLineSearch(q_inout, f_io, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, idx_new); if (!found) break; std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_new; x_io = x_new; y_io = y_new; idx_io = idx_new; eta = eta_trial; const float rel_impr = (f_start > 1.0e-20f) ? ((f_start - f_io) / f_start) : 0.0f; if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) { lbfgs_already_tried = true; float q_resume[FAST_STATE_DIM]; std::memcpy(q_resume, q_inout, static_cast<size_t>(dim) * sizeof(float)); const float f_resume = f_io; const float x_resume = x_io; const float y_resume = y_io; const unsigned idx_resume = idx_io; const float eta_resume = eta; float q_best_lbfgs[FAST_STATE_DIM]; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); float f_best_lbfgs = f_io; float x_best_lbfgs = x_io; float y_best_lbfgs = y_io; unsigned idx_best_lbfgs = idx_io; float s_hist[9][FAST_STATE_DIM]; float y_hist[9][FAST_STATE_DIM]; float rho_hist[9]; float alpha_hist[9]; int hist_size = 0; float gk[FAST_STATE_DIM]; float gk_norm2 = 0.0f; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); bool lbfgs_ok = true; float alpha_k = 2.0f / sqrtf(dim_f); int it_lbfgs = 0; while (it_lbfgs < max_lbfgs_iters) { if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1.0e-12f) break; float dir[FAST_STATE_DIM]; if (hist_size == 0) { int d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; } } else { float q_vec[FAST_STATE_DIM]; int d = 0; while (d < dim) { q_vec[d] = gk[d]; ++d; } for (int jj = hist_size - 1; jj >= 0; --jj) { float dot_sq = 0.0f; int d2 = 0; while (d2 < dim) { dot_sq = fmaf(s_hist[jj][d2], q_vec[d2], dot_sq); ++d2; } const float a_coeff = dot_sq * rho_hist[jj]; alpha_hist[jj] = a_coeff; d2 = 0; while (d2 < dim) { q_vec[d2] = fmaf(-a_coeff, y_hist[jj][d2], q_vec[d2]); ++d2; } } float gamma = 1.0f; { const int last = hist_size - 1; float yy = 0.0f; int d2 = 0; while (d2 < dim) { yy = fmaf(y_hist[last][d2], y_hist[last][d2], yy); ++d2; } const float ys = 1.0f / rho_hist[last]; if (yy > 0.0f) gamma = ys / yy; } float r_vec[FAST_STATE_DIM]; d = 0; while (d < dim) { r_vec[d] = gamma * q_vec[d]; ++d; } for (int jj = 0; jj < hist_size; ++jj) { float dot_yr = 0.0f; int d2 = 0; while (d2 < dim) { dot_yr = fmaf(y_hist[jj][d2], r_vec[d2], dot_yr); ++d2; } const float b_coeff = dot_yr * rho_hist[jj]; const float coeff = alpha_hist[jj] - b_coeff; d2 = 0; while (d2 < dim) { r_vec[d2] = fmaf(coeff, s_hist[jj][d2], r_vec[d2]); ++d2; } } d = 0; while (d < dim) { dir[d] = -r_vec[d]; ++d; } } float gtd = 0.0f; int d = 0; while (d < dim) { gtd = fmaf(gk[d], dir[d], gtd); ++d; } if (!(gtd == gtd) || gtd >= -eps_descent) { d = 0; while (d < dim) { dir[d] = -gk[d]; ++d; } gtd = -gk_norm2; if (gtd >= -eps_descent) { lbfgs_ok = false; break; } } float q_old[FAST_STATE_DIM]; std::memcpy(q_old, q_inout, static_cast<size_t>(dim) * sizeof(float)); float g_old[FAST_STATE_DIM]; std::memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float)); float alpha_try = alpha_k; float f_try = f_io; float x_try = x_io; float y_try = y_io; unsigned idx_try = idx_io; const bool step_ok = armijoLineSearch(q_inout, f_io, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, idx_try); if (!step_ok) { lbfgs_ok = false; break; } std::memcpy(q_inout, q_try, static_cast<size_t>(dim) * sizeof(float)); f_io = f_try; x_io = x_try; y_io = y_try; idx_io = idx_try; alpha_k = alpha_try; computeGrad(q_inout, x_io, y_io, gk, gk_norm2); if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; } if (f_io < f_best_lbfgs) { f_best_lbfgs = f_io; std::memcpy(q_best_lbfgs, q_inout, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x_io; y_best_lbfgs = y_io; idx_best_lbfgs = idx_io; } float ys = 0.0f; float s_new[FAST_STATE_DIM]; float y_new[FAST_STATE_DIM]; d = 0; while (d < dim) { const float sd = q_inout[d] - q_old[d]; const float yd = gk[d] - g_old[d]; s_new[d] = sd; y_new[d] = yd; ys = fmaf(yd, sd, ys); ++d; } if (!(ys == ys) || ys <= eps_lbfgs_curv) { hist_size = 0; } else { if (hist_size < m_lbfgs) { const int idx_hist = hist_size; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } ++hist_size; } else { for (int jj = 0; jj < m_lbfgs - 1; ++jj) { rho_hist[jj] = rho_hist[jj + 1]; d = 0; while (d < dim) { s_hist[jj][d] = s_hist[jj + 1][d]; y_hist[jj][d] = y_hist[jj + 1][d]; ++d; } } const int idx_hist = m_lbfgs - 1; rho_hist[idx_hist] = 1.0f / ys; d = 0; while (d < dim) { s_hist[idx_hist][d] = s_new[d]; y_hist[idx_hist][d] = y_new[d]; ++d; } hist_size = m_lbfgs; } } if (gk_norm2 < 1.0e-12f) break; ++it_lbfgs; } if (lbfgs_ok) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; break; } else { if (f_best_lbfgs < f_resume) { std::memcpy(q_inout, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f_io = f_best_lbfgs; x_io = x_best_lbfgs; y_io = y_best_lbfgs; idx_io = idx_best_lbfgs; } else { std::memcpy(q_inout, q_resume, static_cast<size_t>(dim) * sizeof(float)); f_io = f_resume; x_io = x_resume; y_io = y_resume; idx_io = idx_resume; } eta = eta_resume; } } ++outer; } }; auto evaluate_trial_from_q = [&](const float* q_seed, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { TrialPoint tr{}; std::memcpy(tr.q, q_seed, static_cast<size_t>(dim) * sizeof(float)); tr.feasible = cost.evaluate_indexed(tr.q, tr.x, tr.y, tr.idx, tr.f); if (tr.feasible) { refine_trial(tr.q, tr.x, tr.y, tr.idx, tr.f, t_lo, t_hi, enforce_t_bounds); } tr.t = clamp01(map.pointToT(tr.q)); if (enforce_t_bounds) { if (tr.t < t_lo) tr.t = t_lo; if (tr.t > t_hi) tr.t = t_hi; } update_best_from_trial(tr); return tr; }; auto evaluate_trial_from_t = [&](float t_base, float t_lo, float t_hi, bool enforce_t_bounds) noexcept -> TrialPoint { alignas(16) float q_seed[FAST_STATE_DIM]; map.map01ToPoint(t_base, q_seed); return evaluate_trial_from_q(q_seed, t_lo, t_hi, enforce_t_bounds); }; auto make_interval_from_trials = [&](const TrialPoint& Lp, const TrialPoint& Rp) noexcept -> IntervalND* { if (!(Lp.t + T_EPS < Rp.t)) return nullptr; IntervalND* I = new IntervalND(Lp.t, Rp.t, Lp.f, Rp.f, Lp.idx, Rp.idx); finalize_interval_geometry(I); return I; }; auto find_container_index = [&](float t) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } if (best_pos >= 0) return best_pos; for (size_t i = 0u; i < H.size(); ++i) { IntervalND* I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } } return best_pos; }; auto inject_trial_into_queue = [&](const TrialPoint& tr, float r_eff_cur, float adaptive_coeff_local) noexcept { if (H.empty()) return; const int pos = find_container_index(tr.t); if (pos < 0) return; IntervalND* src = H[static_cast<size_t>(pos)]; if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) return; const float src_len = fmaf(src->x2, 1.0f, -src->x1); IntervalND* L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_local, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto find_container_index_bounds = [&](const float t, float& lo, float& hi) noexcept -> int { int best_pos = -1; float best_len = FLT_MAX; size_t i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t > I->x1 + T_EPS && t < I->x2 - T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } if (best_pos < 0) { i = 0u; while (i < H.size()) { IntervalND* const I = H[i]; if (t >= I->x1 - T_EPS && t <= I->x2 + T_EPS) { const float len = fmaf(I->x2, 1.0f, -I->x1); if (len < best_len) { best_len = len; best_pos = static_cast<int>(i); } } ++i; } } if (best_pos >= 0) { const IntervalND* const I = H[static_cast<size_t>(best_pos)]; lo = I->x1; hi = I->x2; } else { lo = 0.0f; hi = 1.0f; } return best_pos; }; auto inject_trial_into_queue_fast = [&](const TrialPoint& tr, const int pos, float r_eff_cur, float adaptive_coeff_local) noexcept { if (pos < 0 || static_cast<size_t>(pos) >= H.size()) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_local); return; } IntervalND* const src = H[static_cast<size_t>(pos)]; if (!(tr.t > src->x1 + T_EPS && tr.t < src->x2 - T_EPS)) { inject_trial_into_queue(tr, r_eff_cur, adaptive_coeff_local); return; } const float src_len = fmaf(src->x2, 1.0f, -src->x1); IntervalND* const L = new IntervalND(src->x1, tr.t, src->y1, tr.f, src->idx1, tr.idx); IntervalND* const Rv = new IntervalND(tr.t, src->x2, tr.f, src->y2, tr.idx, src->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float prevMmax = Mmax; H[static_cast<size_t>(pos)] = H.back(); H.pop_back(); delete src; if (!H.empty()) std::make_heap(H.begin(), H.end(), ComparePtrND); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float m_cur = fmaf(r_eff_cur, Mmax, 0.0f); L->ChangeCharacteristic(m_cur); Rv->ChangeCharacteristic(m_cur); H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_local, prevMmax, 0.0f)) recompute_heap_constM(m_cur); if (src_len + T_EPS >= dmax) recompute_dmax(); }; auto progress_outgoing = [&]() noexcept { for (auto itReq = g_pendingMulti.begin(); itReq != g_pendingMulti.end();) { if (itReq->req.test()) itReq = g_pendingMulti.erase(itReq); else ++itReq; } for (auto itReq = g_pendingBest.begin(); itReq != g_pendingBest.end();) { if (itReq->req.test()) itReq = g_pendingBest.erase(itReq); else ++itReq; } }; auto select_hj_seed = [&](const AlignedFloatVec*& seed_vec, float& seed_f, unsigned& seed_idx, float& seed_x, float& seed_y) noexcept -> bool { auto take_indexed = [&]() noexcept -> bool { if (bestQIndexed.empty()) return false; seed_vec = &bestQIndexed; seed_f = bestIndexValue; seed_idx = bestIndexFound; seed_x = bestIndexedX; seed_y = bestIndexedY; return true; }; auto take_feasible = [&]() noexcept -> bool { if (bestQ.empty()) return false; seed_vec = &bestQ; seed_f = bestF; seed_idx = fullConstraintIndex; seed_x = bestX; seed_y = bestY; return true; }; auto take_reference = [&]() noexcept -> bool { if (!haveTransitionRefSeed) return false; seed_vec = &transitionReferenceSeed; seed_f = transitionRefF; seed_idx = transitionRefIdx; seed_x = transitionRefX; seed_y = transitionRefY; return true; }; seed_vec = nullptr; seed_f = FLT_MAX; seed_idx = 0u; seed_x = 0.0f; seed_y = 0.0f; if (transitionMode) return take_feasible() || take_reference() || take_indexed(); return take_indexed() || take_feasible(); }; auto should_run_hj_this_iter = [&](float progress) noexcept -> bool { if (!hj_cfg.enabled) return false; if (hj_cfg.require_adaptive_flag && !adaptive) return false; int period = 1; int phase = 0; select_hj_iteration_schedule(progress, period, phase); if (period <= 1) return true; return (it % period) == phase; }; auto run_hooke_jeeves_iteration = [&](float r_eff_cur, float adaptive_coeff_local, float progress) noexcept { if (H.empty()) return; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return; TrialPoint base{}; std::memcpy(base.q, seedQ->data(), static_cast<size_t>(dim) * sizeof(float)); base.t = clamp01(map.pointToT(base.q)); base.f = seedF; base.idx = seedIdx; base.x = seedX; base.y = seedY; base.feasible = (base.idx == fullConstraintIndex); auto evaluate_hj_candidate = [&](const float* q_candidate) noexcept -> TrialPoint { const float t_guess = clamp01(map.pointToT(q_candidate)); float lo = 0.0f; float hi = 1.0f; const int pos = find_container_index_bounds(t_guess, lo, hi); const TrialPoint tr = evaluate_trial_from_q(q_candidate, lo, hi, true); inject_trial_into_queue_fast(tr, pos, r_eff_cur, adaptive_coeff_local); return tr; }; apply_hj_coord_profile(progress); const int hj_order_mode = select_hj_order_mode(progress); const int* const hj_order = select_hj_order_ptr(hj_order_mode); TrialPoint current = base; TrialPoint base_before = base; bool improved_any = false; int ord = 0; while (ord < dim) { const int d = hj_order[ord]; const float step = hj_delta[d]; const TrialPoint axis_base = current; auto try_direction = [&](int sgn, TrialPoint& accepted_tp) noexcept -> bool { float q_candidate[FAST_STATE_DIM]; std::memcpy(q_candidate, axis_base.q, static_cast<size_t>(dim) * sizeof(float)); q_candidate[d] = fmaf(static_cast<float>(sgn), step, q_candidate[d]); if (needsProjectedCandidates) clampPointProjected(q_candidate); if (fabsf(q_candidate[d] - axis_base.q[d]) <= 1.0e-12f) return false; const TrialPoint tp = evaluate_hj_candidate(q_candidate); if (better_indexed(tp.idx, tp.f, axis_base.idx, axis_base.f)) { accepted_tp = tp; return true; } return false; }; const int first_sign = hj_cfg.plus_first ? +1 : -1; const int second_sign = -first_sign; bool improved_this_axis = false; TrialPoint best_axis_tp{}; bool have_axis_tp = false; TrialPoint tp_try{}; if (try_direction(first_sign, tp_try)) { best_axis_tp = tp_try; have_axis_tp = true; if (hj_cfg.opportunistic_axis) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (!hj_cfg.opportunistic_axis || !have_axis_tp) { if (try_direction(second_sign, tp_try)) { if (!have_axis_tp || better_indexed(tp_try.idx, tp_try.f, best_axis_tp.idx, best_axis_tp.f)) { best_axis_tp = tp_try; have_axis_tp = true; } } if (have_axis_tp) { current = best_axis_tp; improved_any = true; improved_this_axis = true; } } if (improved_this_axis && hj_cfg.expand_delta_on_success) { hj_delta[d] = fmaxf(hj_cfg.min_delta, hj_delta[d] * hj_cfg.expand_factor); } ++ord; } if (improved_any) { if (hj_cfg.use_pattern) { float q_pattern[FAST_STATE_DIM]; for (int d = 0; d < dim; ++d) { q_pattern[d] = fmaf(hj_cfg.pattern_gain, current.q[d] - base_before.q[d], current.q[d]); } if (needsProjectedCandidates) clampPointProjected(q_pattern); TrialPoint pat_tp = evaluate_hj_candidate(q_pattern); if (better_indexed(pat_tp.idx, pat_tp.f, current.idx, current.f)) { current = pat_tp; } } } else { for (int d = 0; d < dim; ++d) { hj_delta[d] = fmaxf(hj_delta[d] * hj_cfg.shrink_factor, hj_cfg.min_delta); } } }; auto ensure_queue_nonempty = [&](float r_eff_cur) noexcept -> bool { if (!H.empty()) return true; const AlignedFloatVec* seedQ = nullptr; float seedF = FLT_MAX; float seedX = 0.0f; float seedY = 0.0f; unsigned seedIdx = 0u; if (!select_hj_seed(seedQ, seedF, seedIdx, seedX, seedY)) return false; const float t_center = clamp01(map.pointToT(seedQ->data())); const float half = fmaxf(1.0e-4f * dim_f, 1.0e-6f); const float t1b = clamp01(t_center - half); const float t2b = clamp01(t_center + half); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) return false; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff_cur, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); recompute_dmax(); return true; }; while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg dummy; g_world->recv(boost::mpi::any_source, 2, dummy); } const int seedStride = FAST_SEED_STRIDE; AlignedFloatVec seeds(static_cast<size_t>(FAST_MAX_SEEDS) * static_cast<size_t>(seedStride), 0.0f); const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds.data(), seedStride, static_cast<unsigned>(fmaf(static_cast<float>(g_world->rank()), 7919.0f, static_cast<float>(seed)))); const int K_raw = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)); const int K = (K_raw > 2) ? K_raw : 2; H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + static_cast<size_t>(seedCnt) + 32u); const int rank = g_world->rank(); const int world = g_world->size(); const size_t comm_levels = std::bit_width(static_cast<size_t>(world - 1)); for (int i = 0; i < seedCnt; ++i) { const float* s = seeds.data() + static_cast<size_t>(i) * static_cast<size_t>(seedStride); const float t_seed = clamp01(map.pointToT(s)); const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1b = clamp01(fmaf(-interval_size, 1.0f, t_seed)); const float t2b = clamp01(fmaf(interval_size, 1.0f, t_seed)); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) { I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); } else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(fmaxf(static_cast<float>(seedCnt - 4), 1.0f))) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } std::vector<float> grid_base; grid_base.reserve(static_cast<size_t>(K) + 2u); grid_base.emplace_back(a); for (int k = 1; k <= K; ++k) { const float t = clamp01(fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)))); grid_base.emplace_back(t); } grid_base.emplace_back(b); std::vector<TrialPoint> grid_trials(grid_base.size()); for (size_t i = 0u; i < grid_base.size(); ++i) { float lo = grid_base[i]; float hi = grid_base[i]; if (i == 0u) { lo = hi = a; } else if (i + 1u == grid_base.size()) { lo = hi = b; } else { lo = 0.5f * (grid_base[i - 1u] + grid_base[i]); hi = 0.5f * (grid_base[i] + grid_base[i + 1u]); } grid_trials[i] = evaluate_trial_from_t(grid_base[i], lo, hi, true); } for (size_t i = 1u; i < grid_trials.size(); ++i) { IntervalND* I = make_interval_from_trials(grid_trials[i - 1u], grid_trials[i]); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); if (haveTransitionRefSeed) { const TrialPoint ref_tp = evaluate_trial_from_q(transitionReferenceSeed.data(), 0.0f, 1.0f, false); if (!H.empty()) { inject_trial_into_queue(ref_tp, r, 1.0f); recompute_dmax(); } else { ensure_queue_nonempty(r); } } g_world->barrier(); while (true) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); stag_r_multiplier = 1.4f - 1.1f * fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f); const float p_arg = fmaf(p, 2.3f, -2.9775f); float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; --stag_boost_remaining; } const float exp_arg = fmaf(B_dim, p, 0.0f); const float exp_arg__ = fmaf(B_dim__, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); const float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)); const float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)); const float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)); const float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)); const float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)); const float xx = p * p; const float tt = fmaf(500.0f, p, -486.95472f); const float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f); adaptive_coeff__ = fmaf(fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__); float grad_norm2_best = 0.0f; if (no_improve > 0 && !bestQ.empty()) { float acc_best = 0.0f; float phi_best[FAST_STATE_DIM]; int ii_best = 0; while (ii_best < n) { acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best); phi_best[ii_best] = acc_best; ++ii_best; } float s_best[FAST_STATE_DIM]; float c_best[FAST_STATE_DIM]; FABE13_SINCOS(phi_best, s_best, c_best, n); float as_best = 0.0f; float ac_best = 0.0f; float sum_s_best[FAST_STATE_DIM]; float sum_c_best[FAST_STATE_DIM]; int k_best = n - 1; while (k_best >= 0) { const float Lk = cost.variableLen ? bestQ[n + k_best] : cost.fixedLength; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; } alignas(16) float prefix_tail_best[FAST_STATE_DIM] = { 0.0f }; if (transitionMode && hasTransitionRefAngles && cost.transitionPrefixEnergyWeight > 0.0f) { alignas(16) float prefix_delta_best[FAST_STATE_DIM] = { 0.0f }; float phi_pref = 0.0f; int jj = 0; while (jj < referenceAngleCount) { phi_pref += bestQ[jj]; prefix_delta_best[jj] = ManipCost::wrap_pi(phi_pref - cost.referencePrefixAngles[jj]); ++jj; } float tail = 0.0f; jj = referenceAngleCount - 1; while (jj >= 0) { tail += prefix_delta_best[jj]; prefix_tail_best[jj] = tail; --jj; } } const float dx_best = fmaf(bestX, 1.0f, -cost.targetX); const float dy_best = fmaf(bestY, 1.0f, -cost.targetY); const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)); const float inv_dist_best = (dist_best > 1.0e-20f) ? (1.0f / dist_best) : 0.0f; int i_best = 0; while (i_best < n) { float gpen_best = 0.0f; float g_main_best = 0.0f; if (transitionMode) { const float dtheta_best = (i_best < referenceAngleCount) ? cost.wrapped_angle_delta(bestQ.data(), i_best) : 0.0f; gpen_best = 2.0f * cost.transitionEnergyWeight * dtheta_best; gpen_best = fmaf(2.0f * cost.transitionPrefixEnergyWeight, prefix_tail_best[i_best], gpen_best); g_main_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]); } else { const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f); const float exp_arg_g = -tsg; const float exp_val = fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, fmaf(exp_arg_g, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / (exp_val + 1.0f); gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best); g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best; } const float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best; } if (cost.variableLen) { int j_best = 0; while (j_best < n) { float gi_best = 0.0f; if (transitionMode) { float gpenL_best = 0.0f; if (hasTransitionRefLengths && j_best < referenceLengthCount) { gpenL_best = 2.0f * cost.transitionLengthEnergyWeight * (bestQ[n + j_best] - ref_state[n + j_best]); } gi_best = 2.0f * cost.transitionCaptureWeight * fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) + gpenL_best; } else { gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best; } grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++j_best; } } } const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < 0.5e-1f); const float r_eff = (dim > 2) ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f); if (stagnation) { stag_boost_remaining = n_stag_iters; float t_seeds[32]; int seed_count = 0; if (hasTransitionRefAngles) { alignas(16) float q_center[FAST_STATE_DIM]; std::memcpy(q_center, ref_state, static_cast<size_t>(dim) * sizeof(float)); t_seeds[seed_count++] = clamp01(map.pointToT(q_center)); if (!bestQ.empty() && seed_count < 32) { t_seeds[seed_count++] = clamp01(map.pointToT(bestQ.data())); } unsigned st_local = seed + static_cast<unsigned>(it) + 911u; const int local_variants = 2 + static_cast<int>(sqrtf(dim_f)); for (int v = 0; v < local_variants && seed_count < 32; ++v) { float q_temp[FAST_STATE_DIM]; std::memcpy(q_temp, q_center, static_cast<size_t>(dim) * sizeof(float)); st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd0 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int aidx = v % n; q_temp[aidx] = fmaf((2.0f * rnd0 - 1.0f), 0.25f * transitionMaxAngleStep, q_temp[aidx]); if (cost.variableLen && hasTransitionRefLengths) { st_local ^= st_local << 13; st_local ^= st_local >> 17; st_local ^= st_local << 5; const float rnd1 = static_cast<float>(st_local & 0xFFFFFFu) * 5.9604645e-8f; const int lidx = v % n; q_temp[n + lidx] = fmaf((2.0f * rnd1 - 1.0f), 0.25f * cost.max_length_step(lidx), q_temp[n + lidx]); } clampPointProjected(q_temp); t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else { const int num_ik = 1 + static_cast<int>(sqrtf(dim_f)); const float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY); float max_reach = 0.0f; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) max_reach += map.high[n + ii]; } else { max_reach = static_cast<float>(n) * cost.fixedLength; } const float ratio = dist_to_target / fmaxf(max_reach, 1.0e-20f); const bool prefer_extended = (ratio > 0.7f); const bool prefer_compact = (ratio < 0.4f); const bool use_ik = !(ratio > 0.4f && ratio < 0.7f); if (!use_ik) { AlignedFloatVec temp_S( static_cast<size_t>(FAST_MAX_SEEDS) * static_cast<size_t>(FAST_SEED_STRIDE), 0.0f); const int sobol_gen = generate_sobol_seeds_limited( map, dim, temp_S.data(), FAST_SEED_STRIDE, seed + static_cast<unsigned>(it), FAST_MAX_SEEDS); const int num_sobol = num_ik; for (int kk = 0; kk < num_sobol && kk < sobol_gen && seed_count < 32; ++kk) { const float* s = temp_S.data() + static_cast<size_t>(kk) * static_cast<size_t>(FAST_SEED_STRIDE); t_seeds[seed_count++] = clamp01(map.pointToT(s)); } } else { float angles_ccd[32] = { 0.0f }; float lengths_ccd[32]; if (cost.variableLen) { const float len_low = map.low[n]; const float len_high = map.high[n]; const float avg_len = 0.5f * (len_low + len_high); for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = avg_len; } else { for (int ii = 0; ii < n; ++ii) lengths_ccd[ii] = cost.fixedLength; } ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10); float angles_fabrik[32] = { 0.0f }; float lengths_fabrik[32]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = lengths_ccd[ii]; } else { for (int ii = 0; ii < n; ++ii) lengths_fabrik[ii] = cost.fixedLength; } float targetX_fab = cost.targetX; float targetY_fab = cost.targetY; for (int iter_fab = 0; iter_fab < 3; ++iter_fab) { float prevX = targetX_fab; float prevY = targetY_fab; for (int j = n - 1; j >= 0; --j) { const float len = lengths_fabrik[j]; const float angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val, c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; } } if (prefer_extended) { { float q_ccd[FAST_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_ccd[ii] = angles_ccd[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_ccd[n + ii] = lengths_ccd[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_ccd)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_ccd[ii] + (2.0f * rnd - 1.0f) * 0.1f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_ccd[ii] + (2.0f * rnd - 1.0f) * 0.05f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[FAST_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } else if (prefer_compact) { { float q_fabrik[FAST_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_fabrik[ii] = angles_fabrik[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_fabrik[n + ii] = lengths_fabrik[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_fabrik)); } unsigned st_ik = seed + static_cast<unsigned>(it) + 222u; const int remaining = num_ik - 1; for (int v = 0; v < remaining && seed_count < 32; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_angles[ii] = angles_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.06f; const float lo = (ii == 0) ? -1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[ii] < lo) noisy_angles[ii] = lo; if (noisy_angles[ii] > hi) noisy_angles[ii] = hi; } if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; const float rnd = static_cast<float>(st_ik & 0xFFFFFFu) * 5.9604645e-8f; noisy_lengths[ii] = lengths_fabrik[ii] + (2.0f * rnd - 1.0f) * 0.03f; if (noisy_lengths[ii] < 0.5f) noisy_lengths[ii] = 0.5f; if (noisy_lengths[ii] > 2.0f) noisy_lengths[ii] = 2.0f; } } float q_temp[FAST_STATE_DIM]; for (int ii = 0; ii < n; ++ii) q_temp[ii] = noisy_angles[ii]; if (cost.variableLen) { for (int ii = 0; ii < n; ++ii) q_temp[n + ii] = noisy_lengths[ii]; } t_seeds[seed_count++] = clamp01(map.pointToT(q_temp)); } } } } for (int s = 0; s < seed_count; ++s) { const float t_center = t_seeds[s]; const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f); const float t1b = clamp01(t_center - 0.5f * interval_size); const float t2b = clamp01(t_center + 0.5f * interval_size); const float mid = 0.5f * (t1b + t2b); const TrialPoint left_tp = evaluate_trial_from_t(t1b, t1b, mid, true); const TrialPoint right_tp = evaluate_trial_from_t(t2b, mid, t2b, true); IntervalND* I = make_interval_from_trials(left_tp, right_tp); if (I == nullptr) continue; update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, fmaf(0.01f, dim_f, 0.85f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } recompute_dmax(); no_improve = 0; } const float bestFOld = bestF; const bool local_idle = !ensure_queue_nonempty(r_eff); const bool do_hj_this_iter = should_run_hj_this_iter(p); if (!local_idle) { if (do_hj_this_iter) { run_hooke_jeeves_iteration(r_eff, adaptive_coeff_, p); } else { std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); const float tBase = step(m, x1, x2, y1, y2, dim_f, r_eff, cur->idx1, cur->idx2); const TrialPoint trNew = evaluate_trial_from_t(tBase, x1, x2, true); IntervalND* L = new IntervalND(x1, trNew.t, y1, trNew.f, cur->idx1, trNew.idx); IntervalND* Rv = new IntervalND(trNew.t, x2, trNew.f, y2, trNew.idx, cur->idx2); finalize_interval_geometry(L); finalize_interval_geometry(Rv); const float Mloc = fmaxf(L->M, Rv->M); const float prevMmax = Mmax; update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); m = fmaf(r_eff, Mmax, 0.0f); const float len1 = fmaf(trNew.t, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -trNew.t); if (fmaf(len1, 1.0f, len2) >= dmax - T_EPS) { dmax = fmaxf(len1, len2); for (auto* pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / fmaxf(dmax, 1.0e-20f)) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); if (sz > 0u) { if (!hasHardConstraints) RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); else RecomputeR_AffineM_Mixed_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > prevMmax && Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) { recompute_heap_constM(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); delete cur; } } if (!H.empty()) { _mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0); if (H.size() > 1u) _mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0); } if (!H.empty() && (stagnation || bestF < fmaf(fmaf(bestFOld, 0.6f, 0.0f), adaptive_coeff__, 0.0f)) && it - last_send_T >= send_interval_T) { last_send_T = it; progress_outgoing(); const size_t scan_count = H.size(); unsigned intervals_to_send = (dim < 12) ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u; if (intervals_to_send > scan_count) intervals_to_send = static_cast<unsigned>(scan_count); if (intervals_to_send > 0u) { const float alpha = 0.63f - sqrtf(fmaxf(-0.113f * (p - 1.0f), 0.0f)); const float beta = 1.0f - alpha; const float w_pos = 3.085f * (-sqrtf(fmaxf(0.007f * p, 0.0f)) + 1.0f); const float w_size = 5.085f - w_pos; const int num_bins = (p < 1.0f - 1.0e-9f) ? static_cast<int>(1.0f / (1.0f - p)) : 1; std::vector<float> pos_metric(scan_count); std::vector<float> size_metric(scan_count); std::vector<float> R_val(scan_count); float R_max = -FLT_MAX; float R_min = FLT_MAX; float size_max = -FLT_MAX; float size_min = FLT_MAX; float pos_max = -FLT_MAX; float pos_min = FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { IntervalND* I = H[idx]; const float center = 0.5f * (I->x1 + I->x2); const float len = fmaf(I->x2, 1.0f, -I->x1); const float size = len * static_cast<float>(1 << I->span_level); if (p < 0.95f && num_bins > 1) { float bin = floorf(center * static_cast<float>(num_bins)); if (bin >= static_cast<float>(num_bins)) bin = static_cast<float>(num_bins - 1); pos_metric[idx] = bin / static_cast<float>(num_bins - 1); } else { pos_metric[idx] = center; } size_metric[idx] = size; R_val[idx] = I->R; if (pos_metric[idx] > pos_max) pos_max = pos_metric[idx]; if (pos_metric[idx] < pos_min) pos_min = pos_metric[idx]; if (size > size_max) size_max = size; if (size < size_min) size_min = size; if (I->R > R_max) R_max = I->R; if (I->R < R_min) R_min = I->R; } auto norm = [](float val, float minv, float maxv) noexcept -> float { const float denom = maxv - minv; if (fabsf(denom) <= 1.0e-20f) return 0.0f; return (val - minv) / denom; }; std::vector<float> R_norm(scan_count); std::vector<float> pos_norm(scan_count); std::vector<float> size_norm(scan_count); for (size_t idx = 0u; idx < scan_count; ++idx) { R_norm[idx] = norm(R_val[idx], R_min, R_max); pos_norm[idx] = norm(pos_metric[idx], pos_min, pos_max); size_norm[idx] = norm(size_metric[idx], size_min, size_max); } std::vector<int> selected; selected.reserve(intervals_to_send); std::vector<unsigned char> used(scan_count, 0u); selected.push_back(0); used[0] = true; while (selected.size() < intervals_to_send) { int best_idx = -1; float best_score = -FLT_MAX; for (size_t idx = 0u; idx < scan_count; ++idx) { if (used[idx] != 0u) continue; float min_dist = FLT_MAX; for (int s : selected) { float d2 = 0.0f; float d = pos_norm[idx] - pos_norm[static_cast<size_t>(s)]; d2 += w_pos * d * d; d = size_norm[idx] - size_norm[static_cast<size_t>(s)]; d2 += w_size * d * d; if (d2 < min_dist) min_dist = d2; } const float novelty = sqrtf(fmaxf(min_dist, 0.0f)) / sqrtf(fmaxf(w_pos + w_size, 1.0e-20f)); const float score = alpha * R_norm[idx] + beta * novelty; if (score > best_score) { best_score = score; best_idx = static_cast<int>(idx); } } if (best_idx < 0) break; selected.push_back(best_idx); used[static_cast<size_t>(best_idx)] = 1u; } MultiCrossMsg out{}; out.count = static_cast<unsigned>(selected.size()); float* dest = out.intervals; for (unsigned s = 0u; s < out.count; ++s) { IntervalND* Tt = H[static_cast<size_t>(selected[s])]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; } const size_t iterations = comm_levels; bool active = true; const bool invert_T = ((static_cast<int>(exchange_counter_T) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_pendingMulti.emplace_back(*g_world, partner, out); if (g_pendingMulti.size() > MAX_INFLIGHT_MULTI) { g_pendingMulti.front().req.wait(); g_pendingMulti.pop_front(); } active = false; } } ++ii2; } ++exchange_counter_T; } } const bool indexedBestImprovedSinceLastSend = !bestQIndexed.empty() && better_indexed(bestIndexFound, bestIndexValue, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF); if (indexedBestImprovedSinceLastSend && it - last_send_best >= send_interval_best) { last_send_best = it; progress_outgoing(); const size_t iterations = comm_levels; bool active = true; const bool invert_best = ((static_cast<int>(exchange_counter) + 1) & 1) != 0; size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_best); if (am_sender) { BestSolutionMsg outMsg; InitBestSolutionMsg(outMsg); if (FillBestSolutionMsg(outMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed) && better_indexed(outMsg.bestIndex, outMsg.bestF, lastSentBestMsg.bestIndex, lastSentBestMsg.bestF)) { g_pendingBest.emplace_back(*g_world, partner, outMsg); lastSentBestMsg = outMsg; } if (g_pendingBest.size() > MAX_INFLIGHT_BEST) { g_pendingBest.front().req.wait(); g_pendingBest.pop_front(); } active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); for (unsigned ii = 0u; ii < in.count; ++ii) { const float* d = &in.intervals[ii * 5u]; const float sx = d[0]; const float ex = d[2]; alignas(16) float ql[FAST_STATE_DIM]; float tx = 0.0f; float ty = 0.0f; unsigned v1i = 0u; unsigned v2i = 0u; float y1i = 0.0f; float y2i = 0.0f; map.map01ToPoint(sx, ql); cost.evaluate_indexed(ql, tx, ty, v1i, y1i); map.map01ToPoint(ex, ql); cost.evaluate_indexed(ql, tx, ty, v2i, y2i); IntervalND* inj = new IntervalND(sx, ex, y1i, y2i, v1i, v2i); finalize_interval_geometry(inj); const float prevMmax = Mmax; const float candidateMmax = fmaxf(Mmax, inj->M); const float m_inj = fmaf(r_eff, candidateMmax, 0.0f); inj->ChangeCharacteristic(m_inj); if (H.empty() || inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || stagnation) { update_pockets_and_Mmax(inj); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); if (Mmax > fmaf(adaptive_coeff_, prevMmax, 0.0f)) { recompute_heap_constM(fmaf(r_eff, Mmax, 0.0f)); } } else { delete inj; } } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const float interval_len = H.empty() ? 0.0f : ((dim > 1) ? fmaf(H.front()->x2, 1.0f, -H.front()->x1) : H.front()->diam); if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter - 1) { for (auto& s : g_pendingMulti) s.req.wait(); for (auto& s : g_pendingBest) s.req.wait(); g_pendingMulti.clear(); g_pendingBest.clear(); while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg incoming; g_world->recv(boost::mpi::any_source, 2, incoming); UpdateIndexedBestFromMessage( incoming, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } g_world->barrier(); BestSolutionMsg bestMsg; InitBestSolutionMsg(bestMsg); if (!bestQIndexed.empty()) { FillBestSolutionMsg(bestMsg, bestIndexFound, bestIndexValue, bestIndexedX, bestIndexedY, bestQIndexed); } const size_t iterations = comm_levels; bool active = true; size_t itx = 0u; while (itx < iterations && active) { const size_t step = 1ULL << itx; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = (rank & static_cast<int>(step)) != 0; if (am_sender) { g_world->send(partner, 3, bestMsg); active = false; } else { BestSolutionMsg inBest; InitBestSolutionMsg(inBest); g_world->recv(partner, 3, inBest); if (better_indexed(inBest.bestIndex, inBest.bestF, bestMsg.bestIndex, bestMsg.bestF)) { bestMsg = inBest; } } } ++itx; } if (rank == 0) { UpdateIndexedBestFromMessage( bestMsg, fullConstraintIndex, bestIndexFound, bestIndexValue, bestQIndexed, bestIndexedX, bestIndexedY, bestF, bestQ, bestX, bestY); } const bool haveFeasibleBest = (bestIndexFound == fullConstraintIndex) && !bestQ.empty(); if (haveFeasibleBest && !transitionMode) { alignas(16) float q_local[FAST_STATE_DIM]; std::memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float)); float x_final = bestX; float y_final = bestY; const float f_final = bestF; const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f_final; float saved = q_local[last]; float delta = 0.05f; while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2_loc = 0.0f; float y2_loc = 0.0f; unsigned idx2 = 0u; float val2 = 0.0f; const bool feasible2 = cost.evaluate_indexed(q_local, x2_loc, y2_loc, idx2, val2); if (feasible2 && val2 < bestLocF) { bestLocF = val2; x_final = x2_loc; y_final = y2_loc; saved = cand; } q_local[last] = backup; sgn += 2; } delta *= 0.5f; } if (bestLocF < f_final) { q_local[last] = saved; bestF = bestLocF; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } } else if (haveFeasibleBest) { bestIndexFound = fullConstraintIndex; bestIndexValue = bestF; bestQIndexed = bestQ; bestIndexedX = bestX; bestIndexedY = bestY; } else { bestQ = bestQIndexed; bestF = bestIndexValue; bestX = bestIndexedX; bestY = bestIndexedY; } out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } ++it; } }

Если хочешь, следующим сообщением я могу отдельно очень коротко перечислить именно те 6–8 мест, где быстрая версия ломала Transition, чтобы ты мог точечно проверить это у себя в профилировщике и тестах.

Compartilhar esta Q&A