"static __declspec(noalias) void agp_run_branch_mp...

Créé le : 5 mars 2026

Répondu en utilisant GPT-5.2 Thinking par Chat01

Question

"static declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept {
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12)
M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
// Converts a curve parameter t into an integer index by scaling it with
// map.scale (converted to float) and truncating toward zero.
//
// The seed-interval code computes t_seed - interval_size, which can fall
// below zero. Converting a negative or NaN float to an unsigned integer is
// undefined behaviour, so such inputs are mapped to index 0 instead.
// For every positive scaled value the result is identical to the plain cast.
auto t_to_idx = [&](float t) -> unsigned long long {
    // Single-rounding product t * scale (fmaf with a zero addend).
    const float scaled = fmaf(t, static_cast<float>(map.scale), 0.0f);
    // `!(scaled > 0)` is true for negatives, zero and NaN alike.
    if (!(scaled > 0.0f))
        return 0ull;
    return static_cast<unsigned long long>(scaled);
};
// Folds the estimate I->M of one interval into the per-span-level table
// M_by_span and then into the running maximum Mmax.
// The table slot is selected by I->span_level; each value only ever grows.
auto update_pockets_and_Mmax = [&](IntervalND* I) {
    const int level = I->span_level;
    float& pocket = M_by_span[level];
    // Raise the pocket for this span level if the interval reports a larger M.
    if (I->M > pocket) {
        pocket = I->M;
    }
    // Propagate the (possibly updated) pocket into the global maximum.
    if (pocket > Mmax) {
        Mmax = pocket;
    }
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = b - a;
const float initial_len = dmax;
const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f);
const float A_dim__ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f);
const float B_dim = fmaf(A_dim, 0.7f, 0.0f);
const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float log_argument__ = A_dim__ - 2.0f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
// --- Переменные для механизма стагнации с усилением исследования ---
int stag_boost_remaining = 0;
float stag_r_multiplier = 0.0f;

text
const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)); // количество итераций усиления auto evalAt = [&](const float t) -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); if (f < fmaf(bestF, adaptive_coeff, 0.0f)) { // --- значение функции в точке старта локальной доводки (для триггера 70%) --- const float f_start = f; // Параметры Armijo const float c1 = 1e-4f; // достаточное убывание const float tau = 0.5f; // коэффициент уменьшения шага const int max_outer_iters = (int)(50.0f * (1.0f + 0.65f * p)); const int max_backtrack = (int)(20.0f * (1.0f + 0.65f * p)); // --- L-BFGS параметры --- const float lbfgs_trigger = 0.6f; // 70% относительное улучшение const int m_lbfgs = 9; // память const int max_lbfgs_iters = (int)(25.0f * (1.0f + 0.65f * p)); const float eps_lbfgs_curv = 1e-6f; // порог кривизны ys const float eps_descent = 1e-12f; float eta = 1.0f; // шаг для GD (и как стартовый для line-search) // ------------------------------------------------------------ // Вспомогательное: clamp в допустимые границы // ------------------------------------------------------------ auto clampPoint = [&](float* q) { int i = 0;

#pragma loop ivdep
while (i < n) {
const float lo = (i == 0) ? -1.0471975511965977f : -2.6179938779914944f;
const float hi = 2.6179938779914944f;
if (q[i] < lo) q[i] = lo;
else if (q[i] > hi) q[i] = hi;
++i;
}
if (cost.variableLen) {
i = 0;
#pragma loop ivdep
while (i < n) {
if (q[n + i] < 0.5f) q[n + i] = 0.5f;
else if (q[n + i] > 2.0f) q[n + i] = 2.0f;
++i;
}
}
};

text
// ------------------------------------------------------------ // Вспомогательное: вычислить градиент // Вход: q_in, x_in, y_in // Выход: grad_out[dim], grad_norm2_out // ------------------------------------------------------------ auto computeGrad = [&](const float* q_in, const float x_in, const float y_in, float* grad_out, float& grad_norm2_out) { // ---- Вычисляем градиент в текущей точке q_in ---- float acc = 0.0f; int ii = 0;

#pragma loop ivdep
while (ii < n) {
acc = fmaf(q_in[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);

text
float as = 0.0f; float ac = 0.0f; int k = n - 1; while (k >= 0) { const float Lk = cost.variableLen ? q_in[n + k] : 1.0f; as = fmaf(Lk, s_arr[k], as); ac = fmaf(Lk, c_arr[k], ac); sum_s[k] = as; sum_c[k] = ac; --k; } const float dx = fmaf(x_in, 1.0f, -cost.targetX); const float dy = fmaf(y_in, 1.0f, -cost.targetY); const float dist = sqrtf(fmaf(dx, dx, dy * dy)); const float inv_dist = 1.0f / dist; grad_norm2_out = 0.0f; // Градиент по углам int i = 0;

#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
// Производная штрафа за превышение minTheta
{
const float ai = fabsf(q_in[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / cost.minTheta, v * 0.69314718f, 0.0f);
const float exp_val = fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float dpen = fmaf(cost.sharpW, exp_val * (1.38629436f / cost.minTheta), 0.0f);
gpen = fmaf(dpen, copysignf(1.0f, q_in[i]), gpen);
}
}
// Производная арк-штрафа
{
const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float sig = 1.0f / (exp_val + 1.0f);
gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen);
}

text
const float g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist; float gi = g_main + gpen; grad_out[i] = gi; grad_norm2_out = fmaf(gi, gi, grad_norm2_out); ++i; } // Градиент по длинам if (cost.variableLen) { int j = 0;

#pragma loop ivdep
while (j < n) {
const float gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist;
grad_out[n + j] = gi;
grad_norm2_out = fmaf(gi, gi, grad_norm2_out);
++j;
}
}
};

text
// ------------------------------------------------------------ // Вспомогательное: общий Armijo backtracking для направления dir // Условие: f_new <= f_cur + c1 * alpha * (g^T d) // Возвращает флаг clipped (было ли обрезание границ) через ссылку. // ------------------------------------------------------------ auto armijoLineSearch = [&](const float* q_base, const float f_base, const float x_base, const float y_base, const float* grad_base, const float* dir, const float gtd, float& alpha_io, float* q_out, float& f_out, float& x_out, float& y_out, bool& clipped) -> bool { float alpha = alpha_io; int backtrack = 0; while (backtrack < max_backtrack) { int i = 0;

#pragma loop ivdep
while (i < dim) {
q_out[i] = fmaf(alpha, dir[i], q_base[i]);
++i;
}

text
// Сохраняем копию до клиппинга float q_before_clamp[64]; memcpy(q_before_clamp, q_out, dim * sizeof(float)); // clamp как у тебя clampPoint(q_out); // Проверяем, был ли клиппинг clipped = false; i = 0;

#pragma loop ivdep
while (i < dim) {
if (fabsf(q_out[i] - q_before_clamp[i]) > 1e-12f) {
clipped = true;
break;
}
++i;
}

text
float x2, y2; float f_try = cost(q_out, x2, y2); // проверки на NaN/Inf if (!(f_try == f_try)) { // NaN alpha *= tau; ++backtrack; continue; } // Armijo if (f_try <= f_base + c1 * alpha * gtd) { alpha_io = alpha; f_out = f_try; x_out = x2; y_out = y2; return true; } alpha *= tau; ++backtrack; } return false; }; // ------------------------------------------------------------ // 1) Градиентный спуск // 2) При улучшении >= 70% -> L-BFGS, с fallback обратно в GD // ------------------------------------------------------------ bool lbfgs_already_tried = false; int outer = 0; while (outer < max_outer_iters) { // ---- Градиент в текущей точке ---- float grad[64]; float grad_norm2 = 0.0f; computeGrad(q_local, x, y, grad, grad_norm2); if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f) break; // ---- Направление GD: d = -g ---- float dir_gd[64]; int i = 0;

#pragma loop ivdep
while (i < dim) {
dir_gd[i] = -grad[i];
++i;
}

text
// Для d = -g: g^T d = -||g||^2 const float gtd_gd = -grad_norm2; // ---- Line-search (Armijo) ---- float eta_trial = eta; float f_new = f, x_new = x, y_new = y; bool clipped_gd = false; // пробная точка в q_try bool found = armijoLineSearch(q_local, f, x, y, grad, dir_gd, gtd_gd, eta_trial, q_try, f_new, x_new, y_new, clipped_gd); if (!found) break; // не удалось сделать шаг — выход // Шаг принят memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float)); f = f_new; x = x_new; y = y_new; eta = eta_trial; // запоминаем удачный шаг // -------------------------------------------------------- // Триггер: относительное улучшение от f_start // improvement = (f_start - f) / f_start // -------------------------------------------------------- const float denom = fmaxf(f_start, 1e-12f); const float rel_impr = (f_start - f) / denom; // Запускаем L-BFGS при достижении порога (без блокировки по клиппингу) if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) { lbfgs_already_tried = true; // --- Сохраняем точку, чтобы уметь вернуться в GD --- float q_resume[64]; memcpy(q_resume, q_local, static_cast<size_t>(dim) * sizeof(float)); float f_resume = f; float x_resume = x; float y_resume = y; float eta_resume = eta; // --- Лучшая точка, найденная внутри L-BFGS --- float q_best_lbfgs[64]; memcpy(q_best_lbfgs, q_local, static_cast<size_t>(dim) * sizeof(float)); float f_best_lbfgs = f; float x_best_lbfgs = x; float y_best_lbfgs = y; // --- История L-BFGS --- float s_hist[m_lbfgs][64]; float y_hist[m_lbfgs][64]; float rho_hist[m_lbfgs]; float alpha_hist[m_lbfgs]; int hist_size = 0; // --- Текущий градиент для L-BFGS --- float gk[64]; float gk_norm2 = 0.0f; computeGrad(q_local, x, y, gk, gk_norm2); bool lbfgs_ok = true; // стартовый alpha для L-BFGS (можно взять eta, но чаще лучше 1.0) float alpha_k = 1.0f; int it = 0; while (it < max_lbfgs_iters) { if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f) break; // ---- Считаем направление L-BFGS ---- float dir[64]; if (hist_size == 0) 
{ int d = 0;

#pragma loop ivdep
while (d < dim) {
dir[d] = -gk[d];
++d;
}
}
else {
// two-loop recursion
float q_vec[64];
int d = 0;
#pragma loop ivdep
while (d < dim) {
q_vec[d] = gk[d];
++d;
}

text
for (int jj = hist_size - 1; jj >= 0; --jj) { float dot_sq = 0.0f; d = 0;

#pragma loop ivdep
while (d < dim) {
dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq);
++d;
}
const float a = dot_sq * rho_hist[jj];
alpha_hist[jj] = a;

							d = 0;

#pragma loop ivdep
while (d < dim) {
q_vec[d] = fmaf(-a, y_hist[jj][d], q_vec[d]);
++d;
}
}

text
// gamma = (s_{k-1}^T y_{k-1}) / (y_{k-1}^T y_{k-1}) float gamma = 1.0f; { const int last = hist_size - 1; float yy = 0.0f; d = 0;

#pragma loop ivdep
while (d < dim) {
yy = fmaf(y_hist[last][d], y_hist[last][d], yy);
++d;
}
// ys = 1/rho
const float ys = 1.0f / rho_hist[last];
if (yy > 0.0f) gamma = ys / yy;
}

text
float r_vec[64]; d = 0;

#pragma loop ivdep
while (d < dim) {
r_vec[d] = gamma * q_vec[d];
++d;
}

text
for (int jj = 0; jj < hist_size; ++jj) { float dot_yr = 0.0f; d = 0;

#pragma loop ivdep
while (d < dim) {
dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr);
++d;
}
const float b = dot_yr * rho_hist[jj];
const float coeff = alpha_hist[jj] - b;

							d = 0;

#pragma loop ivdep
while (d < dim) {
r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]);
++d;
}
}

						d = 0;

#pragma loop ivdep
while (d < dim) {
dir[d] = -r_vec[d];
++d;
}
}

text
// ---- Проверка, что направление действительно спусковое ---- float gtd = 0.0f; int d = 0;

#pragma loop ivdep
while (d < dim) {
gtd = fmaf(gk[d], dir[d], gtd);
++d;
}
if (!(gtd == gtd) || gtd >= -eps_descent) {
// fallback на -grad
d = 0;
#pragma loop ivdep
while (d < dim) {
dir[d] = -gk[d];
++d;
}
gtd = -gk_norm2;
if (gtd >= -eps_descent) { lbfgs_ok = false; break; }
}

text
// ---- Сохраняем старую точку/градиент для s,y ---- float q_old[64]; memcpy(q_old, q_local, static_cast<size_t>(dim) * sizeof(float)); float g_old[64]; memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float)); const float f_old = f; const float x_old = x; const float y_old = y; // ---- Line-search Armijo по L-BFGS направлению ---- float alpha_try = alpha_k; float f_try, x_try, y_try; bool clipped_lbfgs = false; bool step_ok = armijoLineSearch(q_local, f, x, y, gk, dir, gtd, alpha_try, q_try, f_try, x_try, y_try, clipped_lbfgs); if (!step_ok) { lbfgs_ok = false; break; } // ---- Принимаем шаг ---- memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float)); f = f_try; x = x_try; y = y_try; alpha_k = alpha_try; // ---- Новый градиент ---- computeGrad(q_local, x, y, gk, gk_norm2); if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; } // ---- Обновляем лучшую точку внутри L-BFGS ---- if (f < f_best_lbfgs) { f_best_lbfgs = f; memcpy(q_best_lbfgs, q_local, static_cast<size_t>(dim) * sizeof(float)); x_best_lbfgs = x; y_best_lbfgs = y; } // ---- s = x_{k+1}-x_k, y = g_{k+1}-g_k ---- float ys = 0.0f; float s_new[64]; float y_new[64]; d = 0;

#pragma loop ivdep
while (d < dim) {
const float sd = q_local[d] - q_old[d];
const float yd = gk[d] - g_old[d];
s_new[d] = sd;
y_new[d] = yd;
ys = fmaf(yd, sd, ys);
++d;
}

text
if (!(ys == ys) || ys <= eps_lbfgs_curv) { // плохая кривизна -> сброс истории hist_size = 0; } else { // пушим в историю (сдвигом; m маленькое) if (hist_size < m_lbfgs) { const int idx = hist_size; rho_hist[idx] = 1.0f / ys; d = 0;

#pragma loop ivdep
while (d < dim) {
s_hist[idx][d] = s_new[d];
y_hist[idx][d] = y_new[d];
++d;
}
++hist_size;
}
else {
// shift 0..m-2 <- 1..m-1
for (int jj = 0; jj < m_lbfgs - 1; ++jj) {
rho_hist[jj] = rho_hist[jj + 1];
d = 0;
#pragma loop ivdep
while (d < dim) {
s_hist[jj][d] = s_hist[jj + 1][d];
y_hist[jj][d] = y_hist[jj + 1][d];
++d;
}
}
const int idx = m_lbfgs - 1;
rho_hist[idx] = 1.0f / ys;
d = 0;
#pragma loop ivdep
while (d < dim) {
s_hist[idx][d] = s_new[d];
y_hist[idx][d] = y_new[d];
++d;
}
hist_size = m_lbfgs;
}
}

text
// ---- критерий выхода ---- if (gk_norm2 < 1e-12f) break; ++it; } if (lbfgs_ok) { // L-BFGS успешно: фиксируем лучшую точку из L-BFGS и завершаем доводку memcpy(q_local, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f = f_best_lbfgs; x = x_best_lbfgs; y = y_best_lbfgs; break; // выходим из GD-цикла: L-BFGS — финальная стадия } else { // L-BFGS провалился: откатываемся (но если L-BFGS успел найти лучше — берем лучше) if (f_best_lbfgs < f_resume) { memcpy(q_local, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float)); f = f_best_lbfgs; x = x_best_lbfgs; y = y_best_lbfgs; } else { memcpy(q_local, q_resume, static_cast<size_t>(dim) * sizeof(float)); f = f_resume; x = x_resume; y = y_resume; } // возвращаем шаг GD и продолжаем GD eta = eta_resume; } } ++outer; } // Обновляем лучшую точку if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else { ++no_improve; } } return f; }; const float f_a = evalAt(a); const float f_b = evalAt(b); const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f)); H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u); const int rank = g_world->rank(); const int world = g_world->size(); alignas(16) float seeds[256 * 32]; const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed)))); int i = 0; while (i < seedCnt) { const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f)); const float t_seed = map.pointToT(s); const float interval_size = (i < 3) ? 
fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1 = fmaf(-interval_size, 1.0f, t_seed); const float t2 = fmaf(interval_size, 1.0f, t_seed); alignas(16) float q1[32]; alignas(16) float q2[32]; float x1; float y1; float x2; float y2; map.map01ToPoint(t1, q1); const float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); const float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; } if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; } ++i; } float prev_t = a; float prev_f = f_a; int k = 1; while (k <= K) { const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1))); const float f = evalAt(t); IntervalND* I = new IntervalND(prev_t, t, prev_f, f); I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 
0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); prev_t = t; prev_f = f; ++k; } IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b); tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b); tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam); update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND); const int noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f)); while (true) { boost::mpi::wait_all(pending_requests->begin(), pending_requests->end()); p = fmaf(-1.0f / initial_len, dmax, 1.0f); stag_r_multiplier = 1.4f - 1.1f * fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), 0.65f * p - 0.45f); const float p_arg = fmaf(p, 2.3f, -2.9775f); float current_r = r; if (stag_boost_remaining > 0) { current_r = r * stag_r_multiplier; stag_boost_remaining--; } // ----- Проверка стагнации с учётом нормы градиента ----- const float grad_threshold = 0.5e-1f; // порог, как для переключения на L-BFGS float grad_norm2_best = 0.0f; if (no_improve > 0) { // Вычисляем градиент в текущей лучшей точке bestQ float acc_best = 0.0f; float phi_best[32]; int ii_best = 0;

#pragma loop ivdep
while (ii_best < n) {
acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best);
phi_best[ii_best] = acc_best;
++ii_best;
}
float s_best[32], c_best[32];
FABE13_SINCOS(phi_best, s_best, c_best, n);

text
float as_best = 0.0f, ac_best = 0.0f; float sum_s_best[32], sum_c_best[32]; int k_best = n - 1; while (k_best >= 0) { const float Lk = cost.variableLen ? bestQ[n + k_best] : 1.0f; as_best = fmaf(Lk, s_best[k_best], as_best); ac_best = fmaf(Lk, c_best[k_best], ac_best); sum_s_best[k_best] = as_best; sum_c_best[k_best] = ac_best; --k_best; } const float dx_best = fmaf(bestX, 1.0f, -cost.targetX); const float dy_best = fmaf(bestY, 1.0f, -cost.targetY); const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best)); const float inv_dist_best = 1.0f / dist_best; int i_best = 0;

#pragma loop ivdep
while (i_best < n) {
float gpen_best = 0.0f;
// Производная штрафа за превышение minTheta
{
const float ai = fabsf(bestQ[i_best]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / cost.minTheta, v * 0.69314718f, 0.0f);
const float exp_val = fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float dpen = fmaf(cost.sharpW, exp_val * (1.38629436f / cost.minTheta), 0.0f);
gpen_best = fmaf(dpen, copysignf(1.0f, bestQ[i_best]), gpen_best);
}
}
// Производная арк-штрафа
{
const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float sig = 1.0f / (exp_val + 1.0f);
gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best);
}

text
const float g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best; float gi_best = g_main_best + gpen_best; grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best); ++i_best; } if (cost.variableLen) { int j_best = 0;

#pragma loop ivdep
while (j_best < n) {
const float gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best;
grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best);
++j_best;
}
}
}

text
// Модифицированное условие стагнации const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < grad_threshold); const float r_eff = dim > 2 ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f); if (stagnation) { // Устанавливаем усиление на несколько итераций stag_boost_remaining = n_stag_iters; // Определяем количество разведывательных сидов по формуле из текущего кода const int num_ik = 1 + static_cast<int>(sqrtf(dim_f)); // ---- Оценка вытянутости конфигурации ---- float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY); float max_reach = 0.0f; if (cost.variableLen) { for (int i = 0; i < n; ++i) max_reach += map.high[n + i]; } else { max_reach = static_cast<float>(n); } float ratio = dist_to_target / max_reach; bool prefer_extended = (ratio > 0.7f); bool prefer_compact = (ratio < 0.4f); bool use_ik = !(ratio > 0.4f && ratio < 0.7f); // Массивы для хранения сидов float t_seeds[32]; int seed_count = 0; if (!use_ik) { // В неопределённом случае используем Sobol-сиды float temp_S[32 * 32]; int sobol_gen = generate_sobol_seeds(map, dim, temp_S, 32, seed + it); int num_sobol = num_ik; // берём столько же, сколько планировали IK for (int k = 0; k < num_sobol && k < sobol_gen; ++k) { const float* s = temp_S + k * 32; float t_s = map.pointToT(s); t_seeds[seed_count++] = t_s; } } else { // Используем IK-сиды // Базовые решения вычисляем так же, но возможно только одно из них float angles_ccd[32] = { 0 }; float lengths_ccd[32]; if (cost.variableLen) { float len_low = map.low[n]; float len_high = map.high[n]; float avg_len = (len_low + len_high) * 0.5f; for (int i = 0; i < n; ++i) lengths_ccd[i] = avg_len; } else { for (int i = 0; i < n; ++i) lengths_ccd[i] = 1.0f; } 
ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10); float angles_fabrik[32] = { 0 }; float lengths_fabrik[32]; if (cost.variableLen) { for (int i = 0; i < n; ++i) lengths_fabrik[i] = lengths_ccd[i]; } else { for (int i = 0; i < n; ++i) lengths_fabrik[i] = 1.0f; } float targetX_fab = cost.targetX; float targetY_fab = cost.targetY; for (int iter_fab = 0; iter_fab < 3; ++iter_fab) { float prevX = targetX_fab; float prevY = targetY_fab; for (int j = n - 1; j >= 0; --j) { float len = lengths_fabrik[j]; float angle_to_target = atan2f(prevY, prevX); angles_fabrik[j] = angle_to_target; float s_val, c_val; FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1); prevX = prevX - len * c_val; prevY = prevY - len * s_val; } } if (prefer_extended) { // --- Чистый CCD --- { float q_ccd[32]; for (int i = 0; i < n; ++i) q_ccd[i] = angles_ccd[i]; if (cost.variableLen) { for (int i = 0; i < n; ++i) q_ccd[n + i] = lengths_ccd[i]; } float t_ccd = map.pointToT(q_ccd); t_seeds[seed_count++] = t_ccd; } // --- Остальные сиды с шумом на основе CCD --- unsigned st_ik = seed + it + 222; int remaining = num_ik - 1; for (int v = 0; v < remaining; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int i = 0; i < n; ++i) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f; noisy_angles[i] = angles_ccd[i] + (2.0f * rnd - 1.0f) * 0.1f; const float lo = (i == 0) ? 
-1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[i] < lo) noisy_angles[i] = lo; if (noisy_angles[i] > hi) noisy_angles[i] = hi; } if (cost.variableLen) { for (int i = 0; i < n; ++i) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f; noisy_lengths[i] = lengths_ccd[i] + (2.0f * rnd - 1.0f) * 0.05f; if (noisy_lengths[i] < 0.5f) noisy_lengths[i] = 0.5f; if (noisy_lengths[i] > 2.0f) noisy_lengths[i] = 2.0f; } } float q_temp[32]; for (int i = 0; i < n; ++i) q_temp[i] = noisy_angles[i]; if (cost.variableLen) { for (int i = 0; i < n; ++i) q_temp[n + i] = noisy_lengths[i]; } float t_temp = map.pointToT(q_temp); t_seeds[seed_count++] = t_temp; } } else if (prefer_compact) { // --- Чистый FABRIK --- { float q_fabrik[32]; for (int i = 0; i < n; ++i) q_fabrik[i] = angles_fabrik[i]; if (cost.variableLen) { for (int i = 0; i < n; ++i) q_fabrik[n + i] = lengths_fabrik[i]; } float t_fabrik = map.pointToT(q_fabrik); t_seeds[seed_count++] = t_fabrik; } // --- Остальные сиды с шумом на основе FABRIK --- unsigned st_ik = seed + it + 222; int remaining = num_ik - 1; for (int v = 0; v < remaining; ++v) { float noisy_angles[32]; float noisy_lengths[32]; for (int i = 0; i < n; ++i) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f; noisy_angles[i] = angles_fabrik[i] + (2.0f * rnd - 1.0f) * 0.1f; const float lo = (i == 0) ? 
-1.0471975511965976f : -2.6179938779914944f; const float hi = 2.6179938779914944f; if (noisy_angles[i] < lo) noisy_angles[i] = lo; if (noisy_angles[i] > hi) noisy_angles[i] = hi; } if (cost.variableLen) { for (int i = 0; i < n; ++i) { st_ik ^= st_ik << 13; st_ik ^= st_ik >> 17; st_ik ^= st_ik << 5; float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f; noisy_lengths[i] = lengths_fabrik[i] + (2.0f * rnd - 1.0f) * 0.05f; if (noisy_lengths[i] < 0.5f) noisy_lengths[i] = 0.5f; if (noisy_lengths[i] > 2.0f) noisy_lengths[i] = 2.0f; } } float q_temp[32]; for (int i = 0; i < n; ++i) q_temp[i] = noisy_angles[i]; if (cost.variableLen) { for (int i = 0; i < n; ++i) q_temp[n + i] = noisy_lengths[i]; } float t_temp = map.pointToT(q_temp); t_seeds[seed_count++] = t_temp; } } } // Теперь у нас есть seed_count сидов (должно быть равно num_ik) // Проведём локальную оптимизацию для каждого float optimized_points[32][32]; float optimized_f[32]; float optimized_t[32]; for (int s = 0; s < seed_count; ++s) { float t_cur = t_seeds[s]; float f_opt = evalAt(t_cur); // запускает локальную доводку, обновляет q_local, x, y memcpy(optimized_points[s], q_local, dim * sizeof(float)); optimized_f[s] = f_opt; optimized_t[s] = map.pointToT(q_local); } // Создаём один интервал вокруг каждой оптимизированной точки for (int s = 0; s < seed_count; ++s) { float t_opt = optimized_t[s]; const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f); float t1 = t_opt - interval_size * 0.5f; float t2 = t_opt + interval_size * 0.5f; if (t1 < 0.0f) t1 = 0.0f; if (t2 > 1.0f) t2 = 1.0f; float q1[32], q2[32]; float x1, y1, x2, y2; map.map01ToPoint(t1, q1); float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 
0.0f)); const float boost = fmaf(0.01f, dim_f, 0.85f); I->R = fmaf(I->R, boost, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } // Сбрасываем счётчик стагнации no_improve = 0; } const float exp_arg = fmaf(B_dim, p, 0.0f); const float exp_arg__ = fmaf(B_dim__, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f)); float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f)); float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f)); float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f)); float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)), xx = p * p, tt = fmaf(500.0f, p, -486.95472f); float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? 
fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f); adaptive_coeff__ = fmaf(fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__); const float arg_for_T = fmaf(p, 4.5f, 2.475f); const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f); const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float A = fmaf(39.995f, exp2_exp_arg, 0.0f); const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.88f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f)); std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff); const float bestFOld = bestF; const float fNew = evalAt(tNew); IntervalND* L = new IntervalND(x1, tNew, y1, fNew); IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2); L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew); Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2); L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2); L->compute_span_level(map); Rv->compute_span_level(map); L->set_metric(L->diam); Rv->set_metric(Rv->diam); const float Mloc = fmaxf(L->M, Rv->M); update_pockets_and_Mmax(L); 
update_pockets_and_Mmax(Rv); const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc; m = fmaf(r_eff, Mmax, 0.0f); if (adaptive) { const float len1 = fmaf(tNew, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -tNew); if (fmaf(len1, 1.0f, len2) == dmax) { dmax = fmaxf(len1, len2); for (auto pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* const top = H.front(); const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam; if (/*(dim > 1 ? 
exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || */it == maxIter - 1) { memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float)); float x_final = bestX; float y_final = bestY; float f_final = bestF; const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f_final; float saved = q_local[last]; float delta = 0.05f; while (delta >= 0.00625f) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2; float y2; const float f2 = cost(q_local, x2, y2); if (f2 < bestLocF) { bestLocF = f2; x_final = x2; y_final = y2; saved = cand; } q_local[last] = backup; sgn += 2; } delta *= 0.5f; } if (bestLocF < f_final) { q_local[last] = saved; f_final = bestLocF; bestF = f_final; bestX = x_final; bestY = y_final; bestQ.assign(q_local, q_local + dim); } out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } if (bestF < fmaf(bestFOld, adaptive_coeff__, 0.0f)) { BestPointMsg out; out.bestF = bestF; out.bestX = bestX; out.bestY = bestY; out.dim = static_cast<unsigned>(dim); memcpy(out.bestQ, bestQ.data(), static_cast<size_t>(dim) * sizeof(float)); IntervalND* I0 = H[0]; IntervalND* I1 = H[1]; map.map01ToPoint(I0->x1, out.q1_left); map.map01ToPoint(I0->x2, out.q1_right); out.f1_left = I0->y1; out.f1_right = I0->y2; map.map01ToPoint(I1->x1, out.q2_left); map.map01ToPoint(I1->x2, out.q2_right); out.f2_left = I1->y1; out.f2_right = I1->y2; const size_t iterations = std::bit_width(static_cast<size_t>(world - 1)); bool active = true; const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1); size_t ii2 = 0u; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int 
partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { pending_requests->push_back(g_world->isend(partner, 2, out)); active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestPointMsg bm; g_world->recv(boost::mpi::any_source, 2, bm); if (bm.bestF < fmaf(bestF, adaptive_coeff__, 0.0f) || stagnation) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); // Обновляем лучшую точку, если она лучше if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } // Предрасчёт коэффициентов (как в оригинале) const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f; const float kf = stagnation ? 
fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f); float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone); const float exp_arg = B_dim; float adaptive_coeff_clone1 = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone); { float t1 = map.pointToT(bm.q1_left); float t2 = map.pointToT(bm.q1_right); IntervalND* I = new IntervalND(t1, t2, bm.f1_left, bm.f1_right); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone0, 0.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } { float t1 = map.pointToT(bm.q2_left); float t2 = map.pointToT(bm.q2_right); IntervalND* I = new IntervalND(t1, t2, bm.f2_left, bm.f2_right); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone1, 0.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } } } ++it; }

}" - код сейчас, до этого была другая пересылка: "if (bestF < fmaf(bestFOld, adaptive_coeff__, 0.0f)) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
{
IntervalND* const I0 = H[0];
out.bestI[0] = I0->x1;
out.bestI[1] = I0->x2;
out.bestI[2] = I0->y1;
out.bestI[3] = I0->y2;
IntervalND* const I1 = H[1];
out.bestI[4] = I1->x1;
out.bestI[5] = I1->x2;
out.bestI[6] = I1->y1;
out.bestI[7] = I1->y2;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0u;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
pending_requests->push_back(g_world->isend(partner, 2, out));
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, adaptive_coeff__, 0.0f) || stagnation) {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = stagnation ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
const float exp_arg = B_dim;
float adaptive_coeff_clone1 = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
{
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone0, 0.0f), 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
{
const float sx = bm.bestI[4];
const float ex = bm.bestI[5];
const float y1i = bm.bestI[6];
const float y2i = bm.bestI[7];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone1, 0.0f), 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}", при этом структуры для MPI: "__declspec(align(16)) struct CrossMsg final {
// Message consisting of five float fields.
// NOTE(review): the meaning of the s_/e_ prefixes and of Rtop is not shown in this excerpt; document them once confirmed against the code that fills the message.
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;

text
// Streams all five fields, in declaration order, through the archive's operator&; the second argument is unnamed and unused.
template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }

};

// Message holding a fixed buffer of 35 floats together with an unsigned counter.
// NOTE(review): `count` presumably says how many entries of `intervals` are in
// use; confirm against the code that fills the message.
__declspec(align(16)) struct MultiCrossMsg final {
    float intervals[35];
    unsigned count;

    // Streams both members, in declaration order, through the archive's
    // operator&. The second argument is unnamed and unused.
    template <typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
        ar & intervals & count;
    }
};

// Message carrying a best solution plus eight floats describing two intervals.
//   bestF, bestX, bestY - best objective value and the two coordinates stored with it
//   bestQ               - parameter vector of the best solution (up to 32 floats)
//   bestI               - two intervals packed as {x1, x2, y1, y2} each:
//                         entries 0..3 for the first, 4..7 for the second
//   dim                 - number of valid entries in bestQ
__declspec(align(16)) struct BestSolutionMsg final {
    float bestF;
    float bestX;
    float bestY;
    float bestQ[32];
    float bestI[8];
    unsigned dim;

    // Streams every member, in declaration order, through the archive's
    // operator&. The second argument is unnamed and unused.
    template <typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
        ar & bestF & bestX & bestY & bestQ & bestI & dim;
    }
};

// New message type: transmits the best solution plus two intervals whose endpoints are sent as points (coordinate arrays).
__declspec(align(16)) struct BestPointMsg {
// Best objective value, the two coordinates stored with it, its parameter vector, and the number of valid entries in bestQ.
float bestF;
float bestX;
float bestY;
float bestQ[32];
unsigned dim;
// First interval: function values at its two ends and the coordinate arrays of those ends.
float f1_left, f1_right;
float q1_left[32];
float q1_right[32];
// Second interval, same layout as the first.
float f2_left, f2_right;
float q2_left[32];
float q2_right[32];

text
// Streams every member, in declaration order, through the archive's operator&; the second argument is unnamed and unused.
template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim & f1_left& f1_right& q1_left& q1_right & f2_left& f2_right& q2_left& q2_right; }

};", код для Мортон-кодирования: "__declspec(align(16)) struct MortonCachePerRank final sealed{
// Cached permutation and inversion masks; the MortonND constructor copies the first D entries of each.
std::vector<int> permCache;
std::vector<unsigned long long> invMaskCache;
// NOTE(review): presumably the seed the two caches were generated from; not used anywhere in this excerpt, confirm.
unsigned baseSeed;
};

// One MortonCachePerRank instance per thread (thread_local), with internal linkage (static).
static thread_local MortonCachePerRank g_mc;

// Binary -> reflected Gray code: every output bit is the XOR of the matching
// input bit with the next-higher input bit.
static __declspec(noalias) __forceinline unsigned long long gray_encode(unsigned long long x) noexcept {
    const unsigned long long upperNeighbour = x >> 1;
    return upperNeighbour ^ x;
}

// Reflected Gray code -> binary. Folds the word onto itself with strides
// 32, 16, 8, 4, 2, 1 so that every bit ends up as the XOR of itself and all
// higher bits. The result is returned reinterpreted as a signed 64-bit value.
static __declspec(noalias) __forceinline long long gray_decode(unsigned long long g) noexcept {
    for (int stride = 32; stride != 0; stride >>= 1) {
        g ^= g >> stride;
    }
    return static_cast<long long>(g);
}

// Bit-interleaved (Morton-style) mapping between a scalar t and a point on a
// regular grid with 2^levels cells per dimension.
//
// The `levels` bits of every dimension are processed in `chunks` groups of at
// most `eff_levels` bits, so that one group of all dimensions (dim * Lc bits)
// fits into a single 64-bit word. Inside a group the bit of dimension slot d
// at depth b lives at position d + b * dim (see make_mask). Optionally the
// group index is Gray-coded, and the per-dimension bits are XOR-ed with a
// slice of invMask.
//
// Local buffers in map01ToPoint / pointToT hold 32 entries, so dim must not
// exceed 32.
__declspec(align(16)) struct MortonND final sealed{
    const int dim;            // number of dimensions
    const int levels;         // bits (subdivision levels) per dimension
    const int eff_levels;     // max bits per dimension per chunk: max(1, 63 / dim)
    const int extra_levels;   // levels that do not fit into the first chunk
    const int chunks;         // number of chunks the levels are split into
    std::vector<int> chunk_bits;                  // bits per dimension in each chunk
    std::vector<unsigned long long> chunk_bases;  // 2^(dim * chunk_bits[c])
    unsigned long long scale;                     // 2^(dim * chunk_bits[0])
    std::vector<float> low;       // lower bound per dimension
    std::vector<float> high;      // upper bound per dimension
    std::vector<float> step;      // cell width per dimension
    std::vector<float> invStep;   // 1 / step
    std::vector<float> baseOff;   // centre of cell 0: low + step / 2
    std::vector<int> perm;                           // dimension permutation (slot -> dimension)
    std::vector<unsigned long long> invMask;         // per-dimension XOR mask
    std::vector<unsigned long long> pextMask;        // interleave masks of chunk 0
    std::vector<unsigned long long> pextMaskChunks;  // interleave masks, [chunk * dim + slot]
    const float invScaleLevel;    // 1 / 2^levels
    const bool use_gray;          // Gray-code the chunk index (always true here)

    // Builds the mask that selects the Lc bits of slot d inside an interleaved
    // chunk word: bits d, d + dim, d + 2 * dim, ...
    static __declspec(noalias) __forceinline unsigned long long make_mask(int dim, int Lc, int d) noexcept {
        unsigned long long m = 0ull;
        unsigned long long bitpos = static_cast<unsigned long long>(d);
        int b = 0;
#pragma loop ivdep
        while (b < Lc) {
            m |= 1ull << bitpos;
            bitpos += static_cast<unsigned long long>(dim);
            ++b;
        }
        return m;
    }

    // D     - number of dimensions
    // L     - bits per dimension
    // lows  - D lower bounds
    // highs - D upper bounds
    // mc    - cache supplying the first D permutation and inversion-mask entries
    __declspec(noalias) __forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
        : dim(D),
          levels(L),
          eff_levels((std::max)(1, static_cast<int>(63 / (D ? D : 1)))),
          extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
          chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
          low(lows, lows + D),
          high(highs, highs + D),
          step(D, 0.0f),
          invStep(D, 0.0f),
          baseOff(D, 0.0f),
          perm(mc.permCache.begin(), mc.permCache.begin() + D),
          invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
          invScaleLevel(1.0f / static_cast<float>(static_cast<unsigned long long>(1) << L)),
          use_gray(true) {
        // Per-dimension grid geometry.
        int d = 0;
#pragma loop ivdep
        while (d < dim) {
            const float rng = high[d] - low[d];
            const float st = rng * invScaleLevel;
            step[d] = st;
            invStep[d] = 1.0f / st;
            baseOff[d] = fmaf(0.5f, st, low[d]);
            ++d;
        }

        // Distribute the levels over the chunks and precompute each chunk's
        // base and interleave masks.
        chunk_bits.resize(chunks);
        pextMaskChunks.resize(static_cast<size_t>(chunks) * static_cast<size_t>(dim));
        chunk_bases.resize(chunks);
        int remaining = levels;
        int c = 0;
        while (c < chunks) {
            const int Lc = (std::min)(eff_levels, remaining);
            chunk_bits[c] = Lc;
            remaining -= Lc;
            const unsigned long long baseC = static_cast<unsigned long long>(1) << (dim * Lc);
            chunk_bases[c] = baseC;
            d = 0;
#pragma loop ivdep
            while (d < dim) {
                pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)] = make_mask(dim, Lc, d);
                ++d;
            }
            ++c;
        }

        // Masks and scale of the first (most significant) chunk.
        pextMask.resize(dim);
        d = 0;
#pragma loop ivdep
        while (d < dim) {
            pextMask[d] = make_mask(dim, chunk_bits[0], d);
            ++d;
        }
        scale = static_cast<unsigned long long>(1) << (dim * chunk_bits[0]);
    }

    // Diameter estimate for the block spanned by two first-chunk indices.
    // For every dimension slot, the number of differing index bits plus the
    // levels below the first chunk gives an exponent n; the extent used is
    // step * (2^n - 1). Returns the Euclidean norm of these extents.
    __declspec(noalias) __forceinline float block_diameter(unsigned long long i1, unsigned long long i2) const noexcept {
        if (i1 > i2)
            std::swap(i1, i2);
        float s2 = 0.0f;
        int d = 0;
#pragma loop ivdep
        while (d < dim) {
            const int pd = perm[d];
            const unsigned long long varying = (i1 ^ i2) & pextMask[d];
            const int nfree_hi = static_cast<int>(_mm_popcnt_u64(varying));
            const int nfree_total = nfree_hi + levels - chunk_bits[0];
            const float range = fmaf(step[pd], fmaf(ldexpf(1.0f, nfree_total), 1.0f, -1.0f), 0.0f);
            s2 = fmaf(range, range, s2);
            ++d;
        }
        return sqrtf(s2);
    }

    // Maps t to the centre of a grid cell and writes `dim` coordinates to out.
    // t is consumed chunk by chunk: the integer part of u * base is the chunk
    // index, the fractional part is carried on to the next chunk.
    __declspec(noalias) __forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
        float u = t;
        unsigned long long accBits[32] = {0ull};
        int c = 0;
        while (c < chunks) {
            const int Lc = chunk_bits[c];
            const unsigned long long baseC = chunk_bases[c];
            u *= static_cast<float>(baseC);
            unsigned long long idxc = static_cast<unsigned long long>(u);
            u -= static_cast<float>(idxc);
            if (use_gray)
                idxc = gray_encode(idxc);

            // Number of levels still below this chunk: selects the slice of
            // invMask that lines up with the chunk's bits.
            int shift_from_top = 0;
            int k = 0;
            while (k <= c) {
                shift_from_top += chunk_bits[k];
                ++k;
            }
            const int inv_shift = levels - shift_from_top;

            int d = 0;
#pragma loop ivdep
            while (d < dim) {
                const int pd = perm[d];
                const unsigned long long mask = pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)];
                unsigned long long bits = _pext_u64(idxc, mask);
                if (inv_shift >= 0 && Lc < 63) {
                    const unsigned long long invMaskSegment = (invMask[pd] >> inv_shift) & ((static_cast<unsigned long long>(1) << Lc) - 1ull);
                    bits ^= invMaskSegment;
                }
                accBits[pd] = (accBits[pd] << Lc) | bits;
                ++d;
            }
            ++c;
        }

        int d = 0;
#pragma loop ivdep
        while (d < dim) {
            out[d] = fmaf(step[d], static_cast<float>(accBits[d]), baseOff[d]);
            ++d;
        }
    }

    // Maps a point back to t. Every coordinate is rounded to the nearest cell
    // index and clamped to [0, 2^levels - 1]; the chunk indices are then folded
    // from the last chunk to the first as t = (t + idx) / base.
    //
    // NOTE(review): the accumulation is done in float. With a 24-bit
    // significand (IEEE-754 binary32) chunk indices wider than 24 bits are
    // rounded, so this is presumably not an exact inverse of map01ToPoint and
    // two distinct nearby points may yield the same t; confirm before relying
    // on round trips.
    __declspec(noalias) __forceinline float pointToT(const float* __restrict q) const noexcept {
        unsigned long long cell[32];
        const long long maxv = (static_cast<long long>(1) << levels) - 1;
        int d = 0;
#pragma loop ivdep
        while (d < dim) {
            const int pd = perm[d];
            const float v = (q[pd] - baseOff[pd]) * invStep[pd];
            const long long cellIdx = _mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v),
                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
            cell[pd] = static_cast<unsigned long long>(cellIdx < 0 ? 0 : (cellIdx > maxv ? maxv : cellIdx));
            ++d;
        }

        float t = 0.0f;
        int c = chunks;
        while (c > 0) {
            --c;
            const int Lc = chunk_bits[c];
            const unsigned long long baseC = chunk_bases[c];
            int shift_from_top = 0;
            int k = 0;
            while (k <= c) {
                shift_from_top += chunk_bits[k];
                ++k;
            }
            const int inv_shift = levels - shift_from_top;
            unsigned long long idxc = 0ull;
            d = 0;
#pragma loop ivdep
            while (d < dim) {
                const int pd = perm[d];
                const unsigned long long mask = (static_cast<unsigned long long>(1) << Lc) - 1;
                unsigned long long bits = (cell[pd] >> inv_shift) & mask;
                if (inv_shift >= 0 && Lc < 63) {
                    const unsigned long long invMaskSegment = (invMask[pd] >> inv_shift) & mask;
                    bits ^= invMaskSegment;
                }
                const unsigned long long pdep_mask = pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)];
                idxc |= _pdep_u64(bits, pdep_mask);
                ++d;
            }
            if (use_gray)
                idxc = gray_decode(idxc);
            t = (t + static_cast<float>(idxc)) / static_cast<float>(baseC);
        }
        return t;
    }
};

// Sets span_level from the two endpoint indices i1 and i2:
//   - counts, per dimension mask in map.pextMask, the bit positions in which i1 and i2 differ;
//   - adds (map.levels - map.chunk_bits[0]) * map.dim;
//   - caps the result at 11, the last valid index of the 12-entry M_by_span table that agp_run_branch_mpi indexes by span_level.
__declspec(noalias) __forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
span_level = 0;
int d = 0;
#pragma loop ivdep
while (d < map.dim) {
const unsigned long long varying = (i1 ^ i2) & map.pextMask[d];
span_level += _mm_popcnt_u64(varying);
++d;
}
span_level += (map.levels - map.chunk_bits[0]) * map.dim;
span_level = (std::min)(span_level, 11);
}" - после перехода на новый тип пересылки, точность осталась в точности такой же, но скорость упала в 50 раз, в чём проблема? исправь, структура для интервалов: "__declspec(align(16)) struct IntervalND final sealed{
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
float N_factor;
float quadratic_term;
float M;
float R;
unsigned long long i1;
unsigned long long i2;
float diam;
int span_level;

text
static __declspec(noalias) __forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; } __declspec(noalias) __forceinline IntervalND(float _x1, float _x2, float _y1, float _y2) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(fmaf(_y2, 1.0f, -_y1)), ordinate_factor(fmaf(fmaf(-y1, 1.0f, -y2), 2.0f, 0.0f)), N_factor(0), quadratic_term(0), M(0), R(0), i1(0), i2(0), diam(0), span_level(0) {

}"

Partager cette Q&R