static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept {
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12)
M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
// Converts a curve parameter t into an integer index by scaling it with map.scale.
// The scaled value is clamped to the range of unsigned long long before the cast:
// converting a negative, NaN, or too-large float to an unsigned integer is undefined
// behaviour, and callers pass unclamped values such as t_seed - interval_size.
auto t_to_idx = [&](float t) -> unsigned long long {
	const float scaled = fmaf(t, static_cast<float>(map.scale), 0.0f);
	// `!(scaled > 0)` is also true for NaN.
	if (!(scaled > 0.0f))
		return 0ULL;
	// 2^64 as a float; values at or above it do not fit in 64 bits.
	if (scaled >= 18446744073709551616.0f)
		return ~0ULL;
	return static_cast<unsigned long long>(scaled);
};
// Raises the per-span-level maximum M_by_span[I->span_level] to I->M when I->M is
// larger, then raises the global maximum Mmax to that per-level value when needed.
auto update_pockets_and_Mmax = [&](IntervalND* I) {
	const int level = I->span_level;
	float& pocket = M_by_span[level];
	if (pocket < I->M)
		pocket = I->M;
	if (Mmax < pocket)
		Mmax = pocket;
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = b - a;
const float initial_len = dmax;
// Dimension-dependent constants for the adaptive acceptance coefficient. Two parallel
// families are computed: the plain names and the `__`-suffixed names, which use
// different numeric constants.
const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f);
// Fix: this line used to declare `A_dim` a second time (a redefinition), while the
// lines below read `A_dim__`, which was never declared.
const float A_dim__ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f);
const float B_dim = fmaf(A_dim, 0.7f, 0.0f);
const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float log_argument__ = A_dim__ - 2.0f;
// Polynomial in log_argument, evaluated in Horner form with fmaf, minus B_dim.
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__;
// Fourth-order polynomial in C_dim, evaluated in Horner form with fmaf.
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
// Starting values only: both coefficients are reassigned inside the main loop.
float adaptive_coeff = A_dim - adaptive_coeff_addition;
float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
// --- Переменные для механизма стагнации с усилением исследования ---
int stag_boost_remaining = 0;
float stag_r_multiplier = 0.0f;

// Number of iterations for which the stagnation boost of r stays active once triggered.
const int n_stag_iters = static_cast<int>(3.0f + 2.045f * sqrtf(dim_f)); // количество итераций усиления
auto evalAt = [&](const float t) -> float {
	map.map01ToPoint(t, q_local);
	float f = cost(q_local, x, y);
	if (f < fmaf(bestF, adaptive_coeff, 0.0f)) {

		// --- значение функции в точке старта локальной доводки (для триггера 70%) ---
		const float f_start = f;

		// Параметры Armijo
		const float c1 = 1e-4f;                // достаточное убывание
		const float tau = 0.5f;                // коэффициент уменьшения шага
		const int max_outer_iters = (int)(50.0f * (1.0f + 0.65f * p));
		const int max_backtrack = (int)(20.0f * (1.0f + 0.65f * p));

		// --- L-BFGS параметры ---
		const float lbfgs_trigger = 0.6f;     // 70% относительное улучшение
		const int   m_lbfgs = 9;               // память
		const int   max_lbfgs_iters = (int)(25.0f * (1.0f + 0.65f * p));
		const float eps_lbfgs_curv = 1e-6f;    // порог кривизны ys
		const float eps_descent = 1e-12f;

		float eta = 1.0f;                      // шаг для GD (и как стартовый для line-search)

		// ------------------------------------------------------------
		// Вспомогательное: clamp в допустимые границы
		// ------------------------------------------------------------
		auto clampPoint = [&](float* q) {
			// Clamps q in place. The first n entries are limited to
			// [-2.6179938779914944, 2.6179938779914944], except entry 0 whose lower
			// limit is -1.0471975511965977. When cost.variableLen is set, the next n
			// entries are limited to [0.5, 2.0]. Comparisons are written so that a
			// NaN entry is left untouched.
			constexpr float kUpper = 2.6179938779914944f;
			constexpr float kLower = -2.6179938779914944f;
			constexpr float kLowerFirst = -1.0471975511965977f;

#pragma loop ivdep
			for (int joint = 0; joint < n; ++joint) {
				const float lower = (joint == 0) ? kLowerFirst : kLower;
				float& value = q[joint];
				if (value < lower) value = lower;
				else if (value > kUpper) value = kUpper;
			}

			if (!cost.variableLen)
				return;

			float* const tail = q + n;
#pragma loop ivdep
			for (int link = 0; link < n; ++link) {
				float& value = tail[link];
				if (value < 0.5f) value = 0.5f;
				else if (value > 2.0f) value = 2.0f;
			}
		};

		// ------------------------------------------------------------
		// Helper: compute the gradient.
		// Inputs:  q_in, x_in, y_in
		// Outputs: grad_out[dim], grad_norm2_out
		// ------------------------------------------------------------
		auto computeGrad = [&](const float* q_in, const float x_in, const float y_in,
			float* grad_out, float& grad_norm2_out) {
			// Computes the gradient at q_in, given the end point (x_in, y_in) that
			// belongs to q_in.
			//   grad_out[0 .. n-1]   : derivatives with respect to the first n entries
			//   grad_out[n .. 2n-1]  : written only when cost.variableLen is set
			//   grad_norm2_out       : sum of squares of every gradient entry written
			// Side effect: overwrites the captured scratch arrays phi, s_arr, c_arr,
			// sum_s and sum_c.
			// If (x_in, y_in) coincides with the target, dist is 0 and inv_dist is
			// infinite, so the result can be NaN; callers test grad_norm2 for NaN.

			// Fourth-order polynomial in u, Horner form. It is used twice below with
			// exactly the same coefficients and evaluation order as before.
			const auto poly4 = [](const float u) -> float {
				return fmaf(u, fmaf(u, fmaf(u, fmaf(u, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
			};

			// Running sums of the first n entries: phi[i] = q_in[0] + ... + q_in[i].
			float acc = 0.0f;
			int ii = 0;

#pragma loop ivdep
			while (ii < n) {
				acc = fmaf(q_in[ii], 1.0f, acc);
				phi[ii] = acc;
				++ii;
			}
			// Presumably fills s_arr / c_arr with sin / cos of phi (macro defined elsewhere).
			FABE13_SINCOS(phi, s_arr, c_arr, n);

			// Suffix sums, accumulated from the last index down to 0:
			// sum_s[k] = sum over j >= k of L_j * s_arr[j], and likewise sum_c.
			float as = 0.0f;
			float ac = 0.0f;
			int k = n - 1;
			while (k >= 0) {
				const float Lk = cost.variableLen ? q_in[n + k] : 1.0f;
				as = fmaf(Lk, s_arr[k], as);
				ac = fmaf(Lk, c_arr[k], ac);
				sum_s[k] = as;
				sum_c[k] = ac;
				--k;
			}

			// Offset from the target and its reciprocal length.
			const float dx = fmaf(x_in, 1.0f, -cost.targetX);
			const float dy = fmaf(y_in, 1.0f, -cost.targetY);
			const float dist = sqrtf(fmaf(dx, dx, dy * dy));
			const float inv_dist = 1.0f / dist;

			grad_norm2_out = 0.0f;

			// Gradient with respect to the first n entries (angles).
			int i = 0;

#pragma loop ivdep
			while (i < n) {
				float gpen = 0.0f;
				// Derivative of the penalty that is active when |q_in[i]| exceeds cost.minTheta.
				{
					const float ai = fabsf(q_in[i]);
					const float v = fmaf(ai, 1.0f, -cost.minTheta);
					if (v > 0.0f) {
						const float scale_arg = fmaf(2.0f / cost.minTheta, v * 0.69314718f, 0.0f);
						const float exp_val = poly4(scale_arg);
						const float dpen = fmaf(cost.sharpW, exp_val * (1.38629436f / cost.minTheta), 0.0f);
						gpen = fmaf(dpen, copysignf(1.0f, q_in[i]), gpen);
					}
				}
				// Derivative of the arch-bias penalty (weights cost.archBiasW, cost.archBiasK).
				{
					const float tsg = fmaf(-q_in[i], cost.archBiasK, 0.0f);
					const float exp_arg = -tsg;
					const float exp_val = poly4(exp_arg);
					const float sig = 1.0f / (exp_val + 1.0f);
					gpen = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen);
				}

				// Target-distance term plus penalty term.
				const float g_main = fmaf(dx, -sum_s[i], dy * sum_c[i]) * inv_dist;
				const float gi = g_main + gpen;
				grad_out[i] = gi;
				grad_norm2_out = fmaf(gi, gi, grad_norm2_out);
				++i;
			}

			// Gradient with respect to the second n entries (lengths), when present.
			if (cost.variableLen) {
				int j = 0;

#pragma loop ivdep
				while (j < n) {
					const float gi = fmaf(dx, c_arr[j], dy * s_arr[j]) * inv_dist;
					grad_out[n + j] = gi;
					grad_norm2_out = fmaf(gi, gi, grad_norm2_out);
					++j;
				}
			}
		};

		// ------------------------------------------------------------
		// Helper: shared Armijo backtracking along the direction dir.
		// Acceptance condition: f_new <= f_cur + c1 * alpha * (g^T d)
		// Reports through the `clipped` reference whether clamping to the bounds changed the trial point.
		// ------------------------------------------------------------
		auto armijoLineSearch = [&](const float* q_base,
			const float f_base,
			const float x_base, const float y_base,
			const float* grad_base,
			const float* dir,
			const float gtd,
			float& alpha_io,
			float* q_out,
			float& f_out,
			float& x_out,
			float& y_out,
			bool& clipped) -> bool {
			// Backtracking line search along dir, starting from step alpha_io and
			// multiplying the step by tau after every rejected trial, for at most
			// max_backtrack trials.
			// A trial q_out = clamp(q_base + alpha * dir) is accepted when
			//   cost(q_out) <= f_base + c1 * alpha * gtd.
			// On success: returns true, stores the accepted step in alpha_io and the
			// trial's cost and end point in f_out, x_out, y_out; q_out holds the point.
			// On failure: returns false; q_out holds the last rejected trial and the
			// other outputs are untouched.
			// `clipped` reports whether clamping changed the most recent trial point.
			// x_base, y_base and grad_base are not used here.
			float alpha = alpha_io;

			for (int backtrack = 0; backtrack < max_backtrack; ++backtrack, alpha *= tau) {
				// Unclamped trial point.
				int i = 0;

#pragma loop ivdep
				while (i < dim) {
					q_out[i] = fmaf(alpha, dir[i], q_base[i]);
					++i;
				}

				// Keep a copy from before clamping so a change can be detected.
				float q_before_clamp[64];
				memcpy(q_before_clamp, q_out, static_cast<size_t>(dim) * sizeof(float));

				clampPoint(q_out);

				clipped = false;
				i = 0;

#pragma loop ivdep
				while (i < dim) {
					if (fabsf(q_out[i] - q_before_clamp[i]) > 1e-12f) {
						clipped = true;
						break;
					}
					++i;
				}

				float x2, y2;
				const float f_try = cost(q_out, x2, y2);

				// NaN check only (NaN != NaN). An infinite cost is not rejected here;
				// +inf simply fails the Armijo test below.
				if (!(f_try == f_try))
					continue;

				// Armijo sufficient-decrease test.
				if (f_try <= f_base + c1 * alpha * gtd) {
					alpha_io = alpha;
					f_out = f_try;
					x_out = x2;
					y_out = y2;
					return true;
				}
			}

			return false;
		};

		// ------------------------------------------------------------
		// 1) Gradient descent.
		// 2) Once the relative improvement reaches lbfgs_trigger (0.6, i.e. 60%),
		//    switch to L-BFGS, falling back to gradient descent if L-BFGS fails.
		// ------------------------------------------------------------
		bool lbfgs_already_tried = false;

		int outer = 0;
		while (outer < max_outer_iters) {

			// ---- Градиент в текущей точке ----
			float grad[64];
			float grad_norm2 = 0.0f;
			computeGrad(q_local, x, y, grad, grad_norm2);

			if (!(grad_norm2 == grad_norm2) || grad_norm2 < 1e-12f) break;

			// ---- Направление GD: d = -g ----
			float dir_gd[64];
			int i = 0;

#pragma loop ivdep
while (i < dim) {
dir_gd[i] = -grad[i];
++i;
}

			// For d = -g: g^T d = -||g||^2
			const float gtd_gd = -grad_norm2;

			// ---- Line-search (Armijo) ----
			float eta_trial = eta;
			float f_new = f, x_new = x, y_new = y;
			bool clipped_gd = false;

			// пробная точка в q_try
			bool found = armijoLineSearch(q_local, f, x, y, grad, dir_gd, gtd_gd,
				eta_trial, q_try, f_new, x_new, y_new, clipped_gd);

			if (!found) break; // не удалось сделать шаг — выход

			// Шаг принят
			memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
			f = f_new;
			x = x_new;
			y = y_new;
			eta = eta_trial; // запоминаем удачный шаг

			// --------------------------------------------------------
			// Триггер: относительное улучшение от f_start
			// improvement = (f_start - f) / f_start
			// --------------------------------------------------------
			const float denom = fmaxf(f_start, 1e-12f);
			const float rel_impr = (f_start - f) / denom;

			// Запускаем L-BFGS при достижении порога (без блокировки по клиппингу)
			if (!lbfgs_already_tried && rel_impr >= lbfgs_trigger) {
				lbfgs_already_tried = true;

				// --- Сохраняем точку, чтобы уметь вернуться в GD ---
				float q_resume[64];
				memcpy(q_resume, q_local, static_cast<size_t>(dim) * sizeof(float));
				float f_resume = f;
				float x_resume = x;
				float y_resume = y;
				float eta_resume = eta;

				// --- Лучшая точка, найденная внутри L-BFGS ---
				float q_best_lbfgs[64];
				memcpy(q_best_lbfgs, q_local, static_cast<size_t>(dim) * sizeof(float));
				float f_best_lbfgs = f;
				float x_best_lbfgs = x;
				float y_best_lbfgs = y;

				// --- История L-BFGS ---
				float s_hist[m_lbfgs][64];
				float y_hist[m_lbfgs][64];
				float rho_hist[m_lbfgs];
				float alpha_hist[m_lbfgs];
				int hist_size = 0;

				// --- Текущий градиент для L-BFGS ---
				float gk[64];
				float gk_norm2 = 0.0f;
				computeGrad(q_local, x, y, gk, gk_norm2);

				bool lbfgs_ok = true;

				// стартовый alpha для L-BFGS (можно взять eta, но чаще лучше 1.0)
				float alpha_k = 1.0f;

				int it = 0;
				while (it < max_lbfgs_iters) {
					if (!(gk_norm2 == gk_norm2) || gk_norm2 < 1e-12f) break;

					// ---- Считаем направление L-BFGS ----
					float dir[64];

					if (hist_size == 0) {
						int d = 0;

#pragma loop ivdep
while (d < dim) {
dir[d] = -gk[d];
++d;
}
}
else {
// two-loop recursion
float q_vec[64];
int d = 0;
#pragma loop ivdep
while (d < dim) {
q_vec[d] = gk[d];
++d;
}

						// First pass of the two-loop recursion: newest history entry to oldest.
						for (int jj = hist_size - 1; jj >= 0; --jj) {
							float dot_sq = 0.0f;
							d = 0;

#pragma loop ivdep
while (d < dim) {
dot_sq = fmaf(s_hist[jj][d], q_vec[d], dot_sq);
++d;
}
const float a = dot_sq * rho_hist[jj];
alpha_hist[jj] = a;

							d = 0;

#pragma loop ivdep
while (d < dim) {
q_vec[d] = fmaf(-a, y_hist[jj][d], q_vec[d]);
++d;
}
}

						// gamma = (s_{k-1}^T y_{k-1}) / (y_{k-1}^T y_{k-1})
						float gamma = 1.0f;
						{
							const int last = hist_size - 1;
							float yy = 0.0f;
							d = 0;

#pragma loop ivdep
while (d < dim) {
yy = fmaf(y_hist[last][d], y_hist[last][d], yy);
++d;
}
// ys = 1/rho
const float ys = 1.0f / rho_hist[last];
if (yy > 0.0f) gamma = ys / yy;
}

						// Initial estimate for the second pass: r = gamma * q.
						float r_vec[64];
						d = 0;

#pragma loop ivdep
while (d < dim) {
r_vec[d] = gamma * q_vec[d];
++d;
}

						// Second pass of the two-loop recursion: oldest history entry to newest.
						for (int jj = 0; jj < hist_size; ++jj) {
							float dot_yr = 0.0f;
							d = 0;

#pragma loop ivdep
while (d < dim) {
dot_yr = fmaf(y_hist[jj][d], r_vec[d], dot_yr);
++d;
}
const float b = dot_yr * rho_hist[jj];
const float coeff = alpha_hist[jj] - b;

							d = 0;

#pragma loop ivdep
while (d < dim) {
r_vec[d] = fmaf(coeff, s_hist[jj][d], r_vec[d]);
++d;
}
}

						d = 0;

#pragma loop ivdep
while (d < dim) {
dir[d] = -r_vec[d];
++d;
}
}

					// ---- Check that the direction really is a descent direction ----
					float gtd = 0.0f;
					int d = 0;

#pragma loop ivdep
while (d < dim) {
gtd = fmaf(gk[d], dir[d], gtd);
++d;
}
if (!(gtd == gtd) || gtd >= -eps_descent) {
// fallback на -grad
d = 0;
#pragma loop ivdep
while (d < dim) {
dir[d] = -gk[d];
++d;
}
gtd = -gk_norm2;
if (gtd >= -eps_descent) { lbfgs_ok = false; break; }
}

					// ---- Save the previous point and gradient for the s, y update ----
					float q_old[64];
					memcpy(q_old, q_local, static_cast<size_t>(dim) * sizeof(float));
					float g_old[64];
					memcpy(g_old, gk, static_cast<size_t>(dim) * sizeof(float));
					const float f_old = f;
					const float x_old = x;
					const float y_old = y;

					// ---- Line-search Armijo по L-BFGS направлению ----
					float alpha_try = alpha_k;
					float f_try, x_try, y_try;
					bool clipped_lbfgs = false;

					bool step_ok = armijoLineSearch(q_local, f, x, y, gk, dir, gtd,
						alpha_try, q_try, f_try, x_try, y_try, clipped_lbfgs);

					if (!step_ok) { lbfgs_ok = false; break; }

					// ---- Принимаем шаг ----
					memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
					f = f_try;
					x = x_try;
					y = y_try;
					alpha_k = alpha_try;

					// ---- Новый градиент ----
					computeGrad(q_local, x, y, gk, gk_norm2);
					if (!(gk_norm2 == gk_norm2)) { lbfgs_ok = false; break; }

					// ---- Обновляем лучшую точку внутри L-BFGS ----
					if (f < f_best_lbfgs) {
						f_best_lbfgs = f;
						memcpy(q_best_lbfgs, q_local, static_cast<size_t>(dim) * sizeof(float));
						x_best_lbfgs = x;
						y_best_lbfgs = y;
					}

					// ---- s = x_{k+1}-x_k, y = g_{k+1}-g_k ----
					float ys = 0.0f;
					float s_new[64];
					float y_new[64];

					d = 0;

#pragma loop ivdep
while (d < dim) {
const float sd = q_local[d] - q_old[d];
const float yd = gk[d] - g_old[d];
s_new[d] = sd;
y_new[d] = yd;
ys = fmaf(yd, sd, ys);
++d;
}

					// Store the (s, y) pair only when the curvature y^T s exceeds eps_lbfgs_curv.
					if (!(ys == ys) || ys <= eps_lbfgs_curv) {
						// плохая кривизна -> сброс истории
						hist_size = 0;
					}
					else {
						// пушим в историю (сдвигом; m маленькое)
						if (hist_size < m_lbfgs) {
							const int idx = hist_size;
							rho_hist[idx] = 1.0f / ys;
							d = 0;

#pragma loop ivdep
while (d < dim) {
s_hist[idx][d] = s_new[d];
y_hist[idx][d] = y_new[d];
++d;
}
++hist_size;
}
else {
// shift 0..m-2 <- 1..m-1
for (int jj = 0; jj < m_lbfgs - 1; ++jj) {
rho_hist[jj] = rho_hist[jj + 1];
d = 0;
#pragma loop ivdep
while (d < dim) {
s_hist[jj][d] = s_hist[jj + 1][d];
y_hist[jj][d] = y_hist[jj + 1][d];
++d;
}
}
const int idx = m_lbfgs - 1;
rho_hist[idx] = 1.0f / ys;
d = 0;
#pragma loop ivdep
while (d < dim) {
s_hist[idx][d] = s_new[d];
y_hist[idx][d] = y_new[d];
++d;
}
hist_size = m_lbfgs;
}
}

					// ---- Exit criterion ----
					if (gk_norm2 < 1e-12f) break;

					++it;
				}

				if (lbfgs_ok) {
					// L-BFGS успешно: фиксируем лучшую точку из L-BFGS и завершаем доводку
					memcpy(q_local, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float));
					f = f_best_lbfgs;
					x = x_best_lbfgs;
					y = y_best_lbfgs;
					break; // выходим из GD-цикла: L-BFGS — финальная стадия
				}
				else {
					// L-BFGS провалился: откатываемся (но если L-BFGS успел найти лучше — берем лучше)
					if (f_best_lbfgs < f_resume) {
						memcpy(q_local, q_best_lbfgs, static_cast<size_t>(dim) * sizeof(float));
						f = f_best_lbfgs;
						x = x_best_lbfgs;
						y = y_best_lbfgs;
					}
					else {
						memcpy(q_local, q_resume, static_cast<size_t>(dim) * sizeof(float));
						f = f_resume;
						x = x_resume;
						y = y_resume;
					}

					// возвращаем шаг GD и продолжаем GD
					eta = eta_resume;
				}
			}

			++outer;
		}

		// Обновляем лучшую точку
		if (f < bestF) {
			bestF = f;
			bestQ.assign(q_local, q_local + dim);
			bestX = x;
			bestY = y;
			no_improve = 0;
		}
		else {
			++no_improve;
		}
	}
	return f;
	};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
	const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
	const float t_seed = map.pointToT(s);
	const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
	const float t1 = fmaf(-interval_size, 1.0f, t_seed);
	const float t2 = fmaf(interval_size, 1.0f, t_seed);
	alignas(16) float q1[32];
	alignas(16) float q2[32];
	float x1;
	float y1;
	float x2;
	float y2;
	map.map01ToPoint(t1, q1);
	const float f1 = cost(q1, x1, y1);
	map.map01ToPoint(t2, q2);
	const float f2 = cost(q2, x2, y2);
	IntervalND* I = new IntervalND(t1, t2, f1, f2);
	I->i1 = t_to_idx(t1);
	I->i2 = t_to_idx(t2);
	I->diam = map.block_diameter(I->i1, I->i2);
	I->compute_span_level(map);
	I->set_metric(I->diam);
	update_pockets_and_Mmax(I);
	I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
	if (i < 3)
		I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
	else {
		const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
		const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
		const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
		I->R = fmaf(I->R, mult, 0.0f);
	}
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	if (f1 < bestF) {
		bestF = f1;
		bestQ.assign(q1, q1 + dim);
		bestX = x1;
		bestY = y1;
	}
	if (f2 < bestF) {
		bestF = f2;
		bestQ.assign(q2, q2 + dim);
		bestX = x2;
		bestY = y2;
	}
	++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
	const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
	const float f = evalAt(t);
	IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
	I->i1 = t_to_idx(prev_t);
	I->i2 = t_to_idx(t);
	I->diam = map.block_diameter(I->i1, I->i2);
	I->compute_span_level(map);
	I->set_metric(I->diam);
	update_pockets_and_Mmax(I);
	I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	prev_t = t;
	prev_f = f;
	++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int noImproveThrDim = static_cast<int>(fmaf(7.5f, exp2f(-0.1f * sqrtf(dim_f)), 0.0f));
while (true) {
	boost::mpi::wait_all(pending_requests->begin(), pending_requests->end());
	p = fmaf(-1.0f / initial_len, dmax, 1.0f);
	stag_r_multiplier = 1.4f - 1.1f * fmaf(0.65f * p - 0.45f,
		fmaf(0.65f * p - 0.45f,
			fmaf(0.65f * p - 0.45f,
				fmaf(0.65f * p - 0.45f,
					fmaf(0.65f * p - 0.45f, 0.164056f, -0.098462f),
					0.240884f),
				-0.351834f),
			0.999996f),
		0.65f * p - 0.45f);
	const float p_arg = fmaf(p, 2.3f, -2.9775f);

	float current_r = r;
	if (stag_boost_remaining > 0) {
		current_r = r * stag_r_multiplier;
		stag_boost_remaining--;
	}

	// ----- Проверка стагнации с учётом нормы градиента -----
	const float grad_threshold = 0.5e-1f; // порог, как для переключения на L-BFGS

	float grad_norm2_best = 0.0f;
	if (no_improve > 0) {
		// Вычисляем градиент в текущей лучшей точке bestQ
		float acc_best = 0.0f;
		float phi_best[32];
		int ii_best = 0;

#pragma loop ivdep
while (ii_best < n) {
acc_best = fmaf(bestQ[ii_best], 1.0f, acc_best);
phi_best[ii_best] = acc_best;
++ii_best;
}
float s_best[32], c_best[32];
FABE13_SINCOS(phi_best, s_best, c_best, n);

		// Suffix sums of the length-weighted sines and cosines, accumulated from the last index down.
		float as_best = 0.0f, ac_best = 0.0f;
		float sum_s_best[32], sum_c_best[32];
		int k_best = n - 1;
		while (k_best >= 0) {
			const float Lk = cost.variableLen ? bestQ[n + k_best] : 1.0f;
			as_best = fmaf(Lk, s_best[k_best], as_best);
			ac_best = fmaf(Lk, c_best[k_best], ac_best);
			sum_s_best[k_best] = as_best;
			sum_c_best[k_best] = ac_best;
			--k_best;
		}

		const float dx_best = fmaf(bestX, 1.0f, -cost.targetX);
		const float dy_best = fmaf(bestY, 1.0f, -cost.targetY);
		const float dist_best = sqrtf(fmaf(dx_best, dx_best, dy_best * dy_best));
		const float inv_dist_best = 1.0f / dist_best;

		int i_best = 0;

#pragma loop ivdep
while (i_best < n) {
float gpen_best = 0.0f;
// Производная штрафа за превышение minTheta
{
const float ai = fabsf(bestQ[i_best]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / cost.minTheta, v * 0.69314718f, 0.0f);
const float exp_val = fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg,
fmaf(scale_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float dpen = fmaf(cost.sharpW, exp_val * (1.38629436f / cost.minTheta), 0.0f);
gpen_best = fmaf(dpen, copysignf(1.0f, bestQ[i_best]), gpen_best);
}
}
// Производная арк-штрафа
{
const float tsg = fmaf(-bestQ[i_best], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg,
fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
0.16666667f),
0.5f),
1.0f);
const float sig = 1.0f / (exp_val + 1.0f);
gpen_best = fmaf(-cost.archBiasW * cost.archBiasK, sig, gpen_best);
}

			// Combine the target-distance term with the penalty term and accumulate the squared gradient norm.
			const float g_main_best = fmaf(dx_best, -sum_s_best[i_best], dy_best * sum_c_best[i_best]) * inv_dist_best;
			float gi_best = g_main_best + gpen_best;
			grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best);
			++i_best;
		}
		if (cost.variableLen) {
			int j_best = 0;

#pragma loop ivdep
while (j_best < n) {
const float gi_best = fmaf(dx_best, c_best[j_best], dy_best * s_best[j_best]) * inv_dist_best;
grad_norm2_best = fmaf(gi_best, gi_best, grad_norm2_best);
++j_best;
}
}
}

	// Modified stagnation condition: enough non-improving evaluations AND a small gradient norm at the best point.
	const bool stagnation = (no_improve > noImproveThrDim) && (grad_norm2_best < grad_threshold);

	const float r_eff = dim > 2 ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), current_r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, current_r, 0.0f);
	if (stagnation) {

		// Устанавливаем усиление на несколько итераций
		stag_boost_remaining = n_stag_iters;

		// Определяем количество разведывательных сидов по формуле из текущего кода
		const int num_ik = 1 + static_cast<int>(sqrtf(dim_f));

		// ---- Оценка вытянутости конфигурации ----
		float dist_to_target = sqrtf(cost.targetX * cost.targetX + cost.targetY * cost.targetY);
		float max_reach = 0.0f;
		if (cost.variableLen) {
			for (int i = 0; i < n; ++i) max_reach += map.high[n + i];
		}
		else {
			max_reach = static_cast<float>(n);
		}
		float ratio = dist_to_target / max_reach;
		bool prefer_extended = (ratio > 0.7f);
		bool prefer_compact = (ratio < 0.4f);
		bool use_ik = !(ratio > 0.4f && ratio < 0.7f);

		// Массивы для хранения сидов
		float t_seeds[32];
		int seed_count = 0;

		if (!use_ik) {
			// В неопределённом случае используем Sobol-сиды
			float temp_S[32 * 32];
			int sobol_gen = generate_sobol_seeds(map, dim, temp_S, 32, seed + it);
			int num_sobol = num_ik; // берём столько же, сколько планировали IK
			for (int k = 0; k < num_sobol && k < sobol_gen; ++k) {
				const float* s = temp_S + k * 32;
				float t_s = map.pointToT(s);
				t_seeds[seed_count++] = t_s;
			}
		}
		else {
			// Используем IK-сиды
			// Базовые решения вычисляем так же, но возможно только одно из них
			float angles_ccd[32] = { 0 };
			float lengths_ccd[32];
			if (cost.variableLen) {
				float len_low = map.low[n];
				float len_high = map.high[n];
				float avg_len = (len_low + len_high) * 0.5f;
				for (int i = 0; i < n; ++i) lengths_ccd[i] = avg_len;
			}
			else {
				for (int i = 0; i < n; ++i) lengths_ccd[i] = 1.0f;
			}
			ccd_ik(cost.targetX, cost.targetY, lengths_ccd, n, angles_ccd, 10);

			float angles_fabrik[32] = { 0 };
			float lengths_fabrik[32];
			if (cost.variableLen) {
				for (int i = 0; i < n; ++i) lengths_fabrik[i] = lengths_ccd[i];
			}
			else {
				for (int i = 0; i < n; ++i) lengths_fabrik[i] = 1.0f;
			}
			float targetX_fab = cost.targetX;
			float targetY_fab = cost.targetY;
			for (int iter_fab = 0; iter_fab < 3; ++iter_fab) {
				float prevX = targetX_fab;
				float prevY = targetY_fab;
				for (int j = n - 1; j >= 0; --j) {
					float len = lengths_fabrik[j];
					float angle_to_target = atan2f(prevY, prevX);
					angles_fabrik[j] = angle_to_target;
					float s_val, c_val;
					FABE13_SINCOS(&angle_to_target, &s_val, &c_val, 1);
					prevX = prevX - len * c_val;
					prevY = prevY - len * s_val;
				}
			}

			if (prefer_extended) {
				// --- Чистый CCD ---
				{
					float q_ccd[32];
					for (int i = 0; i < n; ++i) q_ccd[i] = angles_ccd[i];
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) q_ccd[n + i] = lengths_ccd[i];
					}
					float t_ccd = map.pointToT(q_ccd);
					t_seeds[seed_count++] = t_ccd;
				}

				// --- Остальные сиды с шумом на основе CCD ---
				unsigned st_ik = seed + it + 222;
				int remaining = num_ik - 1;
				for (int v = 0; v < remaining; ++v) {
					float noisy_angles[32];
					float noisy_lengths[32];
					for (int i = 0; i < n; ++i) {
						st_ik ^= st_ik << 13;
						st_ik ^= st_ik >> 17;
						st_ik ^= st_ik << 5;
						float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f;
						noisy_angles[i] = angles_ccd[i] + (2.0f * rnd - 1.0f) * 0.1f;
						const float lo = (i == 0) ? -1.0471975511965976f : -2.6179938779914944f;
						const float hi = 2.6179938779914944f;
						if (noisy_angles[i] < lo) noisy_angles[i] = lo;
						if (noisy_angles[i] > hi) noisy_angles[i] = hi;
					}
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) {
							st_ik ^= st_ik << 13;
							st_ik ^= st_ik >> 17;
							st_ik ^= st_ik << 5;
							float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f;
							noisy_lengths[i] = lengths_ccd[i] + (2.0f * rnd - 1.0f) * 0.05f;
							if (noisy_lengths[i] < 0.5f) noisy_lengths[i] = 0.5f;
							if (noisy_lengths[i] > 2.0f) noisy_lengths[i] = 2.0f;
						}
					}
					float q_temp[32];
					for (int i = 0; i < n; ++i) q_temp[i] = noisy_angles[i];
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) q_temp[n + i] = noisy_lengths[i];
					}
					float t_temp = map.pointToT(q_temp);
					t_seeds[seed_count++] = t_temp;
				}
			}
			else if (prefer_compact) {
				// --- Чистый FABRIK ---
				{
					float q_fabrik[32];
					for (int i = 0; i < n; ++i) q_fabrik[i] = angles_fabrik[i];
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) q_fabrik[n + i] = lengths_fabrik[i];
					}
					float t_fabrik = map.pointToT(q_fabrik);
					t_seeds[seed_count++] = t_fabrik;
				}

				// --- Остальные сиды с шумом на основе FABRIK ---
				unsigned st_ik = seed + it + 222;
				int remaining = num_ik - 1;
				for (int v = 0; v < remaining; ++v) {
					float noisy_angles[32];
					float noisy_lengths[32];
					for (int i = 0; i < n; ++i) {
						st_ik ^= st_ik << 13;
						st_ik ^= st_ik >> 17;
						st_ik ^= st_ik << 5;
						float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f;
						noisy_angles[i] = angles_fabrik[i] + (2.0f * rnd - 1.0f) * 0.1f;
						const float lo = (i == 0) ? -1.0471975511965976f : -2.6179938779914944f;
						const float hi = 2.6179938779914944f;
						if (noisy_angles[i] < lo) noisy_angles[i] = lo;
						if (noisy_angles[i] > hi) noisy_angles[i] = hi;
					}
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) {
							st_ik ^= st_ik << 13;
							st_ik ^= st_ik >> 17;
							st_ik ^= st_ik << 5;
							float rnd = static_cast<float>(st_ik & 0xFFFFFF) * 5.9604645e-8f;
							noisy_lengths[i] = lengths_fabrik[i] + (2.0f * rnd - 1.0f) * 0.05f;
							if (noisy_lengths[i] < 0.5f) noisy_lengths[i] = 0.5f;
							if (noisy_lengths[i] > 2.0f) noisy_lengths[i] = 2.0f;
						}
					}
					float q_temp[32];
					for (int i = 0; i < n; ++i) q_temp[i] = noisy_angles[i];
					if (cost.variableLen) {
						for (int i = 0; i < n; ++i) q_temp[n + i] = noisy_lengths[i];
					}
					float t_temp = map.pointToT(q_temp);
					t_seeds[seed_count++] = t_temp;
				}
			}
		}

		// Теперь у нас есть seed_count сидов (должно быть равно num_ik)
		// Проведём локальную оптимизацию для каждого
		float optimized_points[32][32];
		float optimized_f[32];
		float optimized_t[32];
		for (int s = 0; s < seed_count; ++s) {
			float t_cur = t_seeds[s];
			float f_opt = evalAt(t_cur); // запускает локальную доводку, обновляет q_local, x, y
			memcpy(optimized_points[s], q_local, dim * sizeof(float));
			optimized_f[s] = f_opt;
			optimized_t[s] = map.pointToT(q_local);
		}

		// Создаём один интервал вокруг каждой оптимизированной точки
		for (int s = 0; s < seed_count; ++s) {
			float t_opt = optimized_t[s];
			const float interval_size = fmaf(0.00031f, static_cast<float>(dim), 0.0f);
			float t1 = t_opt - interval_size * 0.5f;
			float t2 = t_opt + interval_size * 0.5f;
			if (t1 < 0.0f) t1 = 0.0f;
			if (t2 > 1.0f) t2 = 1.0f;

			float q1[32], q2[32];
			float x1, y1, x2, y2;
			map.map01ToPoint(t1, q1);
			float f1 = cost(q1, x1, y1);
			map.map01ToPoint(t2, q2);
			float f2 = cost(q2, x2, y2);

			IntervalND* I = new IntervalND(t1, t2, f1, f2);

			I->i1 = t_to_idx(t1);
			I->i2 = t_to_idx(t2);
			I->diam = map.block_diameter(I->i1, I->i2);
			I->compute_span_level(map);
			I->set_metric(I->diam);
			update_pockets_and_Mmax(I);
			I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));

			const float boost = fmaf(0.01f, dim_f, 0.85f);
			I->R = fmaf(I->R, boost, 0.0f);

			H.emplace_back(I);
			std::push_heap(H.begin(), H.end(), ComparePtrND);
		}
		// Сбрасываем счётчик стагнации
		no_improve = 0;
	}
	const float exp_arg = fmaf(B_dim, p, 0.0f);
	const float exp_arg__ = fmaf(B_dim__, p, 0.0f);
	adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
	float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f));
	float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f));
	float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f));
	float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f));
	float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)), xx = p * p, tt = fmaf(500.0f, p, -486.95472f);
	float adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f);
	adaptive_coeff__ = fmaf(fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, 2.0f - A_dim__);
	const float arg_for_T = fmaf(p, 4.5f, 2.475f);
	const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
	const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
	const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
	const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.88f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
	std::pop_heap(H.begin(), H.end(), ComparePtrND);
	IntervalND* cur = H.back();
	H.pop_back();
	const float x1 = cur->x1;
	const float x2 = cur->x2;
	const float y1 = cur->y1;
	const float y2 = cur->y2;
	float m = fmaf(r_eff, Mmax, 0.0f);
	float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
	const float bestFOld = bestF;
	const float fNew = evalAt(tNew);
	IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
	IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
	L->i1 = t_to_idx(x1);
	L->i2 = t_to_idx(tNew);
	Rv->i1 = t_to_idx(tNew);
	Rv->i2 = t_to_idx(x2);
	L->diam = map.block_diameter(L->i1, L->i2);
	Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
	L->compute_span_level(map);
	Rv->compute_span_level(map);
	L->set_metric(L->diam);
	Rv->set_metric(Rv->diam);
	const float Mloc = fmaxf(L->M, Rv->M);
	update_pockets_and_Mmax(L);
	update_pockets_and_Mmax(Rv);
	const float prevMmax = Mmax;
	if (Mloc > Mmax)
		Mmax = Mloc;
	m = fmaf(r_eff, Mmax, 0.0f);
	if (adaptive) {
		const float len1 = fmaf(tNew, 1.0f, -x1);
		const float len2 = fmaf(x2, 1.0f, -tNew);
		if (fmaf(len1, 1.0f, len2) == dmax) {
			dmax = fmaxf(len1, len2);
			for (auto pI : H) {
				const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
				if (Ls > dmax)
					dmax = Ls;
			}
		}
		if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
			const float alpha = p * p;
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r_eff, -MULT);
			const float GF = beta * global_coeff;
			L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
			Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
			const size_t sz = H.size();
			RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtrND);
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
				if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
					const size_t sz = H.size();
					RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (Mloc > prevMmax) {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
			if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
				const size_t sz = H.size();
				RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
		}
		else {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
		}
	}
	H.emplace_back(L);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	H.emplace_back(Rv);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	_mm_prefetch((const char*)H[0], _MM_HINT_T0);
	_mm_prefetch((const char*)H[1], _MM_HINT_T0);
	IntervalND* const top = H.front();
	const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
	if (/*(dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || */it == maxIter - 1) {
		memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));
		float x_final = bestX;
		float y_final = bestY;
		float f_final = bestF;
		const int last = n - 1;
		const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
		const float hi = 2.6179938779914943653855361527329f;
		float bestLocF = f_final;
		float saved = q_local[last];
		float delta = 0.05f;
		while (delta >= 0.00625f) {
			int sgn = -1;
			while (sgn < 2) {
				float cand = fmaf(static_cast<float>(sgn), delta, saved);
				if (cand < lo) cand = lo;
				else if (cand > hi) cand = hi;
				const float backup = q_local[last];
				q_local[last] = cand;
				float x2;
				float y2;
				const float f2 = cost(q_local, x2, y2);
				if (f2 < bestLocF) {
					bestLocF = f2;
					x_final = x2;
					y_final = y2;
					saved = cand;
				}
				q_local[last] = backup;
				sgn += 2;
			}
			delta *= 0.5f;
		}
		if (bestLocF < f_final) {
			q_local[last] = saved;
			f_final = bestLocF;
			bestF = f_final;
			bestX = x_final;
			bestY = y_final;
			bestQ.assign(q_local, q_local + dim);
		}
		out_iterations = static_cast<size_t>(it);
		out_achieved_epsilon = interval_len;
		return;
	}
	if (bestF < fmaf(bestFOld, adaptive_coeff__, 0.0f)) {
		BestPointMsg out;
		out.bestF = bestF;
		out.bestX = bestX;
		out.bestY = bestY;
		out.dim = static_cast<unsigned>(dim);
		memcpy(out.bestQ, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));
		IntervalND* I0 = H[0];
		IntervalND* I1 = H[1];
		map.map01ToPoint(I0->x1, out.q1_left);
		map.map01ToPoint(I0->x2, out.q1_right);
		out.f1_left = I0->y1;
		out.f1_right = I0->y2;
		map.map01ToPoint(I1->x1, out.q2_left);
		map.map01ToPoint(I1->x2, out.q2_right);
		out.f2_left = I1->y1;
		out.f2_right = I1->y2;
		const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
		bool active = true;
		const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
		size_t ii2 = 0u;
		while (ii2 < iterations && active) {
			const size_t step = 1ULL << ii2;
			const int partner = rank ^ static_cast<int>(step);
			if (partner < world) {
				const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
				if (am_sender) {
					pending_requests->push_back(g_world->isend(partner, 2, out));
					active = false;
				}
			}
			++ii2;
		}
		++exchange_counter;
	}
	while (g_world->iprobe(boost::mpi::any_source, 2)) {
		BestPointMsg bm;
		g_world->recv(boost::mpi::any_source, 2, bm);

		if (bm.bestF < fmaf(bestF, adaptive_coeff__, 0.0f) || stagnation) {
			_mm_prefetch((const char*)H[0], _MM_HINT_T0);
			_mm_prefetch((const char*)H[1], _MM_HINT_T0);

			// Обновляем лучшую точку, если она лучше
			if (bm.bestF < bestF) {
				bestF = bm.bestF;
				bestX = bm.bestX;
				bestY = bm.bestY;
				bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
			}

			// Предрасчёт коэффициентов (как в оригинале)
			const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f),
				fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f),
					fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f),
						fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f),
							0.00833333377f, 0.0416666679f),
						0.16666667f),
					0.5f),
				1.0f) - 1.0f;
			const float kf = stagnation ? fmaf(0.5819767068693265f, poly, 0.4f)
				: fmaf(0.3891860241215959f, poly, 0.5f);
			float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
			const float exp_arg = B_dim;
			float adaptive_coeff_clone1 = fmaf(-fmaf(exp_arg,
				fmaf(exp_arg,
					fmaf(exp_arg,
						fmaf(exp_arg, 0.00833333377f, 0.0416666679f),
						0.16666667f),
					0.5f),
				1.0f),
				adaptive_coeff_addition, A_dim_clone);

			{
				float t1 = map.pointToT(bm.q1_left);
				float t2 = map.pointToT(bm.q1_right);
				IntervalND* I = new IntervalND(t1, t2, bm.f1_left, bm.f1_right);
				I->i1 = t_to_idx(t1);
				I->i2 = t_to_idx(t2);
				I->diam = map.block_diameter(I->i1, I->i2);
				I->compute_span_level(map);
				I->set_metric(I->diam);
				update_pockets_and_Mmax(I);
				I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone0, 0.0f), 0.0f);
				H.emplace_back(I);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}

			{
				float t1 = map.pointToT(bm.q2_left);
				float t2 = map.pointToT(bm.q2_right);
				IntervalND* I = new IntervalND(t1, t2, bm.f2_left, bm.f2_right);
				I->i1 = t_to_idx(t1);
				I->i2 = t_to_idx(t2);
				I->diam = map.block_diameter(I->i1, I->i2);
				I->compute_span_level(map);
				I->set_metric(I->diam);
				update_pockets_and_Mmax(I);
				I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone1, 0.0f), 0.0f);
				H.emplace_back(I);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
		}
	}
	++it;
}

}" - код сейчас, до этого была другая пересылка: "if (bestF < fmaf(bestFOld, adaptive_coeff__, 0.0f)) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
{
IntervalND* const I0 = H[0];
out.bestI[0] = I0->x1;
out.bestI[1] = I0->x2;
out.bestI[2] = I0->y1;
out.bestI[3] = I0->y2;
IntervalND* const I1 = H[1];
out.bestI[4] = I1->x1;
out.bestI[5] = I1->x2;
out.bestI[6] = I1->y1;
out.bestI[7] = I1->y2;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0u;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
pending_requests->push_back(g_world->isend(partner, 2, out));
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, adaptive_coeff__, 0.0f) || stagnation) {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = stagnation ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
const float exp_arg = B_dim;
float adaptive_coeff_clone1 = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
{
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone0, 0.0f), 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
{
const float sx = bm.bestI[4];
const float ex = bm.bestI[5];
const float y1i = bm.bestI[6];
const float y2i = bm.bestI[7];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone1, 0.0f), 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}", при этом структуры для MPI: "__declspec(align(16)) struct CrossMsg final {
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;

template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& s_x1& s_x2& e_x1& e_x2& Rtop;
}

};

// Message carrying a fixed 35-float payload (`intervals`) and an unsigned
// `count`. The serialize() member hands both fields to the archive in
// declaration order.
// NOTE(review): how `intervals` is packed and what `count` counts is decided by
// the code that fills this message, which is not visible here - confirm there.
__declspec(align(16)) struct MultiCrossMsg final {
	float intervals[35];
	unsigned count;

	// Archive hook; the version argument is unused.
	template <typename Archive>
	__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
		ar& intervals& count;
	}
};

// Message used by the earlier exchange scheme: the sender's best solution plus
// the two intervals at the front of its heap, described by curve parameters.
//   bestF, bestX, bestY - best objective value and the two extra outputs stored
//                         alongside it by the sender.
//   bestQ               - best point; only the first `dim` entries are valid.
//   bestI               - {x1, x2, y1, y2} of heap entry 0 followed by
//                         {x1, x2, y1, y2} of heap entry 1.
//   dim                 - number of valid entries in bestQ.
__declspec(align(16)) struct BestSolutionMsg final {
	float bestF;
	float bestX;
	float bestY;
	float bestQ[32];
	float bestI[8];
	unsigned dim;

	// Archive hook: streams every field in declaration order; the version
	// argument is unused.
	template <typename Archive>
	__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
		ar& bestF& bestX& bestY& bestQ& bestI& dim;
	}
};

// New structure for sending the best solution together with two intervals whose endpoints are given as points.
__declspec(align(16)) struct BestPointMsg {
float bestF;
float bestX;
float bestY;
float bestQ[32];
unsigned dim;
float f1_left, f1_right;
float q1_left[32];
float q1_right[32];
float f2_left, f2_right;
float q2_left[32];
float q2_right[32];

template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& bestF& bestX& bestY& bestQ& dim
		& f1_left& f1_right& q1_left& q1_right
		& f2_left& f2_right& q2_left& q2_right;
}

};", код для Мортон-кодирования: "__declspec(align(16)) struct MortonCachePerRank final sealed{
std::vector<int> permCache;
std::vector<unsigned long long> invMaskCache;
unsigned baseSeed;
};

// One MortonCachePerRank instance per thread (thread_local storage).
// NOTE(review): MortonND's constructor takes a MortonCachePerRank by const
// reference; presumably this is the object passed to it - confirm at the call site.
static thread_local MortonCachePerRank g_mc;

// Converts a binary index to reflected Gray code: every output bit is the XOR
// of the matching input bit with the next more significant input bit.
static __declspec(noalias) __forceinline unsigned long long gray_encode(unsigned long long x) noexcept {
	const unsigned long long upper_neighbours = x >> 1;
	return upper_neighbours ^ x;
}

// Converts a reflected Gray code back to the binary index by folding the value
// onto itself with halving shifts (32, 16, 8, 4, 2, 1), which leaves every bit
// equal to the XOR of itself and all more significant bits. The result is
// returned as a signed 64-bit value.
static __declspec(noalias) __forceinline long long gray_decode(unsigned long long g) noexcept {
	for (unsigned shift = 32u; shift != 0u; shift >>= 1) {
		g ^= g >> shift;
	}
	return static_cast<long long>(g);
}

// Mapping between a scalar curve parameter t and a point of a dim-dimensional
// box, built from bit-interleaved (Morton) cell indices that are Gray-coded per
// chunk. Each coordinate is quantised to `levels` bits; because the interleaved
// index of one chunk must fit in 64 bits, the per-coordinate bits are split
// into `chunks` groups of at most `eff_levels` bits each.
// The fixed 32-entry scratch arrays in map01ToPoint/pointToT require dim <= 32.
__declspec(align(16)) struct MortonND final sealed {
	const int dim;                                   // number of coordinates
	const int levels;                                // bits of resolution per coordinate
	const int eff_levels;                            // max(1, 63 / dim): per-coordinate bits per chunk
	const int extra_levels;                          // levels beyond eff_levels, 0 if one chunk suffices
	const int chunks;                                // number of chunks
	std::vector<int> chunk_bits;                     // per-coordinate bits handled by each chunk
	std::vector<unsigned long long> chunk_bases;     // 2^(dim * chunk_bits[c]) for each chunk c
	unsigned long long scale;                        // 2^(dim * chunk_bits[0])
	std::vector<float> low;                          // lower bounds, copied from the constructor argument
	std::vector<float> high;                         // upper bounds, copied from the constructor argument
	std::vector<float> step;                         // (high - low) / 2^levels
	std::vector<float> invStep;                      // 1 / step
	std::vector<float> baseOff;                      // low + step / 2
	std::vector<int> perm;                           // first dim entries of the cache's permCache
	std::vector<unsigned long long> invMask;         // first dim entries of the cache's invMaskCache
	std::vector<unsigned long long> pextMask;        // interleave masks of chunk 0, one per slot
	std::vector<unsigned long long> pextMaskChunks;  // interleave masks of every chunk, indexed [c * dim + d]
	const float invScaleLevel;                       // 1 / 2^levels
	const bool use_gray;                             // set to true by the constructor

	// Builds the mask selecting Lc bits spaced `dim` apart, starting at bit d.
	static __declspec(noalias) __forceinline unsigned long long make_mask(int dim, int Lc, int d) noexcept {
		unsigned long long m = 0ull;
		unsigned long long bitpos = static_cast<unsigned long long>(d);
		int b = 0;
#pragma loop ivdep
		while (b < Lc) {
			m |= 1ull << bitpos;
			bitpos += static_cast<unsigned long long>(dim);
			++b;
		}
		return m;
	}

	// D     - number of coordinates.
	// L     - bits of resolution per coordinate.
	// lows  - D lower bounds; highs - D upper bounds.
	// mc    - cache supplying at least D permutation entries and D inversion masks.
	__declspec(noalias) __forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
		: dim(D), levels(L),
		eff_levels((std::max)(1, static_cast<int>(63 / (D ? D : 1)))),
		extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
		chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
		low(lows, lows + D), high(highs, highs + D), step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f),
		perm(mc.permCache.begin(), mc.permCache.begin() + D),
		invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
		invScaleLevel(1.0f / static_cast<float>(static_cast<unsigned long long>(1) << L)), use_gray(true) {
		int d = 0;
#pragma loop ivdep
		while (d < dim) {
			const float rng = high[d] - low[d];
			const float st = rng * invScaleLevel;
			step[d] = st;
			invStep[d] = 1.0f / st;
			baseOff[d] = fmaf(0.5f, st, low[d]);
			++d;
		}
		chunk_bits.resize(chunks);
		pextMaskChunks.resize(static_cast<size_t>(chunks) * static_cast<size_t>(dim));
		chunk_bases.resize(chunks);
		int remaining = levels;
		int c = 0;
		while (c < chunks) {
			// Every chunk takes as many of the remaining bits as fit.
			const int Lc = (std::min)(eff_levels, remaining);
			chunk_bits[c] = Lc;
			remaining -= Lc;
			const unsigned long long baseC = static_cast<unsigned long long>(1) << (dim * Lc);
			chunk_bases[c] = baseC;
			d = 0;
#pragma loop ivdep
			while (d < dim) {
				pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)] = make_mask(dim, Lc, d);
				++d;
			}
			++c;
		}
		pextMask.resize(dim);
		d = 0;
#pragma loop ivdep
		while (d < dim) {
			pextMask[d] = make_mask(dim, chunk_bits[0], d);
			++d;
		}
		scale = static_cast<unsigned long long>(1) << (dim * chunk_bits[0]);
	}

	// Estimates the spatial extent covered between two chunk-0 indices. For each
	// slot the number of differing index bits (plus the bits not covered by
	// chunk 0) gives an exponent n, and step * (2^n - 1) is taken as the extent
	// along that coordinate. Returns the Euclidean norm of those extents.
	__declspec(noalias) __forceinline float block_diameter(unsigned long long i1, unsigned long long i2) const noexcept {
		if (i1 > i2)
			std::swap(i1, i2);
		float s2 = 0.0f;
		int d = 0;
#pragma loop ivdep
		while (d < dim) {
			const int pd = perm[d];
			const unsigned long long varying = (i1 ^ i2) & pextMask[d];
			const int nfree_hi = _mm_popcnt_u64(varying);
			const int nfree_total = nfree_hi + levels - chunk_bits[0];
			const float range = fmaf(step[pd], fmaf(ldexpf(1.0f, nfree_total), 1.0f, -1.0f), 0.0f);
			s2 = fmaf(range, range, s2);
			++d;
		}
		return sqrtf(s2);
	}

	// Maps the curve parameter t to the centre of the corresponding grid cell.
	// `out` must have room for dim floats.
	__declspec(noalias) __forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
		float u = t;
		unsigned long long accBits[32] = {0ull};
		int consumed = 0;  // running sum of chunk_bits[0..c]
		int c = 0;
		while (c < chunks) {
			const int Lc = chunk_bits[c];
			const unsigned long long baseC = chunk_bases[c];
			// Peel off this chunk's integer index; the fractional part feeds the next chunk.
			u *= static_cast<float>(baseC);
			unsigned long long idxc = static_cast<unsigned long long>(u);
			u -= static_cast<float>(idxc);
			if (use_gray)
				idxc = gray_encode(idxc);
			consumed += Lc;
			const int inv_shift = levels - consumed;
			const bool flip = (inv_shift >= 0) && (Lc < 63);
			const unsigned long long segMask = flip ? ((static_cast<unsigned long long>(1) << Lc) - 1ull) : 0ull;
			int d = 0;
#pragma loop ivdep
			while (d < dim) {
				const int pd = perm[d];
				const unsigned long long mask = pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)];
				unsigned long long bits = _pext_u64(idxc, mask);
				if (flip) {
					// XOR with the matching segment of this coordinate's inversion mask.
					const unsigned long long invMaskSegment = (invMask[pd] >> inv_shift) & segMask;
					bits ^= invMaskSegment;
				}
				accBits[pd] = (accBits[pd] << Lc) | bits;
				++d;
			}
			++c;
		}
		int d = 0;
#pragma loop ivdep
		while (d < dim) {
			out[d] = fmaf(step[d], static_cast<float>(accBits[d]), baseOff[d]);
			++d;
		}
	}

	// Maps a point back to the curve parameter. Each coordinate is rounded to the
	// nearest cell index and clamped to [0, 2^levels - 1]; the chunks are then
	// folded from the last one to the first as t = (t + idx) / base.
	// `q` must hold dim floats.
	__declspec(noalias) __forceinline float pointToT(const float* __restrict q) const noexcept {
		unsigned long long cell[32];
		const long long maxv = (static_cast<long long>(1) << levels) - 1;
		int d = 0;
#pragma loop ivdep
		while (d < dim) {
			const int pd = perm[d];
			const float v = (q[pd] - baseOff[pd]) * invStep[pd];
			const long long c = _mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v),
				_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
			cell[pd] = static_cast<unsigned long long>(c < 0 ? 0 : (c > maxv ? maxv : c));
			++d;
		}
		// Sum of all chunk widths; reduced by one chunk per iteration so that it
		// always equals the sum of chunk_bits[0..c] for the chunk being processed.
		int consumed = 0;
		int k = 0;
		while (k < chunks) {
			consumed += chunk_bits[k];
			++k;
		}
		float t = 0.0f;
		int c = chunks;
		while (c > 0) {
			--c;
			const int Lc = chunk_bits[c];
			const unsigned long long baseC = chunk_bases[c];
			const int inv_shift = levels - consumed;
			const bool flip = (inv_shift >= 0) && (Lc < 63);
			const unsigned long long mask = (static_cast<unsigned long long>(1) << Lc) - 1;
			unsigned long long idxc = 0ull;
			d = 0;
#pragma loop ivdep
			while (d < dim) {
				const int pd = perm[d];
				unsigned long long bits = (cell[pd] >> inv_shift) & mask;
				if (flip) {
					const unsigned long long invMaskSegment = (invMask[pd] >> inv_shift) & mask;
					bits ^= invMaskSegment;
				}
				const unsigned long long pdep_mask = pextMaskChunks[static_cast<size_t>(c) * dim + d];
				idxc |= _pdep_u64(bits, pdep_mask);
				++d;
			}
			if (use_gray)
				idxc = gray_decode(idxc);
			t = (t + static_cast<float>(idxc)) / static_cast<float>(baseC);
			consumed -= Lc;
		}
		return t;
	}
};

__declspec(noalias) __forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
span_level = 0;
int d = 0;
#pragma loop ivdep
while (d < map.dim) {
const unsigned long long varying = (i1 ^ i2) & map.pextMask[d];
span_level += _mm_popcnt_u64(varying);
++d;
}
span_level += (map.levels - map.chunk_bits[0]) * map.dim;
span_level = (std::min)(span_level, 11);
}" - после перехода на новый тип пересылки, точность осталась в точности такой же, но скорость упала в 50 раз, в чём проблема? исправь, структура для интервалов: "__declspec(align(16)) struct IntervalND final sealed{
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
float N_factor;
float quadratic_term;
float M;
float R;
unsigned long long i1;
unsigned long long i2;
float diam;
int span_level;

	static __declspec(noalias) __forceinline void* operator new(size_t) noexcept {
			Slab* s = tls.local();
			char* r = s->current;
			s->current += 64u;
			return r;
	}

	__declspec(noalias) __forceinline IntervalND(float _x1, float _x2, float _y1, float _y2) noexcept
			: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(fmaf(_y2, 1.0f, -_y1)),
				ordinate_factor(fmaf(fmaf(-y1, 1.0f, -y2), 2.0f, 0.0f)),
				N_factor(0), quadratic_term(0), M(0), R(0),
				i1(0), i2(0), diam(0), span_level(0) {

"static __declspec(noalias) void agp_run_branch_mp...

Question

Partager cette Q&R