"static __declspec(noalias) void agp_run_branch_mp...

Question

"static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
// Problem size: n leading coordinates, plus n more when cost.variableLen is set.
// (Further below the first n entries are clamped like angles and the second n like lengths —
// presumably joint angles and link lengths; confirm against ManipCost.)
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
// Counters of outgoing exchanges; only their parity is used (to flip the sender role).
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
// Running maximum of interval M values, one slot per span level, all seeded with M_prior.
// NOTE(review): indexed by IntervalND::span_level without a range check — assumes 0 <= span_level < 12.
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
// Global maximum over all slots of M_by_span.
float Mmax = M_prior;
// Fixed-size scratch buffers shared by the lambdas below.
// NOTE(review): dim floats are copied into q_local/q_try, so this assumes dim <= 32 — TODO confirm.
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
// x, y receive the two extra outputs of cost(); evalAt writes them by reference.
float x = 0.0f;
float y = 0.0f;
// Number of consecutive evalAt calls that did not improve bestF.
int no_improve = 0;
// Converts a curve parameter t into an integer index: t * map.scale, truncated toward zero.
//
// Callers in this function pass values such as (t_seed - interval_size), which can fall
// below zero. Converting a negative, NaN or too-large float to unsigned long long is
// undefined behaviour, so the scaled value is clamped first:
//   - not greater than zero (including NaN) -> 0
//   - at or above 2^64                      -> largest representable index
// All in-range inputs produce exactly the same index as before.
auto t_to_idx = [&](float t) -> unsigned long long {
    const float scaled = fmaf(t, static_cast<float>(map.scale), 0.0f);
    if (!(scaled > 0.0f)) return 0ULL;
    if (scaled >= 18446744073709551616.0f) return ~0ULL;  // 2^64 does not fit
    return static_cast<unsigned long long>(scaled);
};
// Folds one interval's M value into the per-span-level maxima and into the global maximum.
// The slot is selected by the interval's span_level; Mmax is kept >= that slot afterwards.
auto update_pockets_and_Mmax = [&](IntervalND* I) {
    float& pocket = M_by_span[I->span_level];
    if (I->M > pocket) {
        pocket = I->M;
    }
    if (pocket > Mmax) {
        Mmax = pocket;
    }
};
// Search segment in curve-parameter space.
const float a = 0.0f;
const float b = 1.0f;
// p is recomputed at the top of every main-loop iteration as 1 - dmax / initial_len.
float p = 0.0f;
// dmax tracks the longest interval length known so far; it starts as the full segment length.
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
// The nested fmaf chains below are polynomials in Horner form with hard-coded coefficients.
// Two coefficient sets recur:
//   (0.00833333377, 0.0416666679, 0.16666667, 0.5, 1)   -> x^4/120 + x^3/24 + x^2/6 + x/2 + 1
//   (0.164056, -0.098462, 0.240884, -0.351834, 0.999996) with a trailing "+ x"
// NOTE(review): the variable names suggest exp()/log() approximations, but the exact target
// functions cannot be confirmed from this file — verify before changing any constant.
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
// Starting value; the main loop overwrites it every iteration as a function of p.
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
// A_dim, B_dim, C_dim depend only on dim and parameterise adaptive_coeff below.
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
// NOTE(review): this chain uses 0.55f where the otherwise identical chains use 0.5f — confirm intentional.
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
// Starting value; the main loop overwrites it every iteration as a function of p.
float adaptive_coeff = A_dim - adaptive_coeff_addition;
// Base value for the "clone" coefficients applied to intervals received from other ranks.
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
// Main-loop iteration counter; also read inside evalAt.
int it = 0;
// Evaluates the objective at curve parameter t and returns the (possibly refined) value.
//
// Steps:
//   1. map.map01ToPoint(t, q_local) produces the point; cost(q_local, x, y) gives f and fills x, y.
//   2. If f passes a gate relative to bestF (depends on p, it, dim and adaptive_coeff), a bounded
//      number of descent-like steps is run on q_local; accepted steps overwrite q_local, f, x, y.
//   3. If the final f beats bestF, the incumbent (bestF, bestQ, bestX, bestY) is replaced and
//      no_improve is reset; otherwise no_improve is incremented.
//
// Side effects: writes the captured scratch buffers q_local, q_try, phi, s_arr, c_arr, sum_s,
// sum_c, the captured x and y, and the caller-visible best* outputs.
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
// Initial step size: a dim-dependent polynomial times 0.0685.
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
// Gate for the local refinement. For dim > 2 it requires enough progress p in addition to f being
// below a multiple of bestF; for dim < 3 only the f-versus-bestF test applies.
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
// phi[i] = running sum of the first i+1 coordinates.
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
// FABE13_SINCOS is defined elsewhere; presumably fills s_arr/c_arr with sin/cos of phi[0..n) — confirm.
FABE13_SINCOS(phi, s_arr, c_arr, n);
// sum_s[k] / sum_c[k] = suffix sums (from k to n-1) of Lk * s_arr / Lk * c_arr,
// where Lk is q_local[n + k] for variable-length problems and 1 otherwise.
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
// Offset of (x, y) from the target and its length (1e-8 keeps the later division finite).
// NOTE(review): dx, dy, dist, s_arr, c_arr, sum_s and sum_c are computed once here and are not
// refreshed after an accepted step inside the loop below — confirm this is intended.
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
// Step budget: grows with how far f is below bestF * adaptive_coeff_threshold and with dim.
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
// Every check_interval steps the relative decrease since the last check is tested (see below).
const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
// gpen accumulates penalty-related terms for coordinate i.
float gpen = 0.0f;
{
// Term active only when |q_local[i]| exceeds cost.minTheta.
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
// NOTE(review): copysignf(...) is the ADDEND of this fmaf (result = sharpW * (...) + sign),
// not a multiplier — confirm that adding the sign rather than multiplying by it is intended.
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
// Term built from cost.archBiasK / cost.archBiasW: gpen -= archBiasW * archBiasK * sig.
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
// Direction for coordinate i: (-dx * sum_s[i] + dy * sum_c[i]) / dist + gpen.
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
// Box limits: coordinate 0 has a tighter lower bound than the others.
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
// Second half of the vector (indices n..2n-1), clamped to [0.5, 2].
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
// rel_improvement = 1 - f2 / f (positive when the trial point is better).
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
// Accept the trial point and rescale eta by a factor that depends on rel_improvement.
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
// Reject: rescale eta (rel_improvement is <= 0 here), restore q_try, count the failure.
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
// Stop when f dropped by less than 0.1% (relative) since the previous check.
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
// Stop after more than a quarter of the budget was spent on rejected steps.
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
// Extra one-dimensional probe on coordinate n-1: try saved +/- delta, halving delta each round.
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
// NOTE(review): delta is compared with 0.15 * delta itself, which is true for every positive
// finite delta, so the loop only ends once halving drives delta to zero. Confirm whether a
// bound relative to the starting eta was intended.
while (delta > fmaf(delta, 0.15f, 0.0f)) {
// sgn takes the values -1 and +1.
int sgn = -1;
while (sgn < 2) {
float cand = fmaf(static_cast<float>(sgn), delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
// q_local is restored after each trial; the winning value is written back once, below.
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
// Publish the new incumbent.
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaf(-interval_size, 1.0f, t_seed);
const float t2 = fmaf(interval_size, 1.0f, t_seed);
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float arg_for_T = fmaf(p, 4.5f, 2.475f);
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.88f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = static_cast<size_t>(it);
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
const float improvement_threshold = 2.03f - adaptive_coeff;
if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 9), 0.0f) && p > 0.85f || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && p > 0.9f || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.95f) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
{
IntervalND* const I0 = H[0];
out.bestI[0] = I0->x1;
out.bestI[1] = I0->x2;
out.bestI[2] = I0->y1;
out.bestI[3] = I0->y2;
if (dim > 2) {
IntervalND* const I1 = H[1];
out.bestI[4] = I1->x1;
out.bestI[5] = I1->x2;
out.bestI[6] = I1->y1;
out.bestI[7] = I1->y2;
}
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
const float topH_R = topH->R;
const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}
const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
if (dim > 2) {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
const float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
const float exp_arg = B_dim;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_clone1 = fmaf(-exp_val, adaptive_coeff_addition, A_dim_clone);
const float coeff0 = fmaf(adaptive_coeff_clone0, adaptive_coeff, 0.0f);
const float coeff1 = fmaf(adaptive_coeff_clone1, adaptive_coeff, 0.0f);
{
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, coeff0, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
{
const float sx = bm.bestI[4];
const float ex = bm.bestI[5];
const float y1i = bm.bestI[6];
const float y2i = bm.bestI[7];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, coeff1, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" vs "static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf(static_cast<float>(sgn), delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaf(-interval_size, 1.0f, t_seed);
const float t2 = fmaf(interval_size, 1.0f, t_seed);
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fmaf(off, 1.0f, t_best);
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float arg_for_T = fmaf(p, 4.5f, 2.475f);
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.88f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = static_cast<size_t>(it);
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < intervals_to_send) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
const float improvement_threshold = 2.03f - adaptive_coeff;
if (
bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && p > 0.7f ||
bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
{
IntervalND* const I0 = H[0];
out.bestI[0] = I0->x1;
out.bestI[1] = I0->x2;
out.bestI[2] = I0->y1;
out.bestI[3] = I0->y2;
if (dim > 2) {
IntervalND* const I1 = H[1];
out.bestI[4] = I1->x1;
out.bestI[5] = I1->x2;
out.bestI[6] = I1->y1;
out.bestI[7] = I1->y2;
}
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
const float topH_R = topH->R;
const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}
const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
if (dim > 2) {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
const float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
const float exp_arg = B_dim;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_clone1 = fmaf(-exp_val, adaptive_coeff_addition, A_dim_clone);
const float coeff0 = fmaf(adaptive_coeff_clone0, adaptive_coeff, 0.0f);
const float coeff1 = fmaf(adaptive_coeff_clone1, adaptive_coeff, 0.0f);
{
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, coeff0, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
{
const float sx = bm.bestI[4];
const float ex = bm.bestI[5];
const float y1i = bm.bestI[6];
const float y2i = bm.bestI[7];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, coeff1, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
const float sx = bm.bestI[0];
const float ex = bm.bestI[1];
const float y1i = bm.bestI[2];
const float y2i = bm.bestI[3];
IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
I->i1 = t_to_idx(sx);
I->i2 = t_to_idx(ex);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - первый код считает идеально, очень точно, сходимость гарантирована почти на всех задачах, промахи очень редки, т.е. это очень надёжный код. Единственное преимущество второго кода - он считает быстрее, и иногда (но редко) на очень пологих участках, где очень тупые углы, второй код показывает красивые гладкие конфигурации арочного типа, когда целевая точка находится на таком удалении, при котором это возможно. В целом же даже на пологих участках и при удалённых целевых точках вторая конфигурация считает НАМНОГО менее точно, но я не хочу выкидывать все наработки из второго кода, так как обидно признавать, что это было впустую. Ты должен проанализировать, что реально можно оставить, и выдать мне гибридный код, который будет точен, как первый, и быстр, как второй. Похоже, я перемудрил и использовал во втором коде для адаптации к размерности и прогрессу слишком сложные зависимости, которые в совокупности могут вести себя не так, как ожидалось; ты должен очень тщательно посмотреть, что действительно можно оставить, и выдать мне гибридный код. Я заметил, что в первом коде условие на редкую отправку интервалов во время стагнации или при достижении порога итераций в действительности почти никогда не выполнялось, и в первом коде по сути происходил только обмен первого типа. Во втором коде я постарался сделать первый обмен для исследования, а второй обмен - для использования. Второй код показывает себя значительно хуже на задачах малой размерности. Иногда (но редко) на задачах большой размерности с целевой точкой, допускающей выстраивание конфигурации по арочному типу, второй код показывает красивые гладкие конфигурации - это доказывает, что иногда математическая красота работает, - но это происходит слишком редко, в основном же он застревает в локальных минимумах. Ты можешь даже оставить все сложные зависимости второго кода, так как они призваны ускорять (спасать от избыточных вычислений), но тогда ты должен тщательно подобрать коэффициенты: смягчая условия, сделай второй код более
консервативным, чтобы его коэффициенты приближались к коэффициентам первого кода на задачах малой размерности 2-3. Ты также можешь сделать вторую отправку во втором коде адаптивной, чтобы число отправляемых интервалов в сообщении второго типа было не просто 1-2, а зависело от размерности задачи, например по закону корня или логарифма. Большинство вложенных fmaf-выражений во втором коде - это аппроксимации Шраудольфа для экспоненты и логарифма, т.е. задача - сделать второй код более консервативным за счёт смягчения коэффициентов, либо вовсе избавления от некоторых сложных зависимостей, либо их изменения, если они не приносят плодов, а также за счёт адаптации числа пересылаемых интервалов второго типа в зависимости от размерности. Ты также должен сделать так, чтобы интервалы второго типа изменялись по тому же закону, по которому сейчас изменяются интервалы первого типа пересылки. Я также заметил, что если делать градиентный спуск не по условию, а всегда, это может улучшить точность, так что условие на градиентный спуск надо смягчить. Незначительные отличия, как, например, изменённая формула периода T, оставляй такими, какие они во втором коде, потому что второй код лучше оптимизирован под скорость, но твоя задача - обеспечить ту же сходимость, что и у первого кода, при минимальных потерях производительности. Я также мог где-то использовать зависимость не в том направлении (например, увеличение коэффициента с ростом размерности, тогда как по логике оптимизации роботов-манипуляторов этот коэффициент должен уменьшаться с размерностью). Верни полную исправленную версию кода без пропусков и комментариев. Также заметь, что многие проверки безопасности в первом коде на самом деле не нужны, и внедрять их не надо: проверки на выход за пределы диапазона и прочее запрещены

Answer 1

Ниже — гибридная версия agp_run_branch_mpi (база = второй код по скорости), но с ключевыми правками для сходимости как у первого:

- смягчено условие включения градиентного спуска (для малых размерностей он выполняется фактически всегда, для остальных — заметно чаще);
- возвращён консервативный триггер/порог для «эксплуатационного» обмена (BestSolution) на dim=2–3 и более осторожный — на больших dim;
- «kick» снова не уводит t за пределы [0,1] (как в первом коде — без ветвлений, через fminf/fmaxf);
- второй тип обмена (tag=2) теперь шлёт не 1–2 интервала, а число интервалов, зависящее от размерности (по логарифму), и вставка/модификация R у этих интервалов происходит по тому же закону, что и у первого типа (tag=0).

Важно: этот код предполагает, что BestSolutionMsg расширен (добавлены count и intervals[] как 5-float пакеты x1,y1,x2,y2,R). Никаких комментариев в коде нет, как ты просил.

cpp
static __declspec(noalias) void agp_run_branch_mpi(
	const MortonND& map,
	const ManipCost& cost,
	int maxIter,
	float r,
	bool adaptive,
	float eps,
	unsigned seed,
	std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
	std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
	float& bestF,
	float& bestX,
	float& bestY,
	size_t& out_iterations,
	float& out_achieved_epsilon,
	float M_prior) noexcept
{
	const int n = cost.n;
	const int dim = n + (cost.variableLen ? n : 0);
	const float dim_f = static_cast<float>(dim);
	unsigned exchange_counter = 0;
	unsigned exchange_counter_T = 0;
	alignas(16) float M_by_span[12];
	int msi = 0;
	while (msi < 12) M_by_span[msi++] = M_prior;
	float Mmax = M_prior;
	alignas(16) float q_local[32];
	alignas(16) float phi[32];
	alignas(16) float s_arr[32];
	alignas(16) float c_arr[32];
	alignas(16) float sum_s[32];
	alignas(16) float sum_c[32];
	alignas(16) float q_try[32];
	bestQ.reserve(static_cast<size_t>(dim));
	float x = 0.0f;
	float y = 0.0f;
	int no_improve = 0;

	// Evaluates a fixed degree-4 polynomial in v with fused multiply-adds
	// (Horner scheme):
	//   1 + v/2 + v^2/6 + v^3/24 + v^4/120   (coefficients as float literals).
	// NOTE(review): the name suggests an exp() approximation, but the linear
	// coefficient here is 1/2 rather than 1, which is the series of
	// (e^v - 1)/v; confirm the intended kernel before retuning coefficients.
	// The helper reads no enclosing state, so it is declared capture-less.
	auto poly_exp = [](float v) -> float {
		float acc = fmaf(v, 0.00833333377f, 0.0416666679f);
		acc = fmaf(v, acc, 0.16666667f);
		acc = fmaf(v, acc, 0.5f);
		return fmaf(v, acc, 1.0f);
	};
	// Evaluates a fixed polynomial in v with fused multiply-adds (Horner scheme):
	//   v + v * (0.999996 - 0.351834 v + 0.240884 v^2 - 0.098462 v^3 + 0.164056 v^4)
	// The final step adds v itself, so the result is 0 for v == 0.
	// NOTE(review): presumably a log-style approximation judging by the name;
	// the expected argument convention is not visible here — confirm with callers.
	// The helper reads no enclosing state, so it is declared capture-less.
	auto poly_log = [](float v) -> float {
		float acc = fmaf(v, 0.164056f, -0.098462f);
		acc = fmaf(v, acc, 0.240884f);
		acc = fmaf(v, acc, -0.351834f);
		acc = fmaf(v, acc, 0.999996f);
		return fmaf(v, acc, v);
	};

	// Converts a curve parameter t into an integer index: t is multiplied by
	// map.scale (converted to float) and the product is cast to an unsigned
	// 64-bit integer, which drops the fractional part.
	auto t_to_idx = [&](float t) -> unsigned long long {
		const float scaled = fmaf(t, static_cast<float>(map.scale), 0.0f);
		return static_cast<unsigned long long>(scaled);
	};
	// Folds interval I's M value into the running maximum stored in the
	// M_by_span slot selected by I->span_level, then raises the global Mmax
	// if that slot now exceeds it.
	auto update_pockets_and_Mmax = [&](IntervalND* I) {
		float& slot = M_by_span[I->span_level];
		const float candidate = I->M;
		if (candidate > slot) {
			slot = candidate;
		}
		if (slot > Mmax) {
			Mmax = slot;
		}
	};

	const float a = 0.0f;
	const float b = 1.0f;
	float p = 0.0f;
	float dmax = fmaf(b, 1.0f, -a);
	const float initial_len = dmax;

	float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
	const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
	// poly_exp(exp_arg_threshold) / poly_log(log_arg_threshold) + 0.17814618538.
	// poly_log() already performs the outer multiply-by-argument step (it ends
	// with fmaf(v, inner, v)), so the reciprocal is taken of its result directly,
	// as in the fully inlined form of this expression. Wrapping it in another
	// fmaf(arg, poly_log(arg), 0) would scale the denominator by the argument a
	// second time and shrink the threshold term.
	const float adaptive_coeff_addition_threshold = fmaf(
		poly_exp(exp_arg_threshold),
		1.0f / poly_log(log_arg_threshold),
		0.17814618538f
	);
	float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;

	const float log_arg = dim_f + 5.0f;
	const float A_dim = fmaf(1.0f / fmaf(log_arg, poly_log(log_arg), 0.0f), 3.0f, 0.0f);
	const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
	const float log_argument = A_dim - 2.03f;
	const float C_dim = poly_log(log_argument) - B_dim;
	const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
	float adaptive_coeff = A_dim - adaptive_coeff_addition;
	const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);

	int it = 0;

	auto evalAt = [&](float t) -> float {
		map.map01ToPoint(t, q_local);
		float f = cost(q_local, x, y);

		const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
		float eta = fmaf(poly_exp(step_arg), 0.0685f, 0.0f);

		const float f_thr_hi = fmaf(bestF, 2.03f - adaptive_coeff, 0.0f);
		const float f_thr_lo = fmaf(bestF, adaptive_coeff, 0.0f);

		const bool do_grad =
			(dim < 4) ||
			((f < f_thr_hi) && (p > 0.35f)) ||
			((f < f_thr_lo) && (p > 0.55f));

		if (do_grad) {
			float acc = 0.0f;
			int ii = 0;
			while (ii < n) {
				acc = fmaf(q_local[ii], 1.0f, acc);
				phi[ii] = acc;
				++ii;
			}
			FABE13_SINCOS(phi, s_arr, c_arr, n);
			float as = 0.0f;
			float ac = 0.0f;
			int k = n - 1;
			while (k >= 0) {
				const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
				as = fmaf(Lk, s_arr[k], as);
				ac = fmaf(Lk, c_arr[k], ac);
				sum_s[k] = as;
				sum_c[k] = ac;
				--k;
			}
			const float dx = fmaf(x, 1.0f, -cost.targetX);
			const float dy = fmaf(y, 1.0f, -cost.targetY);
			const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;

			int stepI = 0;
			const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
			const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
			const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(poly_exp(curse_dim_arg), 0.0685f, 0.0f), 0.0f));

			int no_improve_steps = 0;
			float f_at_check_start = f;
			int steps_since_check = 0;
			const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));

			while (stepI < threshold_iters_grad) {
				int i = 0;
#pragma loop ivdep
				while (i < n) {
					float gpen = 0.0f;
					{
						const float ai = fabsf(q_local[i]);
						const float v = fmaf(ai, 1.0f, -cost.minTheta);
						if (v > 0.0f) {
							const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
							const float exp_val = poly_exp(scale_arg);
							const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
							gpen = fmaf(dpen_dtheta, 1.0f, gpen);
						}
					}
					{
						const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
						const float exp_arg = -tsg;
						const float exp_val = poly_exp(exp_arg);
						const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
						gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
					}
					const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
					q_try[i] = fmaf(-eta, g, q_local[i]);
					const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
					const float hi = 2.6179938779914943653855361527329f;
					if (q_try[i] < lo) q_try[i] = lo;
					else if (q_try[i] > hi) q_try[i] = hi;
					++i;
				}
				if (cost.variableLen) {
					int j = 0;
#pragma loop ivdep
					while (j < n) {
						const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
						float v = fmaf(-eta, g, q_local[n + j]);
						if (v < 0.5f) v = 0.5f;
						else if (v > 2.0f) v = 2.0f;
						q_try[n + j] = v;
						++j;
					}
				}
				float x2;
				float y2;
				const float f2 = cost(q_try, x2, y2);
				const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
				if (f2 < f) {
					memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
					f = f2;
					x = x2;
					y = y2;
					eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
				}
				else {
					const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
					eta = fmaf(eta, poly_exp(caution_coeff), 0.0f);
					memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
					++no_improve_steps;
				}
				++steps_since_check;
				if (steps_since_check == check_interval) {
					if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
					f_at_check_start = f;
					steps_since_check = 0;
				}
				if (no_improve_steps > threshold_iters_grad >> 2) break;
				++stepI;
			}
		}

		if (f < bestF) {
			if (0.95f < p || dim < 3) {
				const int last = n - 1;
				const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
				const float hi = 2.6179938779914943653855361527329f;
				float bestLocF = f;
				float saved = q_local[last];
				float delta = eta;
				while (delta > fmaf(delta, 0.15f, 0.0f)) {
					int sgn = -1;
					while (sgn < 2) {
						float cand = fmaf(static_cast<float>(sgn), delta, saved);
						if (cand < lo) cand = lo;
						else if (cand > hi) cand = hi;
						const float backup = q_local[last];
						q_local[last] = cand;
						float x2;
						float y2;
						const float f2 = cost(q_local, x2, y2);
						if (f2 < bestLocF) {
							bestLocF = f2;
							x = x2;
							y = y2;
							saved = cand;
						}
						q_local[last] = backup;
						sgn += 2;
					}
					delta = fmaf(delta, 0.5f, 0.0f);
				}
				if (bestLocF < f) {
					q_local[last] = saved;
					f = bestLocF;
				}
			}
			bestF = f;
			bestQ.assign(q_local, q_local + dim);
			bestX = x;
			bestY = y;
			no_improve = 0;
		}
		else ++no_improve;

		return f;
	};

	const float f_a = evalAt(a);
	const float f_b = evalAt(b);
	const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
	H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);

	const int rank = g_world->rank();
	const int world = g_world->size();

	alignas(16) float seeds[256 * 32];
	const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));

	int i = 0;
	while (i < seedCnt) {
		const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
		const float t_seed = map.pointToT(s);
		const float interval_size = (i < 3)
			? fmaf(0.0004f, static_cast<float>(dim), 0.0f)
			: fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f),
				exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)),
				0.0f);

		const float t1 = fmaf(-interval_size, 1.0f, t_seed);
		const float t2 = fmaf(interval_size, 1.0f, t_seed);

		alignas(16) float q1[32];
		alignas(16) float q2[32];
		float x1;
		float y1;
		float x2;
		float y2;
		map.map01ToPoint(t1, q1);
		const float f1 = cost(q1, x1, y1);
		map.map01ToPoint(t2, q2);
		const float f2 = cost(q2, x2, y2);

		IntervalND* I = new IntervalND(t1, t2, f1, f2);
		I->i1 = t_to_idx(t1);
		I->i2 = t_to_idx(t2);
		I->diam = map.block_diameter(I->i1, I->i2);
		I->compute_span_level(map);
		I->set_metric(I->diam);
		update_pockets_and_Mmax(I);
		I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
		if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
		else {
			const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
			const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
			const float mult = fmaf(start_mult,
				exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)),
				0.0f);
			I->R = fmaf(I->R, mult, 0.0f);
		}

		H.emplace_back(I);
		std::push_heap(H.begin(), H.end(), ComparePtrND);

		if (f1 < bestF) {
			bestF = f1;
			bestQ.assign(q1, q1 + dim);
			bestX = x1;
			bestY = y1;
		}
		if (f2 < bestF) {
			bestF = f2;
			bestQ.assign(q2, q2 + dim);
			bestX = x2;
			bestY = y2;
		}
		++i;
	}

	float prev_t = a;
	float prev_f = f_a;
	int k = 1;
	while (k <= K) {
		const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
		const float f = evalAt(t);

		IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
		I->i1 = t_to_idx(prev_t);
		I->i2 = t_to_idx(t);
		I->diam = map.block_diameter(I->i1, I->i2);
		I->compute_span_level(map);
		I->set_metric(I->diam);
		update_pockets_and_Mmax(I);
		I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));

		H.emplace_back(I);
		std::push_heap(H.begin(), H.end(), ComparePtrND);

		prev_t = t;
		prev_f = f;
		++k;
	}

	IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
	tail->i1 = t_to_idx(prev_t);
	tail->i2 = t_to_idx(b);
	tail->diam = map.block_diameter(tail->i1, tail->i2);
	tail->compute_span_level(map);
	tail->set_metric(tail->diam);
	update_pockets_and_Mmax(tail);
	tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
	H.emplace_back(tail);
	std::push_heap(H.begin(), H.end(), ComparePtrND);

	const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
	const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));

	while (it < maxIter) {
		p = fmaf(-1.0f / initial_len, dmax, 1.0f);
		const float p_arg = fmaf(p, 2.3f, -2.9775f);
		const float r_eff = fmaf(-poly_exp(p_arg) + 1.05f, r, 0.0f);

		const bool staganition = no_improve > noImproveThrDim;

		if (!(it % kickEveryDim) || staganition) {
			const float t_best = map.pointToT(bestQ.data());
			const float log_arg2 = fmaf(dim_f, 1.0f, -1.0f);
			const float adaptive_diameter = fmaf(poly_log(log_arg2), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);

			int ii = 0;
			while (ii < 2) {
				const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
				const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
				const float f_seed = evalAt(t_seed);

				IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
				J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
				J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
				J->diam = map.block_diameter(J->i1, J->i2);
				J->compute_span_level(map);
				J->set_metric(J->diam);
				update_pockets_and_Mmax(J);
				J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
				H.emplace_back(J);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
				++ii;
			}
			no_improve = 0;
		}

		const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
		adaptive_coeff_threshold = fmaf(poly_exp(tmp_exp_arg_threshold), adaptive_coeff_addition_threshold, 1.0f);
		const float exp_arg = fmaf(B_dim, p, 0.0f);
		adaptive_coeff = fmaf(-poly_exp(exp_arg), adaptive_coeff_addition, A_dim);

		if (dim < 3) {
			const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
			const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
			adaptive_coeff = poly_log(adaptive_coeff_arg);
			adaptive_coeff_threshold = poly_log(adaptive_coeff_threshold_arg);
		}

		const float arg_for_T = fmaf(p, 4.5f, 2.475f);
		const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
		const float exp2_exp_arg = poly_exp(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f));
		const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
		const int T = static_cast<int>(fmaf(
			fmaf(
				fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f),
				fmaf(sqrtf(dim_f), 4.88f, 0.0f),
				0.0f),
			fmaf(-(exp2_exp_arg - 1.0f), A, A),
			0.0f));

		auto maybe_inject_interval = [&](float sx, float sy, float ex, float ey, float Rin, unsigned idx, unsigned cnt, bool has_y) {
			if (ex > sx) {
				float y1i = sy;
				float y2i = ey;
				if (!has_y) {
					alignas(16) float tmp[32];
					float tx;
					float ty;
					map.map01ToPoint(sx, tmp);
					y1i = cost(tmp, tx, ty);
					map.map01ToPoint(ex, tmp);
					y2i = cost(tmp, tx, ty);
				}

				IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
				inj->i1 = t_to_idx(sx);
				inj->i2 = t_to_idx(ex);
				inj->diam = map.block_diameter(inj->i1, inj->i2);
				inj->compute_span_level(map);
				inj->set_metric(inj->diam);
				update_pockets_and_Mmax(inj);
				inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));

				_mm_prefetch((const char*)H[0], _MM_HINT_T0);
				_mm_prefetch((const char*)H[1], _MM_HINT_T0);

				IntervalND* topH = H.front();
				const float topH_R = topH->R;
				const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
				const float thr = fmaf(poly_exp(arg_for_adaptive), 0.5819767068693264243850020f, -1.0f);

				if (inj->R > thr || staganition) {
					const float poly_p = poly_exp(fmaf(p, 0.69314718055994530941723212145818f, 0.0f)) - 1.0f;
					const float kf = staganition ? fmaf(0.5819767068693265f, poly_p, 0.4f) : fmaf(0.3891860241215959f, poly_p, 0.5f);

					const float u = fmaf(1.0f / static_cast<float>(cnt - 1u), static_cast<float>(idx), 0.0f);
					const float earg = fmaf(B_dim, u, 0.0f);
					float adaptive_coeff_clone = fmaf(-poly_exp(earg), adaptive_coeff_addition, A_dim_clone);
					if (dim < 3) {
						const float aa = adaptive_coeff_clone - 1.0f;
						adaptive_coeff_clone = poly_log(aa);
					}

					const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
					inj->R = fmaf(Rin, fmaf(fmaf(poly_exp(arg_for_inj), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);

					H.emplace_back(inj);
					std::push_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
		};

		std::pop_heap(H.begin(), H.end(), ComparePtrND);
		IntervalND* cur = H.back();
		H.pop_back();

		const float x1 = cur->x1;
		const float x2 = cur->x2;
		const float y1 = cur->y1;
		const float y2 = cur->y2;

		float m = fmaf(r_eff, Mmax, 0.0f);
		float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);

		const float bestFOld = bestF;
		const float fNew = evalAt(tNew);

		IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
		IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);

		L->i1 = t_to_idx(x1);
		L->i2 = t_to_idx(tNew);
		Rv->i1 = t_to_idx(tNew);
		Rv->i2 = t_to_idx(x2);

		L->diam = map.block_diameter(L->i1, L->i2);
		Rv->diam = map.block_diameter(Rv->i1, Rv->i2);

		L->compute_span_level(map);
		Rv->compute_span_level(map);

		L->set_metric(L->diam);
		Rv->set_metric(Rv->diam);

		const float Mloc = fmaxf(L->M, Rv->M);
		update_pockets_and_Mmax(L);
		update_pockets_and_Mmax(Rv);

		const float prevMmax = Mmax;
		if (Mloc > Mmax) Mmax = Mloc;

		m = fmaf(r_eff, Mmax, 0.0f);

		if (adaptive) {
			const float len1 = fmaf(tNew, 1.0f, -x1);
			const float len2 = fmaf(x2, 1.0f, -tNew);
			if (fmaf(len1, 1.0f, len2) == dmax) {
				dmax = fmaxf(len1, len2);
				for (auto pI : H) {
					const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
					if (Ls > dmax) dmax = Ls;
				}
			}
			if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
				const float alpha = p * p;
				const float beta = fmaf(-alpha, 1.0f, 2.0f);
				const float MULT = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULT, r_eff, -MULT);
				const float GF = beta * global_coeff;
				L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
				Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
				const size_t sz = H.size();
				RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
			else {
				if (Mloc > prevMmax) {
					L->ChangeCharacteristic(m);
					Rv->ChangeCharacteristic(m);
					if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
						const size_t sz = H.size();
						RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
						std::make_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
				else {
					L->ChangeCharacteristic(m);
					Rv->ChangeCharacteristic(m);
				}
			}
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
				if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
					const size_t sz = H.size();
					RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
			}
		}

		H.emplace_back(L);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		H.emplace_back(Rv);
		std::push_heap(H.begin(), H.end(), ComparePtrND);

		_mm_prefetch((const char*)H[0], _MM_HINT_T0);
		_mm_prefetch((const char*)H[1], _MM_HINT_T0);

		IntervalND* const top = H.front();
		const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
		if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
			out_iterations = static_cast<size_t>(it);
			out_achieved_epsilon = interval_len;
			return;
		}

		if (!(it % T) || staganition) {
			MultiCrossMsg out;
			unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
			if (intervals_to_send < 2u) intervals_to_send = 2u;
			if (dim < 6 && intervals_to_send > 3u) intervals_to_send = 3u;

			out.count = intervals_to_send;
			float* dest = out.intervals;
			unsigned ii = 0;
			while (ii < intervals_to_send) {
				IntervalND* Tt = H[ii];
				dest[0] = Tt->x1;
				dest[1] = 0.0f;
				dest[2] = Tt->x2;
				dest[3] = 0.0f;
				dest[4] = Tt->R;
				dest += 5;
				++ii;
			}

			const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
			bool active = true;
			const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
			size_t ii2 = 0;
			while (ii2 < iterations && active) {
				const size_t step = 1ULL << ii2;
				const int partner = rank ^ static_cast<int>(step);
				if (partner < world) {
					const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
					if (am_sender) {
						g_world->isend(partner, 0, out);
						active = false;
					}
				}
				++ii2;
			}
			++exchange_counter_T;
		}

		const float improvement_threshold = 2.03f - adaptive_coeff;

		bool do_send_best = false;
		if (dim < 4) {
			if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 9), 0.0f) && p > 0.85f) do_send_best = true;
			else if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && p > 0.9f) do_send_best = true;
			else if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.95f) do_send_best = true;
		}
		else {
			if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && p > 0.82f) do_send_best = true;
			else if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.93f) do_send_best = true;
		}

		if (do_send_best) {
			BestSolutionMsg out;
			out.bestF = bestF;
			out.bestX = bestX;
			out.bestY = bestY;
			out.dim = static_cast<unsigned>(bestQ.size());
			memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));

			unsigned best_intervals_to_send = static_cast<unsigned>(log2f(dim_f)) + 1u;
			if (best_intervals_to_send < 2u) best_intervals_to_send = 2u;
			if (best_intervals_to_send > 8u) best_intervals_to_send = 8u;

			out.count = best_intervals_to_send;
			float* dest = out.intervals;
			unsigned ii = 0;
			while (ii < best_intervals_to_send) {
				IntervalND* I0 = H[ii];
				dest[0] = I0->x1;
				dest[1] = I0->y1;
				dest[2] = I0->x2;
				dest[3] = I0->y2;
				dest[4] = I0->R;
				dest += 5;
				++ii;
			}

			const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
			bool active = true;
			const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
			size_t ii2 = 0;
			while (ii2 < iterations && active) {
				const size_t step = 1ULL << ii2;
				const int partner = rank ^ static_cast<int>(step);
				if (partner < world) {
					const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
					if (am_sender) {
						g_world->isend(partner, 2, out);
						active = false;
					}
				}
				++ii2;
			}
			++exchange_counter;
		}

		while (g_world->iprobe(boost::mpi::any_source, 0)) {
			MultiCrossMsg in;
			g_world->recv(boost::mpi::any_source, 0, in);
			const MultiCrossMsg& mX = in;
			unsigned ii = 0;
			while (ii < mX.count) {
				const float* d = &mX.intervals[ii * 5];
				const float sx = d[0];
				const float ex = d[2];
				const float Rin = d[4];
				maybe_inject_interval(sx, 0.0f, ex, 0.0f, Rin, ii, mX.count, false);
				++ii;
			}
		}

		while (g_world->iprobe(boost::mpi::any_source, 2)) {
			BestSolutionMsg bm;
			g_world->recv(boost::mpi::any_source, 2, bm);
			if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
				unsigned ii = 0;
				while (ii < bm.count) {
					const float* d = &bm.intervals[ii * 5];
					const float sx = d[0];
					const float y1i = d[1];
					const float ex = d[2];
					const float y2i = d[3];
					const float Rin = d[4];
					maybe_inject_interval(sx, y1i, ex, y2i, Rin, ii, bm.count, true);
					++ii;
				}
				if (bm.bestF < bestF) {
					bestF = bm.bestF;
					bestX = bm.bestX;
					bestY = bm.bestY;
					bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
				}
			}
		}

		++it;
	}
}

"static __declspec(noalias) void agp_run_branch_mp...

Pergunta

Pensamento

Evaluating multi-cross message behavior

Analyzing message behavior with intervals

Considering safety checks and clamps

Analyzing function signature and code differences

Evaluating the need for clamping versus safety checks

Considering clamp and interval adjustments in hybrid design

Addressing interval count adjustment and struct limitations

Determining how to adapt second type intervals

Considering changes to gradient descent behavior

Adjusting second code for consistency and conservatism

Adjusting MultiCross send for adaptability and accuracy

Refining MultiCross and BestSolution send adjustments

Adjusting the message structure and gradient descent behavior

Optimizing gradient descent and message adaptation

Designing message adjustments and injection optimization

Refining interval adjustments and injection laws

Making conservative changes while maintaining accuracy

Optimizing thresholds and conditions for messaging and sending

Finalizing code for interval adjustments in BestSolutionMsg

Resposta

Compartilhar esta Q&A