"__declspec(align(16)) struct CrossMsg final { fl...

Question

"__declspec(align(16)) struct CrossMsg final {
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& s_x1& s_x2& e_x1& e_x2& Rtop;
}

};

__declspec(align(16)) struct MultiCrossMsg final {
float intervals[35];
unsigned count;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& intervals& count;
}

};

__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX;
float bestY;
float bestQ[32];
unsigned dim;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& bestF& bestX& bestY& bestQ& dim;
}

}; static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf(static_cast<float>(sgn), delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
const int K = static_cast<int>(Kf);
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
if (t2 > t1) {
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float arg_for_T = fmaf(p, 4.5f, 2.475f);
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = static_cast<size_t>(it);
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
const float improvement_threshold = 2.03f - adaptive_coeff;
if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
const float topH_R = topH->R;
const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}
const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
alignas(16) float tmp_q[32];
memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
const float t_best = map.pointToT(tmp_q);
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
const float t1 = fmaf(-adaptive_diameter, 0.2f, t_best);
const float t2 = fmaf(adaptive_diameter, 0.2f, t_best);
alignas(16) float tq1[32];
alignas(16) float tq2[32];
float xx1;
float yy1;
float xx2;
float yy2;
map.map01ToPoint(t1, tq1);
const float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2);
const float f2 = cost(tq2, xx2, yy2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - внимательно рассмотри этот код, тут два принципиально отличающихся типа mpi обменов - первый для исследования поискового пространства, второй для уточнения, проблема сейчас в том что при приёме второго типа сообщений создаются интервалы очень малого диаметра что вводит алгоритм глобального поиска в заблуждение так как создаются очень малые интервалы с большой характеристикой - что нужно сделать: условие для отправки сообщений второго типа трогать не нужно, условие внутри цикла приёма сообщений второго типа для рассмотрения точки как кандидата: "if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {" - также трогать не нужно, но теперь нужно чтобы при входе в это условие не создавался малый интервал вокруг лучшей точки а нужно теперь помимо только лишь лучшей точки в сообщениях второго типа передавать также лучший интервал если размерность меньше 3, и два лучших интервала если размерность больше 2, то есть все условие что в отправке что в приёме при рассмотрении точки как перспективного кандидата в сообщениях второго типа - остаются такими же - но теперь помимо точки передаются 1 или 2 лучших интервала, и что потом должно происходить с этими интервалалами: в текущем коде у вновь созданных интервалов при приёме сообщений второго типа характеристика меняется как I->R = fmaf(I->R, adaptive_coeff, 0.0f);, представим теперь что I это переданный интервал, тогда сначала его характеристика должна меняться аналогично тому как она сначала меняется в сообщениях первого типа inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); затем если передавался только 1 лучший интервал - то его характеристика должна будет измениться по закону I->R = fmaf(I->R, adaptive_coeff, 0.0f); - в случае же если передавалось два лучших интервала то дополнительно характеристика каждого из них должна будет умножиться на adaptive_coeff_clone - который будет вводится в сообщениях второго типа по полной аналогии с сообщениями первого типа с тем отличием что его не надо будет смягчать при размерности меньше 3 как это сейчас делается в сообщениях первого типа, то есть вот этого: " if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}" - делать не надо, зато надо также как и в сообщениях первого типа делать _mm_prefetch((const char*)H[0 или (0 и затем 1 - если 2 интервала пересылается)], _MM_HINT_T0); то есть итоговая формула изменений характеристик в сообщениях второго типа будет: "I->R = fmaf(I->R, fmaf(adaptive_coeff_clone, adaptive_coeff, 0.0f), 0.0f);". Заметь в случае с сообщениями первого типа adaptive_coeff_clone зависит от числа передаваемых интервалов через "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)" - тут же сделай проще, используй вместо этой формулы: "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)": 0 для первого интервала, 1 для второго интервала, в случае когда их двое, в случае одного интервала напоминаю домножать на adaptive_coeff_clone не нужно, не добавляй лишних проверок в код по типу проверок на непустоту, эти кучи гарантировано не пусти, не делай никаких проверок на вылет за пределы диапазона и прочее - проверки запрещены, модифицируй мой код в соответствии с указанными требованиями - причём заметь - изменения в код должны быть в моём стиле максимально оптимизированного кода: заботься о скорости работы кода и думай о максимальной производительности, так что инлайнь вычисления, загружай регистры и умножители процессора через вложенные сложные длинные формулы с fmaf, старайся делать все вычисления in_place - минимум обращений к памяти - всё в регистрах, инструкции размещай чтобы обеспечить оптимальную загрузку CPU, минимум кеш-промахов, локальность доступа к данным, если кешируешь переменнные - кешируй их как можно ближе к месту использования, возможно потребуется внести изменения в сериализируемые MPI структуры чтобы сделать это максимально быстро по скорости - мне пришли полную исправленную версию кода без пропусков и комментариев, в полном соответствии со всеми моими требованиями

Answer 1

cpp
__declspec(align(16)) struct CrossMsg final {
	float s_x1;
	float s_x2;
	float e_x1;
	float e_x2;
	float Rtop;

	template <typename Archive>
	__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
		ar& s_x1& s_x2& e_x1& e_x2& Rtop;
	}
};

__declspec(align(16)) struct MultiCrossMsg final {
	float intervals[35];
	unsigned count;

	template <typename Archive>
	__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
		ar& intervals& count;
	}
};

__declspec(align(16)) struct BestSolutionMsg final {
	float bestF;
	float bestX;
	float bestY;
	float bestQ[32];
	float bestI[8];
	unsigned dim;

	template <typename Archive>
	__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
		ar& bestF& bestX& bestY& bestQ& bestI& dim;
	}
};

static __declspec(noalias) void agp_run_branch_mpi(
	const MortonND& map,
	const ManipCost& cost,
	int maxIter,
	float r,
	bool adaptive,
	float eps,
	unsigned seed,
	std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
	std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
	float& bestF,
	float& bestX,
	float& bestY,
	size_t& out_iterations,
	float& out_achieved_epsilon,
	float M_prior) noexcept
{
	const int n = cost.n;
	const int dim = n + (cost.variableLen ? n : 0);
	const float dim_f = static_cast<float>(dim);
	unsigned exchange_counter = 0;
	unsigned exchange_counter_T = 0;
	alignas(16) float M_by_span[12];
	int msi = 0;
	while (msi < 12) M_by_span[msi++] = M_prior;
	float Mmax = M_prior;
	alignas(16) float q_local[32];
	alignas(16) float phi[32];
	alignas(16) float s_arr[32];
	alignas(16) float c_arr[32];
	alignas(16) float sum_s[32];
	alignas(16) float sum_c[32];
	alignas(16) float q_try[32];
	bestQ.reserve(static_cast<size_t>(dim));
	float x = 0.0f;
	float y = 0.0f;
	int no_improve = 0;
	auto t_to_idx = [&](float t) -> unsigned long long {
		unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
		return idx;
		};
	auto update_pockets_and_Mmax = [&](IntervalND* I) {
		const int k = I->span_level;
		if (I->M > M_by_span[k]) M_by_span[k] = I->M;
		if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
		};
	const float a = 0.0f;
	const float b = 1.0f;
	float p = 0.0f;
	float dmax = fmaf(b, 1.0f, -a);
	const float initial_len = dmax;
	float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
	const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
	const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
	float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
	const float log_arg = dim_f + 5.0f;
	const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
	const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
	const float log_argument = A_dim - 2.03f;
	const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
	const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
	float adaptive_coeff = A_dim - adaptive_coeff_addition;
	const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
	int it = 0;
	auto evalAt = [&](float t) -> float {
		map.map01ToPoint(t, q_local);
		float f = cost(q_local, x, y);
		const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
		float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
		if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
			float acc = 0.0f;
			int ii = 0;
			while (ii < n) {
				acc = fmaf(q_local[ii], 1.0f, acc);
				phi[ii] = acc;
				++ii;
			}
			FABE13_SINCOS(phi, s_arr, c_arr, n);
			float as = 0.0f;
			float ac = 0.0f;
			int k = n - 1;
			while (k >= 0) {
				const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
				as = fmaf(Lk, s_arr[k], as);
				ac = fmaf(Lk, c_arr[k], ac);
				sum_s[k] = as;
				sum_c[k] = ac;
				--k;
			}
			const float dx = fmaf(x, 1.0f, -cost.targetX);
			const float dy = fmaf(y, 1.0f, -cost.targetY);
			const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
			int stepI = 0;
			const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
			const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
			const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
			int no_improve_steps = 0;
			float f_at_check_start = f;
			int steps_since_check = 0;
			const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
			while (stepI < threshold_iters_grad) {
				int i = 0;
#pragma loop ivdep
				while (i < n) {
					float gpen = 0.0f;
					{
						const float ai = fabsf(q_local[i]);
						const float v = fmaf(ai, 1.0f, -cost.minTheta);
						if (v > 0.0f) {
							const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
							const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
							const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
							gpen = fmaf(dpen_dtheta, 1.0f, gpen);
						}
					}
					{
						const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
						const float exp_arg = -tsg;
						const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
						const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
						gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
					}
					const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
					q_try[i] = fmaf(-eta, g, q_local[i]);
					const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
					const float hi = 2.6179938779914943653855361527329f;
					if (q_try[i] < lo) q_try[i] = lo;
					else if (q_try[i] > hi) q_try[i] = hi;
					++i;
				}
				if (cost.variableLen) {
					int j = 0;
#pragma loop ivdep
					while (j < n) {
						const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
						float v = fmaf(-eta, g, q_local[n + j]);
						if (v < 0.5f) v = 0.5f;
						else if (v > 2.0f) v = 2.0f;
						q_try[n + j] = v;
						++j;
					}
				}
				float x2;
				float y2;
				const float f2 = cost(q_try, x2, y2);
				const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
				if (f2 < f) {
					memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
					f = f2;
					x = x2;
					y = y2;
					eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
				}
				else {
					const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
					eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
					memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
					++no_improve_steps;
				}
				++steps_since_check;
				if (steps_since_check == check_interval) {
					if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
					f_at_check_start = f;
					steps_since_check = 0;
				}
				if (no_improve_steps > threshold_iters_grad >> 2) break;
				++stepI;
			}
		}
		if (f < bestF) {
			if (0.95f < p || dim < 3) {
				const int last = n - 1;
				const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
				const float hi = 2.6179938779914943653855361527329f;
				float bestLocF = f;
				float saved = q_local[last];
				float delta = eta;
				while (delta > fmaf(delta, 0.15f, 0.0f)) {
					int sgn = -1;
					while (sgn < 2) {
						float cand = fmaf(static_cast<float>(sgn), delta, saved);
						if (cand < lo) cand = lo;
						else if (cand > hi) cand = hi;
						const float backup = q_local[last];
						q_local[last] = cand;
						float x2;
						float y2;
						const float f2 = cost(q_local, x2, y2);
						if (f2 < bestLocF) {
							bestLocF = f2;
							x = x2;
							y = y2;
							saved = cand;
						}
						q_local[last] = backup;
						sgn += 2;
					}
					delta = fmaf(delta, 0.5f, 0.0f);
				}
				if (bestLocF < f) {
					q_local[last] = saved;
					f = bestLocF;
				}
			}
			bestF = f;
			bestQ.assign(q_local, q_local + dim);
			bestX = x;
			bestY = y;
			no_improve = 0;
		}
		else ++no_improve;
		return f;
		};
	const float f_a = evalAt(a);
	const float f_b = evalAt(b);
	const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
	const int K = static_cast<int>(Kf);
	H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
	const int rank = g_world->rank();
	const int world = g_world->size();
	alignas(16) float seeds[256 * 32];
	const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
	int i = 0;
	while (i < seedCnt) {
		const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
		const float t_seed = map.pointToT(s);
		const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
		const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
		const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
		if (t2 > t1) {
			alignas(16) float q1[32];
			alignas(16) float q2[32];
			float x1;
			float y1;
			float x2;
			float y2;
			map.map01ToPoint(t1, q1);
			const float f1 = cost(q1, x1, y1);
			map.map01ToPoint(t2, q2);
			const float f2 = cost(q2, x2, y2);
			IntervalND* I = new IntervalND(t1, t2, f1, f2);
			I->i1 = t_to_idx(t1);
			I->i2 = t_to_idx(t2);
			I->diam = map.block_diameter(I->i1, I->i2);
			I->compute_span_level(map);
			I->set_metric(I->diam);
			update_pockets_and_Mmax(I);
			I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
			if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
			else {
				const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
				const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
				const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
				I->R = fmaf(I->R, mult, 0.0f);
			}
			H.emplace_back(I);
			std::push_heap(H.begin(), H.end(), ComparePtrND);
			if (f1 < bestF) {
				bestF = f1;
				bestQ.assign(q1, q1 + dim);
				bestX = x1;
				bestY = y1;
			}
			if (f2 < bestF) {
				bestF = f2;
				bestQ.assign(q2, q2 + dim);
				bestX = x2;
				bestY = y2;
			}
		}
		++i;
	}
	float prev_t = a;
	float prev_f = f_a;
	int k = 1;
	while (k <= K) {
		const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
		const float f = evalAt(t);
		IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
		I->i1 = t_to_idx(prev_t);
		I->i2 = t_to_idx(t);
		I->diam = map.block_diameter(I->i1, I->i2);
		I->compute_span_level(map);
		I->set_metric(I->diam);
		update_pockets_and_Mmax(I);
		I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
		H.emplace_back(I);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		prev_t = t;
		prev_f = f;
		++k;
	}
	IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
	tail->i1 = t_to_idx(prev_t);
	tail->i2 = t_to_idx(b);
	tail->diam = map.block_diameter(tail->i1, tail->i2);
	tail->compute_span_level(map);
	tail->set_metric(tail->diam);
	update_pockets_and_Mmax(tail);
	tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
	H.emplace_back(tail);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
	const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
	while (it < maxIter) {
		p = fmaf(-1.0f / initial_len, dmax, 1.0f);
		const float p_arg = fmaf(p, 2.3f, -2.9775f);
		const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
		const bool staganition = no_improve > noImproveThrDim;
		if (!(it % kickEveryDim) || staganition) {
			const float t_best = map.pointToT(bestQ.data());
			const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
			const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
			int ii = 0;
			while (ii < 2) {
				const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
				const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
				const float f_seed = evalAt(t_seed);
				IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
				J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
				J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
				J->diam = map.block_diameter(J->i1, J->i2);
				J->compute_span_level(map);
				J->set_metric(J->diam);
				update_pockets_and_Mmax(J);
				J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
				H.emplace_back(J);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
				++ii;
			}
			no_improve = 0;
		}
		const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
		adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
		const float exp_arg = fmaf(B_dim, p, 0.0f);
		adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
		if (dim < 3) {
			const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
			const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
			adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
			adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
		}
		const float arg_for_T = fmaf(p, 4.5f, 2.475f);
		const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
		const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
		const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
		const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
		std::pop_heap(H.begin(), H.end(), ComparePtrND);
		IntervalND* cur = H.back();
		H.pop_back();
		const float x1 = cur->x1;
		const float x2 = cur->x2;
		const float y1 = cur->y1;
		const float y2 = cur->y2;
		float m = fmaf(r_eff, Mmax, 0.0f);
		float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
		const float bestFOld = bestF;
		const float fNew = evalAt(tNew);
		IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
		IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
		L->i1 = t_to_idx(x1);
		L->i2 = t_to_idx(tNew);
		Rv->i1 = t_to_idx(tNew);
		Rv->i2 = t_to_idx(x2);
		L->diam = map.block_diameter(L->i1, L->i2);
		Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
		L->compute_span_level(map);
		Rv->compute_span_level(map);
		L->set_metric(L->diam);
		Rv->set_metric(Rv->diam);
		const float Mloc = fmaxf(L->M, Rv->M);
		update_pockets_and_Mmax(L);
		update_pockets_and_Mmax(Rv);
		const float prevMmax = Mmax;
		if (Mloc > Mmax) Mmax = Mloc;
		m = fmaf(r_eff, Mmax, 0.0f);
		if (adaptive) {
			const float len1 = fmaf(tNew, 1.0f, -x1);
			const float len2 = fmaf(x2, 1.0f, -tNew);
			if (fmaf(len1, 1.0f, len2) == dmax) {
				dmax = fmaxf(len1, len2);
				for (auto pI : H) {
					const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
					if (Ls > dmax) dmax = Ls;
				}
			}
			if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
				const float alpha = p * p;
				const float beta = fmaf(-alpha, 1.0f, 2.0f);
				const float MULT = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULT, r_eff, -MULT);
				const float GF = beta * global_coeff;
				L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
				Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
				const size_t sz = H.size();
				RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
			else {
				if (Mloc > prevMmax) {
					L->ChangeCharacteristic(m);
					Rv->ChangeCharacteristic(m);
					if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
						const size_t sz = H.size();
						RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
						std::make_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
				else {
					L->ChangeCharacteristic(m);
					Rv->ChangeCharacteristic(m);
				}
			}
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
				if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
					const size_t sz = H.size();
					RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
			}
		}
		H.emplace_back(L);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		H.emplace_back(Rv);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		_mm_prefetch((const char*)H[0], _MM_HINT_T0);
		_mm_prefetch((const char*)H[1], _MM_HINT_T0);
		IntervalND* const top = H.front();
		const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
		if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
			out_iterations = static_cast<size_t>(it);
			out_achieved_epsilon = interval_len;
			return;
		}
		if (!(it % T) || staganition) {
			MultiCrossMsg out;
			const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
			out.count = intervals_to_send;
			float* dest = out.intervals;
			unsigned ii = 0;
			while (ii < 3) {
				IntervalND* Tt = H[ii];
				dest[0] = Tt->x1;
				dest[1] = 0.0f;
				dest[2] = Tt->x2;
				dest[3] = 0.0f;
				dest[4] = Tt->R;
				dest += 5;
				++ii;
			}
			const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
			bool active = true;
			const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
			size_t ii2 = 0;
			while (ii2 < iterations && active) {
				const size_t step = 1ULL << ii2;
				const int partner = rank ^ static_cast<int>(step);
				if (partner < world) {
					const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
					if (am_sender) {
						g_world->isend(partner, 0, out);
						active = false;
					}
				}
				++ii2;
			}
			++exchange_counter_T;
		}
		const float improvement_threshold = 2.03f - adaptive_coeff;
		if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) {
			BestSolutionMsg out;
			out.bestF = bestF;
			out.bestX = bestX;
			out.bestY = bestY;
			out.dim = static_cast<unsigned>(bestQ.size());
			memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
			{
				IntervalND* const I0 = H[0];
				out.bestI[0] = I0->x1;
				out.bestI[1] = I0->x2;
				out.bestI[2] = I0->y1;
				out.bestI[3] = I0->y2;
				if (dim > 2) {
					IntervalND* const I1 = H[1];
					out.bestI[4] = I1->x1;
					out.bestI[5] = I1->x2;
					out.bestI[6] = I1->y1;
					out.bestI[7] = I1->y2;
				}
			}
			const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
			bool active = true;
			const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
			size_t ii2 = 0;
			while (ii2 < iterations && active) {
				const size_t step = 1ULL << ii2;
				const int partner = rank ^ static_cast<int>(step);
				if (partner < world) {
					const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
					if (am_sender) {
						g_world->isend(partner, 2, out);
						active = false;
					}
				}
				++ii2;
			}
			++exchange_counter;
		}
		while (g_world->iprobe(boost::mpi::any_source, 0)) {
			MultiCrossMsg in;
			g_world->recv(boost::mpi::any_source, 0, in);
			const MultiCrossMsg& mX = in;
			unsigned ii = 0;
			while (ii < mX.count) {
				const float* d = &mX.intervals[ii * 5];
				float sx = d[0];
				float ex = d[2];
				if (ex > sx) {
					alignas(16) float tmp[32];
					float tx;
					float ty;
					map.map01ToPoint(sx, tmp);
					const float y1i = cost(tmp, tx, ty);
					map.map01ToPoint(ex, tmp);
					const float y2i = cost(tmp, tx, ty);
					IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
					inj->i1 = t_to_idx(sx);
					inj->i2 = t_to_idx(ex);
					inj->diam = map.block_diameter(inj->i1, inj->i2);
					inj->compute_span_level(map);
					inj->set_metric(inj->diam);
					update_pockets_and_Mmax(inj);
					inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
					_mm_prefetch((const char*)H[0], _MM_HINT_T0);
					_mm_prefetch((const char*)H[1], _MM_HINT_T0);
					IntervalND* topH = H.front();
					const float topH_R = topH->R;
					const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
					if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
						const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
						const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
						const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
						float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
						if (dim < 3) {
							const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
							adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
						}
						const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
						inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
						H.emplace_back(inj);
						std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
				++ii;
			}
		}
		while (g_world->iprobe(boost::mpi::any_source, 2)) {
			BestSolutionMsg bm;
			g_world->recv(boost::mpi::any_source, 2, bm);
			if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
				if (dim > 2) {
					_mm_prefetch((const char*)H[0], _MM_HINT_T0);
					_mm_prefetch((const char*)H[1], _MM_HINT_T0);
					const float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
					const float exp_arg = B_dim;
					const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
					const float adaptive_coeff_clone1 = fmaf(-exp_val, adaptive_coeff_addition, A_dim_clone);
					const float coeff0 = fmaf(adaptive_coeff_clone0, adaptive_coeff, 0.0f);
					const float coeff1 = fmaf(adaptive_coeff_clone1, adaptive_coeff, 0.0f);
					{
						const float sx = bm.bestI[0];
						const float ex = bm.bestI[1];
						const float y1i = bm.bestI[2];
						const float y2i = bm.bestI[3];
						IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
						I->i1 = t_to_idx(sx);
						I->i2 = t_to_idx(ex);
						I->diam = map.block_diameter(I->i1, I->i2);
						I->compute_span_level(map);
						I->set_metric(I->diam);
						update_pockets_and_Mmax(I);
						I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
						I->R = fmaf(I->R, coeff0, 0.0f);
						H.emplace_back(I);
						std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
					{
						const float sx = bm.bestI[4];
						const float ex = bm.bestI[5];
						const float y1i = bm.bestI[6];
						const float y2i = bm.bestI[7];
						IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
						I->i1 = t_to_idx(sx);
						I->i2 = t_to_idx(ex);
						I->diam = map.block_diameter(I->i1, I->i2);
						I->compute_span_level(map);
						I->set_metric(I->diam);
						update_pockets_and_Mmax(I);
						I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
						I->R = fmaf(I->R, coeff1, 0.0f);
						H.emplace_back(I);
						std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
				else {
					_mm_prefetch((const char*)H[0], _MM_HINT_T0);
					const float sx = bm.bestI[0];
					const float ex = bm.bestI[1];
					const float y1i = bm.bestI[2];
					const float y2i = bm.bestI[3];
					IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
					I->i1 = t_to_idx(sx);
					I->i2 = t_to_idx(ex);
					I->diam = map.block_diameter(I->i1, I->i2);
					I->compute_span_level(map);
					I->set_metric(I->diam);
					update_pockets_and_Mmax(I);
					I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
					I->R = fmaf(I->R, adaptive_coeff, 0.0f);
					H.emplace_back(I);
					std::push_heap(H.begin(), H.end(), ComparePtrND);
				}
				if (bm.bestF < bestF) {
					bestF = bm.bestF;
					bestX = bm.bestX;
					bestY = bm.bestY;
					bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
				}
			}
		}
		++it;
	}
}

"__declspec(align(16)) struct CrossMsg final { fl...

Pergunta

Pensamento

Modifying MPI message structure for optimization

Clarifying message structure for interval exchange

Choosing best intervals for type2 exchange

Sending and receiving type2 intervals with correct characteristic updates

Recomputing interval characteristics and updating formula for type2

Updating message structure and handling R value in type2

Handling scaling and message structure for type2

Handling type2 message structure and prefetch

Sending and receiving intervals efficiently

Optimizing interval transmission with y1/y2 values

Designing interval message for type2 with optimization in mind

Deciding interval structure for efficient message handling

Optimizing message structure and interval handling

Handling interval selection and adaptive scaling

Implementing interval handling and message updates

Updating interval handling and serialization

Resposta

Compartilhar esta Q&A