"__declspec(align(16)) struct CrossMsg final { fl...
Criado em: 19 de dezembro de 2025
Respondido usando GPT-5.2 Thinking por Chat01
Criado em: 19 de dezembro de 2025
Respondido usando GPT-5.2 Thinking por Chat01
"__declspec(align(16)) struct CrossMsg final {
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;
texttemplate <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};
__declspec(align(16)) struct MultiCrossMsg final {
float intervals[35];
unsigned count;
texttemplate <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& intervals& count; }
};
__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX;
float bestY;
float bestQ[32];
unsigned dim;
texttemplate <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim; }
}; static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf(static_cast<float>(sgn), delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
const int K = static_cast<int>(Kf);
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
if (t2 > t1) {
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float arg_for_T = fmaf(p, 4.5f, 2.475f);
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = static_cast<size_t>(it);
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
const float improvement_threshold = 2.03f - adaptive_coeff;
if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
const float topH_R = topH->R;
const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}
const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
alignas(16) float tmp_q[32];
memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
const float t_best = map.pointToT(tmp_q);
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
const float t1 = fmaf(-adaptive_diameter, 0.2f, t_best);
const float t2 = fmaf(adaptive_diameter, 0.2f, t_best);
alignas(16) float tq1[32];
alignas(16) float tq2[32];
float xx1;
float yy1;
float xx2;
float yy2;
map.map01ToPoint(t1, tq1);
const float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2);
const float f2 = cost(tq2, xx2, yy2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - внимательно рассмотри этот код, тут два принципиально отличающихся типа mpi обменов - первый для исследования поискового пространства, второй для уточнения, проблема сейчас в том что при приёме второго типа сообщений создаются интервалы очень малого диаметра что вводит алгоритм глобального поиска в заблуждение так как создаются очень малые интервалы с большой характеристикой - что нужно сделать: условие для отправки сообщений второго типа трогать не нужно, условие внутри цикла приёма сообщений второго типа для рассмотрения точки как кандидата: "if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {" - также трогать не нужно, но теперь нужно чтобы при входе в это условие не создавался малый интервал вокруг лучшей точки а нужно теперь помимо только лишь лучшей точки в сообщениях второго типа передавать также лучший интервал если размерность меньше 3, и два лучших интервала если размерность больше 2, то есть все условие что в отправке что в приёме при рассмотрении точки как перспективного кандидата в сообщениях второго типа - остаются такими же - но теперь помимо точки передаются 1 или 2 лучших интервала, и что потом должно происходить с этими интервалалами: в текущем коде у вновь созданных интервалов при приёме сообщений второго типа характеристика меняется как I->R = fmaf(I->R, adaptive_coeff, 0.0f);, представим теперь что I это переданный интервал, тогда сначала его характеристика должна меняться аналогично тому как она сначала меняется в сообщениях первого типа inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); затем если передавался только 1 лучший интервал - то его характеристика должна будет измениться по закону I->R = fmaf(I->R, adaptive_coeff, 0.0f); - в случае же если передавалось два лучших интервала то дополнительно характеристика каждого из них должна будет умножиться на adaptive_coeff_clone - который будет вводится в сообщениях второго типа по полной аналогии с сообщениями первого типа с тем отличием что его не надо будет смягчать при размерности меньше 3 как это сейчас делается в сообщениях первого типа, то есть вот этого: " if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}" - делать не надо, зато надо также как и в сообщениях первого типа делать _mm_prefetch((const char*)H[0 или (0 и затем 1 - если 2 интервала пересылается)], _MM_HINT_T0); то есть итоговая формула изменений характеристик в сообщениях второго типа будет: "I->R = fmaf(I->R, fmaf(adaptive_coeff_clone, adaptive_coeff, 0.0f), 0.0f);". Заметь в случае с сообщениями первого типа adaptive_coeff_clone зависит от числа передаваемых интервалов через "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)" - тут же сделай проще, используй вместо этой формулы: "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)": 0 для первого интервала, 1 для второго интервала, в случае когда их двое, в случае одного интервала напоминаю домножать на adaptive_coeff_clone не нужно, не добавляй лишних проверок в код по типу проверок на непустоту, эти кучи гарантировано не пусти, не делай никаких проверок на вылет за пределы диапазона и прочее - проверки запрещены, модифицируй мой код в соответствии с указанными требованиями - причём заметь - изменения в код должны быть в моём стиле максимально оптимизированного кода: заботься о скорости работы кода и думай о максимальной производительности, так что инлайнь вычисления, загружай регистры и умножители процессора через вложенные сложные длинные формулы с fmaf, старайся делать все вычисления in_place - минимум обращений к памяти - всё в регистрах, инструкции размещай чтобы обеспечить оптимальную загрузку CPU, минимум кеш-промахов, локальность доступа к данным, если кешируешь переменнные - кешируй их как можно ближе к месту использования, возможно потребуется внести изменения в сериализируемые MPI структуры чтобы сделать это максимально быстро по скорости - мне пришли полную исправленную версию кода без пропусков и комментариев, в полном соответствии со всеми моими требованиями
cpp__declspec(align(16)) struct CrossMsg final { float s_x1; float s_x2; float e_x1; float e_x2; float Rtop; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; } }; __declspec(align(16)) struct MultiCrossMsg final { float intervals[35]; unsigned count; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& intervals& count; } }; __declspec(align(16)) struct BestSolutionMsg final { float bestF; float bestX; float bestY; float bestQ[32]; float bestI[8]; unsigned dim; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& bestI& dim; } }; static __declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = static_cast<float>(dim); unsigned exchange_counter = 0; unsigned exchange_counter_T = 0; alignas(16) float M_by_span[12]; int msi = 0; while (msi < 12) M_by_span[msi++] = M_prior; float Mmax = M_prior; alignas(16) float q_local[32]; alignas(16) float phi[32]; alignas(16) float s_arr[32]; alignas(16) float c_arr[32]; alignas(16) float sum_s[32]; alignas(16) float sum_c[32]; alignas(16) float q_try[32]; bestQ.reserve(static_cast<size_t>(dim)); float x = 0.0f; float y = 0.0f; int no_improve = 0; auto t_to_idx = [&](float t) -> unsigned long long { unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); return idx; }; auto update_pockets_and_Mmax = [&](IntervalND* I) { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = fmaf(b, 1.0f, -a); const float initial_len = dmax; float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f); const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f); const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f); float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f; const float log_arg = dim_f + 5.0f; const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f); const float B_dim = fmaf(A_dim, 0.65f, 0.0f); const float log_argument = A_dim - 2.03f; const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f); int it = 0; auto evalAt = [&](float t) -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); const float step_arg = fmaf(-dim_f, 0.825f, 3.3f); float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f); if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) { float acc = 0.0f; int ii = 0; while (ii < n) { acc = fmaf(q_local[ii], 1.0f, acc); phi[ii] = acc; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float as = 0.0f; float ac = 0.0f; int k = n - 1; while (k >= 0) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as = fmaf(Lk, s_arr[k], as); ac = fmaf(Lk, c_arr[k], ac); sum_s[k] = as; sum_c[k] = ac; --k; } const float dx = fmaf(x, 1.0f, -cost.targetX); const float dy = fmaf(y, 1.0f, -cost.targetY); const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f; int stepI = 0; const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f); const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f); const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f)); int no_improve_steps = 0; float f_at_check_start = f; int steps_since_check = 0; const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f)); while (stepI < threshold_iters_grad) { int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; { const float ai = fabsf(q_local[i]); const float v = fmaf(ai, 1.0f, -cost.minTheta); if (v > 0.0f) { const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f); const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i])); gpen = fmaf(dpen_dtheta, 1.0f, gpen); } } { const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f); gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen); } const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen); q_try[i] = fmaf(-eta, g, q_local[i]); const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; if (q_try[i] < lo) q_try[i] = lo; else if (q_try[i] > hi) q_try[i] = hi; ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f); float v = fmaf(-eta, g, q_local[n + j]); if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f; q_try[n + j] = v; ++j; } } float x2; float y2; const float f2 = cost(q_try, x2, y2); const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f); if (f2 < f) { memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float)); f = f2; x = x2; y = y2; eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f); } else { const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement; eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f); memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float)); ++no_improve_steps; } ++steps_since_check; if (steps_since_check == check_interval) { if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break; f_at_check_start = f; steps_since_check = 0; } if (no_improve_steps > threshold_iters_grad >> 2) break; ++stepI; } } if (f < bestF) { if (0.95f < p || dim < 3) { const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f; float saved = q_local[last]; float delta = eta; while (delta > fmaf(delta, 0.15f, 0.0f)) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2; float y2; const float f2 = cost(q_local, x2, y2); if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; } q_local[last] = backup; sgn += 2; } delta = fmaf(delta, 0.5f, 0.0f); } if (bestLocF < f) { q_local[last] = saved; f = bestLocF; } } bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else ++no_improve; return f; }; const float f_a = evalAt(a); const float f_b = evalAt(b); const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f); const int K = static_cast<int>(Kf); H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u); const int rank = g_world->rank(); const int world = g_world->size(); alignas(16) float seeds[256 * 32]; const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed)))); int i = 0; while (i < seedCnt) { const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f)); const float t_seed = map.pointToT(s); const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed)); const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed)); if (t2 > t1) { alignas(16) float q1[32]; alignas(16) float q2[32]; float x1; float y1; float x2; float y2; map.map01ToPoint(t1, q1); const float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); const float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; } if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; } } ++i; } float prev_t = a; float prev_f = f_a; int k = 1; while (k <= K) { const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1))); const float f = evalAt(t); IntervalND* I = new IntervalND(prev_t, t, prev_f, f); I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); prev_t = t; prev_f = f; ++k; } IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b); tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b); tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam); update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND); const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f)); const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f)); while (it < maxIter) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); const float p_arg = fmaf(p, 2.3f, -2.9775f); const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f); const bool staganition = no_improve > noImproveThrDim; if (!(it % kickEveryDim) || staganition) { const float t_best = map.pointToT(bestQ.data()); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); int ii = 0; while (ii < 2) { const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f); const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best))); const float f_seed = evalAt(t_seed); IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed); J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed)); J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed)); J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam); update_pockets_and_Mmax(J); J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f); H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND); ++ii; } no_improve = 0; } const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f); adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f); const float exp_arg = fmaf(B_dim, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff - 1.0f; const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f; adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg); } const float arg_for_T = fmaf(p, 4.5f, 2.475f); const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f); const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float A = fmaf(39.995f, exp2_exp_arg, 0.0f); const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f)); std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff); const float bestFOld = bestF; const float fNew = evalAt(tNew); IntervalND* L = new IntervalND(x1, tNew, y1, fNew); IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2); L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew); Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2); L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2); L->compute_span_level(map); Rv->compute_span_level(map); L->set_metric(L->diam); Rv->set_metric(Rv->diam); const float Mloc = fmaxf(L->M, Rv->M); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc; m = fmaf(r_eff, Mmax, 0.0f); if (adaptive) { const float len1 = fmaf(tNew, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -tNew); if (fmaf(len1, 1.0f, len2) == dmax) { dmax = fmaxf(len1, len2); for (auto pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* const top = H.front(); const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam; if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) { out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } if (!(it % T) || staganition) { MultiCrossMsg out; const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))); out.count = intervals_to_send; float* dest = out.intervals; unsigned ii = 0; while (ii < 3) { IntervalND* Tt = H[ii]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; ++ii; } const size_t iterations = std::bit_width(static_cast<size_t>(world - 1)); bool active = true; const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_world->isend(partner, 0, out); active = false; } } ++ii2; } ++exchange_counter_T; } const float improvement_threshold = 2.03f - adaptive_coeff; if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) { BestSolutionMsg out; out.bestF = bestF; out.bestX = bestX; out.bestY = bestY; out.dim = static_cast<unsigned>(bestQ.size()); memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float)); { IntervalND* const I0 = H[0]; out.bestI[0] = I0->x1; out.bestI[1] = I0->x2; out.bestI[2] = I0->y1; out.bestI[3] = I0->y2; if (dim > 2) { IntervalND* const I1 = H[1]; out.bestI[4] = I1->x1; out.bestI[5] = I1->x2; out.bestI[6] = I1->y1; out.bestI[7] = I1->y2; } } const size_t iterations = std::bit_width(static_cast<size_t>(world - 1)); bool active = true; const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out); active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); const MultiCrossMsg& mX = in; unsigned ii = 0; while (ii < mX.count) { const float* d = &mX.intervals[ii * 5]; float sx = d[0]; float ex = d[2]; if (ex > sx) { alignas(16) float tmp[32]; float tx; float ty; map.map01ToPoint(sx, tmp); const float y1i = cost(tmp, tx, ty); map.map01ToPoint(ex, tmp); const float y2i = cost(tmp, tx, ty); IntervalND* inj = new IntervalND(sx, ex, y1i, y2i); inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex); inj->diam = map.block_diameter(inj->i1, inj->i2); inj->compute_span_level(map); inj->set_metric(inj->diam); update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* topH = H.front(); const float topH_R = topH->R; const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f); if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) { const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f; const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f); const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f); float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f; adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); } const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f); inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); } } ++ii; } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 2, bm); if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) { if (dim > 2) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); const float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone); const float exp_arg = B_dim; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float adaptive_coeff_clone1 = fmaf(-exp_val, adaptive_coeff_addition, A_dim_clone); const float coeff0 = fmaf(adaptive_coeff_clone0, adaptive_coeff, 0.0f); const float coeff1 = fmaf(adaptive_coeff_clone1, adaptive_coeff, 0.0f); { const float sx = bm.bestI[0]; const float ex = bm.bestI[1]; const float y1i = bm.bestI[2]; const float y2i = bm.bestI[3]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, coeff0, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } { const float sx = bm.bestI[4]; const float ex = bm.bestI[5]; const float y1i = bm.bestI[6]; const float y2i = bm.bestI[7]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, coeff1, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } } else { _mm_prefetch((const char*)H[0], _MM_HINT_T0); const float sx = bm.bestI[0]; const float ex = bm.bestI[1]; const float y1i = bm.bestI[2]; const float y2i = bm.bestI[3]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, adaptive_coeff, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } ++it; } }