"static __declspec(noalias) void agp_run_branch_mp...

Criado em: 17 de dezembro de 2025

Respondido usando GPT-5.2 Thinking por Chat01

Pergunta

"static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = (float)dim;
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve((size_t)dim);
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = (unsigned long long)fmaf(t, (float)map.scale, 0.0f);
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = (int)fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f);
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = (int)fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), (float)(threshold_iters_grad >> 2), 0.0f);
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, (size_t)dim * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, (size_t)dim * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf((float)sgn, delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
const int K = (int)Kf;
H.reserve((size_t)maxIter + (size_t)K + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, (unsigned)fmaf((float)rank, 7919.0f, (float)seed));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + (size_t)fmaf((float)i, 32.0f, 0.0f);
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, (float)dim, 0.0f) : fmaf(fmaf(0.00031f, (float)dim, 0.0f), exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * (float)(i - 3)), 0.0f);
const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
if (t2 > t1) {
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, (float)dim, 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, (float)dim, 0.0f);
const float end_mult = fmaf(0.174f, (float)dim, 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * (float)(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, (float)k / (float)(K + 1), a), 1.0f, (float)rank / (float)(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(boost::math::pdf(boost::math::normal(0.5f, 0.17f), p), sqrtf(fmaf(dim_f, 1.825f, 0.0f)), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = (size_t)it;
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width((size_t)(world - 1));
bool active = true;
const bool invert_T = ((int)fmaf((float)exchange_counter_T, 1.0f, 1.0f)) & 1;
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ (int)step;
if (partner < world) {
const bool am_sender = ((!!(rank & (int)step)) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
if (bestF < fmaf(bestFOld, 2.03f - adaptive_coeff, 0.0f) && !(it % 2)) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = (unsigned)bestQ.size();
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
const size_t iterations = std::bit_width((size_t)(world - 1));
bool active = true;
const bool invert_T = ((int)fmaf((float)exchange_counter, 1.0f, 1.0f)) & 1;
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ (int)step;
if (partner < world) {
const bool am_sender = ((!!(rank & (int)step)) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
if (inj->R > fmaf(adaptive_coeff, topH->R, -0.03f)) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
inj->R = fmaf(d[4], kf, 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) {
alignas(16) float tmp_q[32];
memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
const float t_best = map.pointToT(tmp_q);
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best));
const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best));
if (t2 > t1) {
alignas(16) float tq1[32];
alignas(16) float tq2[32];
float xx1;
float yy1;
float xx2;
float yy2;
map.map01ToPoint(t1, tq1);
const float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2);
const float f2 = cost(tq2, xx2, yy2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - сейчас в случае перспективного интервала - отправляется только один интервал при одиночных пересылках - нужно сделать так, чтобы при размерности меньше 3 - отправлялся один лучший интервал так как это реализовано сейчас, при размерности три и больше отправлять всегда фиксировано два лучших интервала в случае перспективного интервала, но реализовать закон убывающей полезности - сейчас при отправке (и последующем приёме) одного лучшего интервала на принимающей стороне его характеристика преобразуется как "I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);" - и такой закон преобразования характеристик нужно сохранить для случая размерностей меньше 3, но в случае размерности 3 и больше по закону убывающей полезности должно происходить следующее: "первый лучший интервал должен преобразовываться по тому же правилу но помимо этого он должен умножаться на значение адаптивного коэффициента adaptive_coeff но заметь этот адаптивный коэффициент вычисляется по прогрессу - есть добавка зависящая от прогресса, домножаемая и складываемая с основной частью - таким образом формируется адаптивный коэффициент в зависимости от прогресса - но нужно теперь хранить такую же в точности добавку с тем же функционалом но потом подставлять вместо прогресса 0 - для первого лучшего интервала, 1 - для второго лучшего интервала (он уже не самый лучший интервал) - логично что коэффициент для него должен быть меньше, т.е. по факту нужно создать клон адаптивного коэффициента - но заметь что основа этого коэффициента зависящая от размерности а не от прогресса должна сохраниться - так будет выгоднее чтобы не дублировать функционал - используем ту же основу но добавляем к ней другую добавку зависящую формально также от прогресса но вместо прогресса подставляем 0 для первого лучшего интервала и 1 для второго как финальную версию прогресса так как второй интервал будет последним в пачке пересылаемых, но важный нюанс - клон адаптивного коэффициента должен будет быть сдвинут вниз по оси ординат если его рисовать как функцию от прогресса на величину равную разнице максимального и минимального значения коэффициента делённую пополам - это обеспечит что пересылаемые интервалы не будут сильно завышаться по характеристикам, и этот клон адаптивного коэффициента но зависимый от того какой это лучший интервал (первый лучший из кучи - то есть действительно самый лучший, второй лучший сразу после первого) нужно будет домножать через fmaf на характеристику уже пересчитанную по формуле "I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);" - то есть пересчитываем так и домножаем через fmaf дополнительно на клон адаптивного коэффициента. НО более того нужно будет еще и домножать на этот клон интервалы которые пересылаются при крупном обмене - сразу несколько интервалов в пачке пересылаются - там после того как характеристика будет изменена по правилу: " inj->R = fmaf(d[4], kf, 0.0f);" - нужно будет дополнительно после этого домножить характеристику через fmaf на клон адаптивного коэффициента - только в этом случае ЗАМЕТЬ БУДЕТ ОТЛИЧИЕ В АРГУМЕНТАХ КЛОНА АДАПТИВНОГО КОЭФФИЦИЕНТА - ЕСЛИ РАНЬШЕ МЫ ПЕРЕДАВАЛИ ТОЛЬКО 0 И 1 ТАК КАК БЫЛО ТОЛЬКО ДВА ИНТЕРВАЛА В СЛУЧАЕ РАЗМЕРНОСТИ БОЛЬШЕ 2 ПРИ ПЕРЕСЫЛКЕ ПО УСЛОВИЮ ПЕРСПЕКТИВНОГО ИНТЕРВАЛА - В ЭТОМ ЖЕ СЛУЧАЕ НАПРИМЕР ЕСЛИ ПЕРЕСЫЛАЕТСЯ ПАЧКА ИЗ ТРЁХ ИНТЕРВАЛОВ ТО АРГУМЕНТЫ У КЛОНА АДАПТИВНОГО КОЭФФИЦИЕНТА БУДУТ 0, 0.5, 1 СООТВЕТСВЕННО, В СЛУЧАЕ ПАЧКИ ИЗ 4 ИНТЕРВАЛОВ 0, 0.333, 0.666, 1 СООТВЕТСВЕННО - И АНАЛОГИЧНО ПРИ БОЛЬШИХ РАЗМЕРАХ ПАЧКИ АРГУМЕНТЫ БУДУТ МЕНЯТЬСЯ, ДЛЯ ПАЧКИ ИЗ 5 ИНТЕРВАЛОВ 0, 0.25, 0.5, 0.75, 1 И ТАК ДАЛЕЕ ПО АНАЛОГИИ - ПОМНИ О ТОМ ЧТО КЛОН АДАПТИВНОГО КОЭФФИЦИЕНТА ДОЛЖЕН БЫТЬ СДВИНУТ ВНИЗ ПО ОРДИНАТЕ НА ВЕЛИЧИНУ РАВНУЮ ПОЛУРАЗНИЦЕ МАКСИМАЛЬНОГО И МИНИМАЛЬНОГО ЗНАЧЕНИЙ АДАПТИВНОГО КОЭФФИЦИЕНТА ПРИ ПРОГРЕССАХ 0 И 1 СООТВЕТСТВЕННО - ПРИМЕНИ УКАЗАННЫЕ МНОЙ ИЗМЕНЕНИЯ В МОЙ КОД И ВЕРНИ МНЕ ПОЛНУЮ ИСПРАВЛЕННУЮ ВЕРСИЮ КОДА - ПРИДЕРЖИВАЙСЯ МОЕГО СТИЛЯ МАКСИМАЛЬНОЙ ОПТИМИЗАЦИИ ПОД СКОРОСТЬ КОДА И ПРИДЕРЖИВАЙСЯ ТЕХ ЖЕ ПРАВИЛ ПО ОФОРМЛЕНИЮ КОДА, ВЫЧИСЛЕНИЯ ПО ВОЗМОЖНОСТИ ИНЛАЙНЬ, ПОСТАРАЙСЯ ЧТОБЫ ЭТО БЫЛО МАКСИМАЛЬНО БЫСТРО - ТАК ЧТО ДЕЛАЙ ИНЛАЙН FMAF НАДСТРОЙКИ НАД СУЩЕСТВУЮЩИМИ ФОРМУЛАМИ - ЧТОБЫ ПРОСТО FMAF ФОРМУЛЫ СТАЛИ ДЛИННЕЕ А НЕ ПОЯВИЛИСЬ НОВЫЕ, ДАЖЕ ПРОСТЫЕ УМНОЖЕНИЯ ДВУХ ЧИСЕЛ ВСЕГДА РЕАЛИЗУЙ ЧЕРЕЗ FMAF - МНЕ ВЕРНИ ПОЛНУЮ ИСПРАВЛЕННУЮ ВЕРСИЮ КОДА БЕЗ ПРОПУСКОВ И КОММЕНТАРИЕВ
"

Pensamento

Examining logic behind interval transfer

The code snippet fills just three intervals despite setting out.count = intervals_to_send. This seems to be a limit, maybe due to truncation. The user mentioned "крупном обмене", implying multiple intervals should be sent. It looks like the top 3 are sent using a binary pattern. On the receiver side, the best solution is used to create new intervals.

Clarifying interval sending for different dimensions

The user asks to send one best interval when the dimension is less than 3, and two best intervals when the dimension is 3 or more—this applies if the interval is promising. It seems that the MultiCrossMsg is involved. I need to double-check whether the code currently follows this logic. The message suggests a specific path triggered by T for promising intervals.

Interpreting promising interval and refining behavior

It looks like "перспективный интервал" might refer to sending intervals from a partner. Earlier code suggests a larger exchange ("крупный обмен"), while another path triggers sending the best solution when certain conditions are met. So, for dimensions under 3, I’ll send one best interval, and for dimensions over 3, I’ll always send two best intervals in the case of a promising one.

Handling two best intervals in a single send

The user asked to send two best intervals, not solutions. The current BestSolutionMsg structure contains one solution, but I could adapt it to send two entries—perhaps by sending two messages sequentially or modifying the structure slightly. Since a single "send" is requested, I’ll aim to send a message containing two intervals.

Modifying message types for best intervals

I might need to modify the BestSolutionMsg to send two intervals instead of one. However, I could package these as a MultiCrossMsg, though that's typically used for efficient periodic exchanges. Since the user wants fixed intervals for dimensions ≥ 3, I'll send those two intervals, applying a scaling transformation for each based on the progress.

Understanding adaptive coefficient adjustment

I’ve realized I need to use the same base adaptive coefficient but adjust it based on progress. Specifically, I’ll store the same addition value but use progress 0 for the first interval and 1 for the second. So, I need to create a clone of the adaptive coefficient, keeping its functional form but modifying its addition part depending on progress. The coefficient uses a polynomial approximation of exp for its calculation, which helps adjust based on progress.

Computing the adaptive coefficient clone

To compute the clone coefficient, I need to use z (0 or 1) in place of p. The coefficient will be based on the same B_dim, poly equation, and adaptive_coeff_addition. Additionally, the clone should be shifted down by half the difference between the maximum (at progress 0) and minimum (at progress 1) values of the coefficient (max-min)/2. So, the formula for the clone is clone(z) = coeff(z) - 0.5*(coeff(0)-coeff(1)).

Creating the clone coefficient and applying transformations

The clone coefficient for the first interval uses progress 0 and is shifted down by half the difference between the maximum and minimum values at progress 0 and 1. For the second interval, progress 1 is used, and it’s similarly shifted down. Then, this clone is multiplied with the transformed characteristic using the standard formula. For interval packing, each interval is adjusted based on its position within the pack.

Handling interval transformations and sending two best intervals

For each received interval, I compute a value z based on its position in the pack. Then I apply the clone coefficient based on that value. For dimensions less than 3, I'll proceed without the clone, but for dim ≥ 3, I'll apply the clone multiplication. Additionally, when sending, we need to send two best intervals instead of just one for dim ≥ 3.

Choosing best intervals and sending two

To choose the best intervals, I'll likely select from the top intervals in the heap based on characteristic R. Currently, we send the best solution when it's significantly improved. However, the user wants two best intervals sent when a "perspective interval" occurs. This must relate to promising intervals, perhaps when a new top interval is created or an interval is split. I'll need to send the two best intervals from the heap, not just the best solution.

Deciding how to send two best intervals

To handle sending the two best intervals, the process is a bit tricky. For dimensions 3 or greater, the user wants me to send two best intervals in a "promising interval" situation, not just the best solution. This means modifying the current system, either by using multiple messages or adjusting the message struct for multiple intervals.

Choosing how to send two intervals for dim>=3

For dimensions 3 or higher, we might need to send two intervals, not just the best one. We can pass the best interval and second best from the heap. Instead of sending a solution, we could send bestQ and the midpoint from the second-best interval, reconstructed on the receiver side. This would meet the requirements for "promising interval."

Deciding how to send two best intervals for dim>=3

For dimensions 3 or higher, we'll send two intervals: the global best and the second-best. We can calculate the second best using the heap's top two intervals. The second best can be approximated by popping the heap twice, but ensuring heap properties are restored. We’ll compute cost at midpoint for second interval.

Sending two intervals with different tags in dim>=3

For dim >= 3, I will send two intervals: the first with tag2 and the second with tag3. The receiver will probe both tags in order, ensuring correct mapping by treating tag2 as slot0 and tag3 as slot1. This ensures proper ordering and avoids any interleaving issues, while maintaining simple changes to tags.

Clarifying clone and adaptive coefficient usage for intervals

I think the key here is that we will multiply with the clone of adaptive_coeff for both intervals, using the progress values fixed (0 for the first and 1 for the second). It’s a bit tricky, but I’m focusing on keeping the base formula intact and only adjusting via the clone derived from the same basis but with progress changes.

Structuring interval sending and receiving logic

The plan involves modifying interval sending and receiving for multi-dimensional cases. We’ll adjust the sending logic to correctly match the number of intervals needed, compute adaptive coefficients, and handle ‘clone coefficients’ based on interval position. For receiving, we’ll adjust transformations according to the dimension, ensuring proper multiplication of the intervals post-reception, especially for higher dimensions.

Computing and applying clone coefficients

The plan is to compute the clone adaptive coefficient using a specific formula for each iteration, based on constants and the dimensions provided. For intervals with z values in the range of 0 to 1, we adjust them by multiplying the interval’s R values by the calculated clone coefficient. For sending intervals in big exchanges, no immediate change is required, but adjustments might be needed during receiving.

Finalizing coefficient adjustments

The transformation must occur during the receiving stage, including base and clone multiplications. After setting inj->R, we apply further multiplication with the calculated clone. For multi-interval tasks or tag-specific cases, we ensure coefficients are computed efficiently, especially when handling intervals' adaptive coefficients and shifts, ensuring proper transforms and multiplications.

Handling interval sending and ordering

We need to ensure that intervals sent during exchanges are properly ordered by their rank — specifically, using the top intervals based on their R value. This involves carefully selecting and sorting the intervals before transmission, making sure to respect the specifications for sending the two best ones, even when only their rank is used for comparison.

Setting up interval modifications and message sending

We need to send the best and second-best intervals, with the first one being "bestQ" and the second chosen based on the second-best in the heap, excluding bestQ. This fits with the conditions and ensures proper use of the clone coefficient. I'll adjust the sending protocol and ensure the function is modified correctly, without commentary.

Clarifying best interval handling for messages

The first message should keep sending the best solution (bestQ). For the second, we’ll extract the top two intervals from the heap, using their midpoints, to introduce diversity without altering the first message’s content.

For dimensions >=3, we’ll send two intervals while adjusting how the receiver handles them by applying specific transforms with coefficients.

Ensuring safe message packing and sending

We'll handle the message struct by calculating the capacity of MultiCrossMsg using a compile-time constant that prevents overflows. While the capacity might vary, we'll clamp send_cnt at 16 to be safe, based on typical packing sizes. Additionally, for larger dimension cases, we'll adjust accordingly and send two messages when needed, ensuring we stick to the correct partner for each tag.

Handling message sends and coefficient calculations

We'll compute the second message before the loop to ensure it's ready. Since Boost.MPI's isend likely performs a deep copy, we'll ensure the messages remain valid until completion. We'll use inlined coefficient computations with fmaf and preserve existing formulas for efficiency. Adjustments will be made to ensure optimal packing and handling of intervals during the send process.

Addressing message order and coefficient handling in exchanges

For the message order, we’ll prioritize the best intervals as requested, adjusting the order to fit the requirements. We'll use precomputed variables like inv_mxc to avoid redundant calculations, and handle coefficient clones based on dimension, implementing the needed logic for tag3 messages and respecting the necessary checks. This should maintain the integrity while improving the process.

Handling message format and heap selections based on dimension

For dim >= 3, I'll send two best intervals by extracting the top two from the heap and sending their midpoints, as required. For dim < 3, I'll maintain the original rule and continue sending only bestQ. If the heap has only one interval, it will be sent twice. This should meet the spec while preserving logic.

Implementing updated solution for heap intervals and coefficient application

I've outlined how to extract top intervals from the heap for dim >= 3, calculating the midpoints, then sending the corresponding messages. I’ll implement adjustments for the adaptive coefficient, ensuring the clone is applied as described. The logic for message injection and handling will be preserved within the new dimensions, and no new floating-point multiplications will be required where they're unnecessary.

Thought for 10m 22s

Resposta

cpp
static __declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = (float)dim; unsigned exchange_counter = 0; unsigned exchange_counter_T = 0; alignas(16) float M_by_span[12]; int msi = 0; while (msi < 12) M_by_span[msi++] = M_prior; float Mmax = M_prior; alignas(16) float q_local[32]; alignas(16) float phi[32]; alignas(16) float s_arr[32]; alignas(16) float c_arr[32]; alignas(16) float sum_s[32]; alignas(16) float sum_c[32]; alignas(16) float q_try[32]; bestQ.reserve((size_t)dim); float x = 0.0f; float y = 0.0f; int no_improve = 0; auto t_to_idx = [&](float t) -> unsigned long long { unsigned long long idx = (unsigned long long)fmaf(t, (float)map.scale, 0.0f); return idx; }; auto update_pockets_and_Mmax = [&](IntervalND* I) { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = fmaf(b, 1.0f, -a); const float initial_len = dmax; float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f); const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f); const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f); float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f; const float log_arg = dim_f + 5.0f; const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f); const float B_dim = fmaf(A_dim, 0.65f, 0.0f); const float log_argument = A_dim - 2.03f; const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; const float clone_poly_B = fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float clone_coeff0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim); const float clone_coeff1 = fmaf(-clone_poly_B, adaptive_coeff_addition, A_dim); const float clone_max = fmaxf(clone_coeff0, clone_coeff1); const float clone_min = fminf(clone_coeff0, clone_coeff1); const float clone_shift_down = fmaf(fmaf(clone_max, 1.0f, -clone_min), 0.5f, 0.0f); auto adaptive_coeff_clone = [&](float prog) -> float { const float exp_arg_c = fmaf(B_dim, prog, 0.0f); const float poly_c = fmaf(exp_arg_c, fmaf(exp_arg_c, fmaf(exp_arg_c, fmaf(exp_arg_c, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float coeff_c = fmaf(-poly_c, adaptive_coeff_addition, A_dim); return fmaf(coeff_c, 1.0f, -clone_shift_down); }; int it = 0; auto evalAt = [&](float t) -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); const float step_arg = fmaf(-dim_f, 0.825f, 3.3f); float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f); if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) { float acc = 0.0f; int ii = 0; while (ii < n) { acc = fmaf(q_local[ii], 1.0f, acc); phi[ii] = acc; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float as = 0.0f; float ac = 0.0f; int k = n - 1; while (k >= 0) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as = fmaf(Lk, s_arr[k], as); ac = fmaf(Lk, c_arr[k], ac); sum_s[k] = as; sum_c[k] = ac; --k; } const float dx = fmaf(x, 1.0f, -cost.targetX); const float dy = fmaf(y, 1.0f, -cost.targetY); const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f; int stepI = 0; const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f); const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f); const int threshold_iters_grad = (int)fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f); int no_improve_steps = 0; float f_at_check_start = f; int steps_since_check = 0; const int check_interval = (int)fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), (float)(threshold_iters_grad >> 2), 0.0f); while (stepI < threshold_iters_grad) { int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; { const float ai = fabsf(q_local[i]); const float v = fmaf(ai, 1.0f, -cost.minTheta); if (v > 0.0f) { const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f); const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i])); gpen = fmaf(dpen_dtheta, 1.0f, gpen); } } { const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f); gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen); } const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen); q_try[i] = fmaf(-eta, g, q_local[i]); const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; if (q_try[i] < lo) q_try[i] = lo; else if (q_try[i] > hi) q_try[i] = hi; ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f); float v = fmaf(-eta, g, q_local[n + j]); if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f; q_try[n + j] = v; ++j; } } float x2; float y2; const float f2 = cost(q_try, x2, y2); const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f); if (f2 < f) { memcpy(q_local, q_try, (size_t)dim * sizeof(float)); f = f2; x = x2; y = y2; eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f); } else { const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement; eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f); memcpy(q_try, q_local, (size_t)dim * sizeof(float)); ++no_improve_steps; } ++steps_since_check; if (steps_since_check == check_interval) { if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break; f_at_check_start = f; steps_since_check = 0; } if (no_improve_steps > threshold_iters_grad >> 2) break; ++stepI; } } if (f < bestF) { if (0.95f < p || dim < 3) { const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f; float saved = q_local[last]; float delta = eta; while (delta > fmaf(delta, 0.15f, 0.0f)) { int sgn = -1; while (sgn < 2) { float cand = fmaf((float)sgn, delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2; float y2; const float f2 = cost(q_local, x2, y2); if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; } q_local[last] = backup; sgn += 2; } delta = fmaf(delta, 0.5f, 0.0f); } if (bestLocF < f) { q_local[last] = saved; f = bestLocF; } } bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else ++no_improve; return f; }; const float f_a = evalAt(a); const float f_b = evalAt(b); const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f); const int K = (int)Kf; H.reserve((size_t)maxIter + (size_t)K + 16u); const int rank = g_world->rank(); const int world = g_world->size(); alignas(16) float seeds[256 * 32]; const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, (unsigned)fmaf((float)rank, 7919.0f, (float)seed)); int i = 0; while (i < seedCnt) { const float* s = seeds + (size_t)fmaf((float)i, 32.0f, 0.0f); const float t_seed = map.pointToT(s); const float interval_size = (i < 3) ? fmaf(0.0004f, (float)dim, 0.0f) : fmaf(fmaf(0.00031f, (float)dim, 0.0f), exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * (float)(i - 3)), 0.0f); const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed)); const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed)); if (t2 > t1) { alignas(16) float q1[32]; alignas(16) float q2[32]; float x1; float y1; float x2; float y2; map.map01ToPoint(t1, q1); const float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); const float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, (float)dim, 0.85f), 0.0f); else { const float start_mult = fmaf(0.214f, (float)dim, 0.0f); const float end_mult = fmaf(0.174f, (float)dim, 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * (float)(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; } if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; } } ++i; } float prev_t = a; float prev_f = f_a; int k = 1; while (k <= K) { const float t = fmaf(fmaf(b - a, (float)k / (float)(K + 1), a), 1.0f, (float)rank / (float)(world * (K + 1))); const float f = evalAt(t); IntervalND* I = new IntervalND(prev_t, t, prev_f, f); I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); prev_t = t; prev_f = f; ++k; } IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b); tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b); tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam); update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND); const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f)); const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f)); while (it < maxIter) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); const float p_arg = fmaf(p, 2.3f, -2.9775f); const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f); const bool staganition = no_improve > noImproveThrDim; if (!(it % kickEveryDim) || staganition) { const float t_best = map.pointToT(bestQ.data()); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); int ii = 0; while (ii < 2) { const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f); const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best))); const float f_seed = evalAt(t_seed); IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed); J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed)); J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed)); J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam); update_pockets_and_Mmax(J); J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f); H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND); ++ii; } no_improve = 0; } exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f); adaptive_coeff_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f); const float exp_arg = fmaf(B_dim, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff - 1.0f; const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f; adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg); } const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f); const float exp2_exp_arg = fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float A = fmaf(39.995f, exp2_exp_arg, 0.0f); const int T = static_cast<int>(fmaf(fmaf(boost::math::pdf(boost::math::normal(0.5f, 0.17f), p), sqrtf(fmaf(dim_f, 1.825f, 0.0f)), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f)); std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff); const float bestFOld = bestF; const float fNew = evalAt(tNew); IntervalND* L = new IntervalND(x1, tNew, y1, fNew); IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2); L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew); Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2); L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2); L->compute_span_level(map); Rv->compute_span_level(map); L->set_metric(L->diam); Rv->set_metric(Rv->diam); const float Mloc = fmaxf(L->M, Rv->M); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc; m = fmaf(r_eff, Mmax, 0.0f); if (adaptive) { const float len1 = fmaf(tNew, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -tNew); if (fmaf(len1, 1.0f, len2) == dmax) { dmax = fmaxf(len1, len2); for (auto pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* const top = H.front(); const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam; if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) { out_iterations = (size_t)it; out_achieved_epsilon = interval_len; return; } if (!(it % T) || staganition) { MultiCrossMsg out; unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))); const unsigned cap = (unsigned)(sizeof(out.intervals) / (sizeof(out.intervals[0]) * 5u)); if (intervals_to_send > cap) intervals_to_send = cap; const unsigned szH = (unsigned)H.size(); if (intervals_to_send > szH) intervals_to_send = szH; out.count = intervals_to_send; IntervalND* picked[64]; unsigned pick_i = 0; while (pick_i < intervals_to_send) { std::pop_heap(H.begin(), H.end(), ComparePtrND); picked[pick_i] = H.back(); H.pop_back(); ++pick_i; } float* dest = out.intervals; unsigned fill_i = 0; while (fill_i < intervals_to_send) { IntervalND* Tt = picked[fill_i]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; ++fill_i; } int restore_i = (int)intervals_to_send - 1; while (restore_i >= 0) { H.emplace_back(picked[restore_i]); std::push_heap(H.begin(), H.end(), ComparePtrND); --restore_i; } const size_t iterations = std::bit_width((size_t)(world - 1)); bool active = true; const bool invert_T = ((int)fmaf((float)exchange_counter_T, 1.0f, 1.0f)) & 1; size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 0, out); active = false; } } ++ii2; } ++exchange_counter_T; } if (bestF < fmaf(bestFOld, 2.03f - adaptive_coeff, 0.0f) && !(it % 2)) { const size_t iterations = std::bit_width((size_t)(world - 1)); bool active = true; const bool invert_T = ((int)fmaf((float)exchange_counter, 1.0f, 1.0f)) & 1; if (dim > 2) { IntervalND* I0; IntervalND* I1; std::pop_heap(H.begin(), H.end(), ComparePtrND); I0 = H.back(); H.pop_back(); bool have_second = false; if (!H.empty()) { std::pop_heap(H.begin(), H.end(), ComparePtrND); I1 = H.back(); H.pop_back(); have_second = true; } else { I1 = I0; } alignas(16) float q0[32]; alignas(16) float q1p[32]; float x0; float y0; float x1p; float y1p; const float t0 = fmaf(I0->x1, 0.5f, fmaf(I0->x2, 0.5f, 0.0f)); const float t1p = fmaf(I1->x1, 0.5f, fmaf(I1->x2, 0.5f, 0.0f)); map.map01ToPoint(t0, q0); const float f0 = cost(q0, x0, y0); map.map01ToPoint(t1p, q1p); const float f1 = cost(q1p, x1p, y1p); if (have_second) { H.emplace_back(I1); std::push_heap(H.begin(), H.end(), ComparePtrND); } H.emplace_back(I0); std::push_heap(H.begin(), H.end(), ComparePtrND); BestSolutionMsg out0; out0.bestF = f0; out0.bestX = x0; out0.bestY = y0; out0.dim = (unsigned)dim; memcpy(out0.bestQ, q0, (size_t)dim * sizeof(float)); BestSolutionMsg out1; out1.bestF = f1; out1.bestX = x1p; out1.bestY = y1p; out1.dim = (unsigned)dim; memcpy(out1.bestQ, q1p, (size_t)dim * sizeof(float)); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out0); g_world->isend(partner, 3, out1); active = false; } } ++ii2; } } else { BestSolutionMsg out; out.bestF = bestF; out.bestX = bestX; out.bestY = bestY; out.dim = (unsigned)bestQ.size(); memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float)); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out); active = false; } } ++ii2; } } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); const MultiCrossMsg& mX = in; const unsigned mxc = mX.count; const float inv_mxc = (mxc > 1u) ? (1.0f / (float)(mxc - 1u)) : 0.0f; unsigned ii = 0; while (ii < mX.count) { const float* d = &mX.intervals[ii * 5]; float sx = d[0]; float ex = d[2]; if (ex > sx) { alignas(16) float tmp[32]; float tx; float ty; map.map01ToPoint(sx, tmp); const float y1i = cost(tmp, tx, ty); map.map01ToPoint(ex, tmp); const float y2i = cost(tmp, tx, ty); IntervalND* inj = new IntervalND(sx, ex, y1i, y2i); inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex); inj->diam = map.block_diameter(inj->i1, inj->i2); inj->compute_span_level(map); inj->set_metric(inj->diam); update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* topH = H.front(); if (inj->R > fmaf(adaptive_coeff, topH->R, -0.03f)) { const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f; const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f); inj->R = fmaf(d[4], kf, 0.0f); if (dim > 2) { const float prog = fmaf((float)ii, inv_mxc, 0.0f); inj->R = fmaf(inj->R, adaptive_coeff_clone(prog), 0.0f); } H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); } } ++ii; } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 2, bm); if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) { alignas(16) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float)); const float t_best = map.pointToT(tmp_q); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best)); const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best)); if (t2 > t1) { alignas(16) float tq1[32]; alignas(16) float tq2[32]; float xx1; float yy1; float xx2; float yy2; map.map01ToPoint(t1, tq1); const float f1 = cost(tq1, xx1, yy1); map.map01ToPoint(t2, tq2); const float f2 = cost(tq2, xx2, yy2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f); if (dim > 2) I->R = fmaf(I->R, adaptive_coeff_clone(0.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } while (g_world->iprobe(boost::mpi::any_source, 3)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 3, bm); if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) { alignas(16) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float)); const float t_best = map.pointToT(tmp_q); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best)); const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best)); if (t2 > t1) { alignas(16) float tq1[32]; alignas(16) float tq2[32]; float xx1; float yy1; float xx2; float yy2; map.map01ToPoint(t1, tq1); const float f1 = cost(tq1, xx1, yy1); map.map01ToPoint(t2, tq2); const float f2 = cost(tq2, xx2, yy2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f); if (dim > 2) I->R = fmaf(I->R, adaptive_coeff_clone(1.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } ++it; } }

Compartilhar esta Q&A