"static __declspec(noalias) void agp_run_branch_mp...
Criado em: 17 de dezembro de 2025
Respondido usando GPT-5.2 Thinking por Chat01
Criado em: 17 de dezembro de 2025
Respondido usando GPT-5.2 Thinking por Chat01
"static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = (float)dim;
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve((size_t)dim);
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = (unsigned long long)fmaf(t, (float)map.scale, 0.0f);
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = (int)fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f);
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = (int)fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), (float)(threshold_iters_grad >> 2), 0.0f);
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, (size_t)dim * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, (size_t)dim * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf((float)sgn, delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
const int K = (int)Kf;
H.reserve((size_t)maxIter + (size_t)K + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, (unsigned)fmaf((float)rank, 7919.0f, (float)seed));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + (size_t)fmaf((float)i, 32.0f, 0.0f);
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, (float)dim, 0.0f) : fmaf(fmaf(0.00031f, (float)dim, 0.0f), exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * (float)(i - 3)), 0.0f);
const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
if (t2 > t1) {
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, (float)dim, 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, (float)dim, 0.0f);
const float end_mult = fmaf(0.174f, (float)dim, 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * (float)(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, (float)k / (float)(K + 1), a), 1.0f, (float)rank / (float)(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(boost::math::pdf(boost::math::normal(0.5f, 0.17f), p), sqrtf(fmaf(dim_f, 1.825f, 0.0f)), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = (size_t)it;
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width((size_t)(world - 1));
bool active = true;
const bool invert_T = ((int)fmaf((float)exchange_counter_T, 1.0f, 1.0f)) & 1;
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ (int)step;
if (partner < world) {
const bool am_sender = ((!!(rank & (int)step)) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
if (bestF < fmaf(bestFOld, 2.03f - adaptive_coeff, 0.0f) && !(it % 2)) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = (unsigned)bestQ.size();
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
const size_t iterations = std::bit_width((size_t)(world - 1));
bool active = true;
const bool invert_T = ((int)fmaf((float)exchange_counter, 1.0f, 1.0f)) & 1;
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ (int)step;
if (partner < world) {
const bool am_sender = ((!!(rank & (int)step)) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
if (inj->R > fmaf(adaptive_coeff, topH->R, -0.03f)) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
inj->R = fmaf(d[4], kf, 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) {
alignas(16) float tmp_q[32];
memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
const float t_best = map.pointToT(tmp_q);
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best));
const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best));
if (t2 > t1) {
alignas(16) float tq1[32];
alignas(16) float tq2[32];
float xx1;
float yy1;
float xx2;
float yy2;
map.map01ToPoint(t1, tq1);
const float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2);
const float f2 = cost(tq2, xx2, yy2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - сейчас в случае перспективного интервала - отправляется только один интервал при одиночных пересылках - нужно сделать так, чтобы при размерности меньше 3 - отправлялся один лучший интервал так как это реализовано сейчас, при размерности три и больше отправлять всегда фиксировано два лучших интервала в случае перспективного интервала, но реализовать закон убывающей полезности - сейчас при отправке (и последующем приёме) одного лучшего интервала на принимающей стороне его характеристика преобразуется как "I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);" - и такой закон преобразования характеристик нужно сохранить для случая размерностей меньше 3, но в случае размерности 3 и больше по закону убывающей полезности должно происходить следующее: "первый лучший интервал должен преобразовываться по тому же правилу но помимо этого он должен умножаться на значение адаптивного коэффициента adaptive_coeff но заметь этот адаптивный коэффициент вычисляется по прогрессу - есть добавка зависящая от прогресса, домножаемая и складываемая с основной частью - таким образом формируется адаптивный коэффициент в зависимости от прогресса - но нужно теперь хранить такую же в точности добавку с тем же функционалом но потом подставлять вместо прогресса 0 - для первого лучшего интервала, 1 - для второго лучшего интервала (он уже не самый лучший интервал) - логично что коэффициент для него должен быть меньше, т.е. по факту нужно создать клон адаптивного коэффициента - но заметь что основа этого коэффициента зависящая от размерности а не от прогресса должна сохраниться - так будет выгоднее чтобы не дублировать функционал - используем ту же основу но добавляем к ней другую добавку зависящую формально также от прогресса но вместо прогресса подставляем 0 для первого лучшего интервала и 1 для второго как финальную версию прогресса так как второй интервал будет последним в пачке пересылаемых, но важный нюанс - клон адаптивного коэффициента должен будет быть сдвинут вниз по оси ординат если его рисовать как функцию от прогресса на величину равную разнице максимального и минимального значения коэффициента делённую пополам - это обеспечит что пересылаемые интервалы не будут сильно завышаться по характеристикам, и этот клон адаптивного коэффициента но зависимый от того какой это лучший интервал (первый лучший из кучи - то есть действительно самый лучший, второй лучший сразу после первого) нужно будет домножать через fmaf на характеристику уже пересчитанную по формуле "I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f);" - то есть пересчитываем так и домножаем через fmaf дополнительно на клон адаптивного коэффициента. НО более того нужно будет еще и домножать на этот клон интервалы которые пересылаются при крупном обмене - сразу несколько интервалов в пачке пересылаются - там после того как характеристика будет изменена по правилу: " inj->R = fmaf(d[4], kf, 0.0f);" - нужно будет дополнительно после этого домножить характеристику через fmaf на клон адаптивного коэффициента - только в этом случае ЗАМЕТЬ БУДЕТ ОТЛИЧИЕ В АРГУМЕНТАХ КЛОНА АДАПТИВНОГО КОЭФФИЦИЕНТА - ЕСЛИ РАНЬШЕ МЫ ПЕРЕДАВАЛИ ТОЛЬКО 0 И 1 ТАК КАК БЫЛО ТОЛЬКО ДВА ИНТЕРВАЛА В СЛУЧАЕ РАЗМЕРНОСТИ БОЛЬШЕ 2 ПРИ ПЕРЕСЫЛКЕ ПО УСЛОВИЮ ПЕРСПЕКТИВНОГО ИНТЕРВАЛА - В ЭТОМ ЖЕ СЛУЧАЕ НАПРИМЕР ЕСЛИ ПЕРЕСЫЛАЕТСЯ ПАЧКА ИЗ ТРЁХ ИНТЕРВАЛОВ ТО АРГУМЕНТЫ У КЛОНА АДАПТИВНОГО КОЭФФИЦИЕНТА БУДУТ 0, 0.5, 1 СООТВЕТСВЕННО, В СЛУЧАЕ ПАЧКИ ИЗ 4 ИНТЕРВАЛОВ 0, 0.333, 0.666, 1 СООТВЕТСВЕННО - И АНАЛОГИЧНО ПРИ БОЛЬШИХ РАЗМЕРАХ ПАЧКИ АРГУМЕНТЫ БУДУТ МЕНЯТЬСЯ, ДЛЯ ПАЧКИ ИЗ 5 ИНТЕРВАЛОВ 0, 0.25, 0.5, 0.75, 1 И ТАК ДАЛЕЕ ПО АНАЛОГИИ - ПОМНИ О ТОМ ЧТО КЛОН АДАПТИВНОГО КОЭФФИЦИЕНТА ДОЛЖЕН БЫТЬ СДВИНУТ ВНИЗ ПО ОРДИНАТЕ НА ВЕЛИЧИНУ РАВНУЮ ПОЛУРАЗНИЦЕ МАКСИМАЛЬНОГО И МИНИМАЛЬНОГО ЗНАЧЕНИЙ АДАПТИВНОГО КОЭФФИЦИЕНТА ПРИ ПРОГРЕССАХ 0 И 1 СООТВЕТСТВЕННО - ПРИМЕНИ УКАЗАННЫЕ МНОЙ ИЗМЕНЕНИЯ В МОЙ КОД И ВЕРНИ МНЕ ПОЛНУЮ ИСПРАВЛЕННУЮ ВЕРСИЮ КОДА - ПРИДЕРЖИВАЙСЯ МОЕГО СТИЛЯ МАКСИМАЛЬНОЙ ОПТИМИЗАЦИИ ПОД СКОРОСТЬ КОДА И ПРИДЕРЖИВАЙСЯ ТЕХ ЖЕ ПРАВИЛ ПО ОФОРМЛЕНИЮ КОДА, ВЫЧИСЛЕНИЯ ПО ВОЗМОЖНОСТИ ИНЛАЙНЬ, ПОСТАРАЙСЯ ЧТОБЫ ЭТО БЫЛО МАКСИМАЛЬНО БЫСТРО - ТАК ЧТО ДЕЛАЙ ИНЛАЙН FMAF НАДСТРОЙКИ НАД СУЩЕСТВУЮЩИМИ ФОРМУЛАМИ - ЧТОБЫ ПРОСТО FMAF ФОРМУЛЫ СТАЛИ ДЛИННЕЕ А НЕ ПОЯВИЛИСЬ НОВЫЕ, ДАЖЕ ПРОСТЫЕ УМНОЖЕНИЯ ДВУХ ЧИСЕЛ ВСЕГДА РЕАЛИЗУЙ ЧЕРЕЗ FMAF - МНЕ ВЕРНИ ПОЛНУЮ ИСПРАВЛЕННУЮ ВЕРСИЮ КОДА БЕЗ ПРОПУСКОВ И КОММЕНТАРИЕВ
"
cppstatic __declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = (float)dim; unsigned exchange_counter = 0; unsigned exchange_counter_T = 0; alignas(16) float M_by_span[12]; int msi = 0; while (msi < 12) M_by_span[msi++] = M_prior; float Mmax = M_prior; alignas(16) float q_local[32]; alignas(16) float phi[32]; alignas(16) float s_arr[32]; alignas(16) float c_arr[32]; alignas(16) float sum_s[32]; alignas(16) float sum_c[32]; alignas(16) float q_try[32]; bestQ.reserve((size_t)dim); float x = 0.0f; float y = 0.0f; int no_improve = 0; auto t_to_idx = [&](float t) -> unsigned long long { unsigned long long idx = (unsigned long long)fmaf(t, (float)map.scale, 0.0f); return idx; }; auto update_pockets_and_Mmax = [&](IntervalND* I) { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = fmaf(b, 1.0f, -a); const float initial_len = dmax; float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f); const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f); const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f); float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f; const float log_arg = dim_f + 5.0f; const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f); const float B_dim = fmaf(A_dim, 0.65f, 0.0f); const float log_argument = A_dim - 2.03f; const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; const float clone_poly_B = fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float clone_coeff0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim); const float clone_coeff1 = fmaf(-clone_poly_B, adaptive_coeff_addition, A_dim); const float clone_max = fmaxf(clone_coeff0, clone_coeff1); const float clone_min = fminf(clone_coeff0, clone_coeff1); const float clone_shift_down = fmaf(fmaf(clone_max, 1.0f, -clone_min), 0.5f, 0.0f); auto adaptive_coeff_clone = [&](float prog) -> float { const float exp_arg_c = fmaf(B_dim, prog, 0.0f); const float poly_c = fmaf(exp_arg_c, fmaf(exp_arg_c, fmaf(exp_arg_c, fmaf(exp_arg_c, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float coeff_c = fmaf(-poly_c, adaptive_coeff_addition, A_dim); return fmaf(coeff_c, 1.0f, -clone_shift_down); }; int it = 0; auto evalAt = [&](float t) -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); const float step_arg = fmaf(-dim_f, 0.825f, 3.3f); float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f); if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) { float acc = 0.0f; int ii = 0; while (ii < n) { acc = fmaf(q_local[ii], 1.0f, acc); phi[ii] = acc; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float as = 0.0f; float ac = 0.0f; int k = n - 1; while (k >= 0) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as = fmaf(Lk, s_arr[k], as); ac = fmaf(Lk, c_arr[k], ac); sum_s[k] = as; sum_c[k] = ac; --k; } const float dx = fmaf(x, 1.0f, -cost.targetX); const float dy = fmaf(y, 1.0f, -cost.targetY); const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f; int stepI = 0; const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f); const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f); const int threshold_iters_grad = (int)fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f); int no_improve_steps = 0; float f_at_check_start = f; int steps_since_check = 0; const int check_interval = (int)fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), (float)(threshold_iters_grad >> 2), 0.0f); while (stepI < threshold_iters_grad) { int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; { const float ai = fabsf(q_local[i]); const float v = fmaf(ai, 1.0f, -cost.minTheta); if (v > 0.0f) { const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f); const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i])); gpen = fmaf(dpen_dtheta, 1.0f, gpen); } } { const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f); gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen); } const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen); q_try[i] = fmaf(-eta, g, q_local[i]); const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; if (q_try[i] < lo) q_try[i] = lo; else if (q_try[i] > hi) q_try[i] = hi; ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f); float v = fmaf(-eta, g, q_local[n + j]); if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f; q_try[n + j] = v; ++j; } } float x2; float y2; const float f2 = cost(q_try, x2, y2); const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f); if (f2 < f) { memcpy(q_local, q_try, (size_t)dim * sizeof(float)); f = f2; x = x2; y = y2; eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f); } else { const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement; eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f); memcpy(q_try, q_local, (size_t)dim * sizeof(float)); ++no_improve_steps; } ++steps_since_check; if (steps_since_check == check_interval) { if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break; f_at_check_start = f; steps_since_check = 0; } if (no_improve_steps > threshold_iters_grad >> 2) break; ++stepI; } } if (f < bestF) { if (0.95f < p || dim < 3) { const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f; float saved = q_local[last]; float delta = eta; while (delta > fmaf(delta, 0.15f, 0.0f)) { int sgn = -1; while (sgn < 2) { float cand = fmaf((float)sgn, delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2; float y2; const float f2 = cost(q_local, x2, y2); if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; } q_local[last] = backup; sgn += 2; } delta = fmaf(delta, 0.5f, 0.0f); } if (bestLocF < f) { q_local[last] = saved; f = bestLocF; } } bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else ++no_improve; return f; }; const float f_a = evalAt(a); const float f_b = evalAt(b); const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f); const int K = (int)Kf; H.reserve((size_t)maxIter + (size_t)K + 16u); const int rank = g_world->rank(); const int world = g_world->size(); alignas(16) float seeds[256 * 32]; const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, (unsigned)fmaf((float)rank, 7919.0f, (float)seed)); int i = 0; while (i < seedCnt) { const float* s = seeds + (size_t)fmaf((float)i, 32.0f, 0.0f); const float t_seed = map.pointToT(s); const float interval_size = (i < 3) ? fmaf(0.0004f, (float)dim, 0.0f) : fmaf(fmaf(0.00031f, (float)dim, 0.0f), exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * (float)(i - 3)), 0.0f); const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed)); const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed)); if (t2 > t1) { alignas(16) float q1[32]; alignas(16) float q2[32]; float x1; float y1; float x2; float y2; map.map01ToPoint(t1, q1); const float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); const float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, (float)dim, 0.85f), 0.0f); else { const float start_mult = fmaf(0.214f, (float)dim, 0.0f); const float end_mult = fmaf(0.174f, (float)dim, 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / (float)(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * (float)(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; } if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; } } ++i; } float prev_t = a; float prev_f = f_a; int k = 1; while (k <= K) { const float t = fmaf(fmaf(b - a, (float)k / (float)(K + 1), a), 1.0f, (float)rank / (float)(world * (K + 1))); const float f = evalAt(t); IntervalND* I = new IntervalND(prev_t, t, prev_f, f); I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); prev_t = t; prev_f = f; ++k; } IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b); tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b); tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam); update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND); const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f)); const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f)); while (it < maxIter) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); const float p_arg = fmaf(p, 2.3f, -2.9775f); const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f); const bool staganition = no_improve > noImproveThrDim; if (!(it % kickEveryDim) || staganition) { const float t_best = map.pointToT(bestQ.data()); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); int ii = 0; while (ii < 2) { const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f); const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best))); const float f_seed = evalAt(t_seed); IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed); J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed)); J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed)); J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam); update_pockets_and_Mmax(J); J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f); H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND); ++ii; } no_improve = 0; } exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f); adaptive_coeff_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f); const float exp_arg = fmaf(B_dim, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff - 1.0f; const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f; adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg); } const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f); const float exp2_exp_arg = fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, fmaf(exp_argument * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float A = fmaf(39.995f, exp2_exp_arg, 0.0f); const int T = static_cast<int>(fmaf(fmaf(boost::math::pdf(boost::math::normal(0.5f, 0.17f), p), sqrtf(fmaf(dim_f, 1.825f, 0.0f)), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f)); std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff); const float bestFOld = bestF; const float fNew = evalAt(tNew); IntervalND* L = new IntervalND(x1, tNew, y1, fNew); IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2); L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew); Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2); L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2); L->compute_span_level(map); Rv->compute_span_level(map); L->set_metric(L->diam); Rv->set_metric(Rv->diam); const float Mloc = fmaxf(L->M, Rv->M); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc; m = fmaf(r_eff, Mmax, 0.0f); if (adaptive) { const float len1 = fmaf(tNew, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -tNew); if (fmaf(len1, 1.0f, len2) == dmax) { dmax = fmaxf(len1, len2); for (auto pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* const top = H.front(); const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam; if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) { out_iterations = (size_t)it; out_achieved_epsilon = interval_len; return; } if (!(it % T) || staganition) { MultiCrossMsg out; unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))); const unsigned cap = (unsigned)(sizeof(out.intervals) / (sizeof(out.intervals[0]) * 5u)); if (intervals_to_send > cap) intervals_to_send = cap; const unsigned szH = (unsigned)H.size(); if (intervals_to_send > szH) intervals_to_send = szH; out.count = intervals_to_send; IntervalND* picked[64]; unsigned pick_i = 0; while (pick_i < intervals_to_send) { std::pop_heap(H.begin(), H.end(), ComparePtrND); picked[pick_i] = H.back(); H.pop_back(); ++pick_i; } float* dest = out.intervals; unsigned fill_i = 0; while (fill_i < intervals_to_send) { IntervalND* Tt = picked[fill_i]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; ++fill_i; } int restore_i = (int)intervals_to_send - 1; while (restore_i >= 0) { H.emplace_back(picked[restore_i]); std::push_heap(H.begin(), H.end(), ComparePtrND); --restore_i; } const size_t iterations = std::bit_width((size_t)(world - 1)); bool active = true; const bool invert_T = ((int)fmaf((float)exchange_counter_T, 1.0f, 1.0f)) & 1; size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 0, out); active = false; } } ++ii2; } ++exchange_counter_T; } if (bestF < fmaf(bestFOld, 2.03f - adaptive_coeff, 0.0f) && !(it % 2)) { const size_t iterations = std::bit_width((size_t)(world - 1)); bool active = true; const bool invert_T = ((int)fmaf((float)exchange_counter, 1.0f, 1.0f)) & 1; if (dim > 2) { IntervalND* I0; IntervalND* I1; std::pop_heap(H.begin(), H.end(), ComparePtrND); I0 = H.back(); H.pop_back(); bool have_second = false; if (!H.empty()) { std::pop_heap(H.begin(), H.end(), ComparePtrND); I1 = H.back(); H.pop_back(); have_second = true; } else { I1 = I0; } alignas(16) float q0[32]; alignas(16) float q1p[32]; float x0; float y0; float x1p; float y1p; const float t0 = fmaf(I0->x1, 0.5f, fmaf(I0->x2, 0.5f, 0.0f)); const float t1p = fmaf(I1->x1, 0.5f, fmaf(I1->x2, 0.5f, 0.0f)); map.map01ToPoint(t0, q0); const float f0 = cost(q0, x0, y0); map.map01ToPoint(t1p, q1p); const float f1 = cost(q1p, x1p, y1p); if (have_second) { H.emplace_back(I1); std::push_heap(H.begin(), H.end(), ComparePtrND); } H.emplace_back(I0); std::push_heap(H.begin(), H.end(), ComparePtrND); BestSolutionMsg out0; out0.bestF = f0; out0.bestX = x0; out0.bestY = y0; out0.dim = (unsigned)dim; memcpy(out0.bestQ, q0, (size_t)dim * sizeof(float)); BestSolutionMsg out1; out1.bestF = f1; out1.bestX = x1p; out1.bestY = y1p; out1.dim = (unsigned)dim; memcpy(out1.bestQ, q1p, (size_t)dim * sizeof(float)); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out0); g_world->isend(partner, 3, out1); active = false; } } ++ii2; } } else { BestSolutionMsg out; out.bestF = bestF; out.bestX = bestX; out.bestY = bestY; out.dim = (unsigned)bestQ.size(); memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float)); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ (int)step; if (partner < world) { const bool am_sender = ((!!(rank & (int)step)) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out); active = false; } } ++ii2; } } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); const MultiCrossMsg& mX = in; const unsigned mxc = mX.count; const float inv_mxc = (mxc > 1u) ? (1.0f / (float)(mxc - 1u)) : 0.0f; unsigned ii = 0; while (ii < mX.count) { const float* d = &mX.intervals[ii * 5]; float sx = d[0]; float ex = d[2]; if (ex > sx) { alignas(16) float tmp[32]; float tx; float ty; map.map01ToPoint(sx, tmp); const float y1i = cost(tmp, tx, ty); map.map01ToPoint(ex, tmp); const float y2i = cost(tmp, tx, ty); IntervalND* inj = new IntervalND(sx, ex, y1i, y2i); inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex); inj->diam = map.block_diameter(inj->i1, inj->i2); inj->compute_span_level(map); inj->set_metric(inj->diam); update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* topH = H.front(); if (inj->R > fmaf(adaptive_coeff, topH->R, -0.03f)) { const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f; const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f); inj->R = fmaf(d[4], kf, 0.0f); if (dim > 2) { const float prog = fmaf((float)ii, inv_mxc, 0.0f); inj->R = fmaf(inj->R, adaptive_coeff_clone(prog), 0.0f); } H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); } } ++ii; } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 2, bm); if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) { alignas(16) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float)); const float t_best = map.pointToT(tmp_q); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best)); const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best)); if (t2 > t1) { alignas(16) float tq1[32]; alignas(16) float tq2[32]; float xx1; float yy1; float xx2; float yy2; map.map01ToPoint(t1, tq1); const float f1 = cost(tq1, xx1, yy1); map.map01ToPoint(t2, tq2); const float f2 = cost(tq2, xx2, yy2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f); if (dim > 2) I->R = fmaf(I->R, adaptive_coeff_clone(0.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } while (g_world->iprobe(boost::mpi::any_source, 3)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 3, bm); if (bm.bestF < fmaf(bestF, adaptive_coeff, 0.0f) || staganition) { alignas(16) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float)); const float t_best = map.pointToT(tmp_q); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); const float t1 = fmaxf(a, fmaf(-adaptive_diameter, 0.2f, t_best)); const float t2 = fminf(b, fmaf(adaptive_diameter, 0.2f, t_best)); if (t2 > t1) { alignas(16) float tq1[32]; alignas(16) float tq2[32]; float xx1; float yy1; float xx2; float yy2; map.map01ToPoint(t1, tq1); const float f1 = cost(tq1, xx1, yy1); map.map01ToPoint(t2, tq2); const float f2 = cost(tq2, xx2, yy2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, 2.03f - adaptive_coeff, 0.0f); if (dim > 2) I->R = fmaf(I->R, adaptive_coeff_clone(1.0f), 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } ++it; } }