"__declspec(align(16)) struct CrossMsg final { fl...

Criado em: 19 de dezembro de 2025

Respondido usando GPT-5.2 Thinking por Chat01

Pergunta

"__declspec(align(16)) struct CrossMsg final {
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;

text
template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }

};

__declspec(align(16)) struct MultiCrossMsg final {
float intervals[35];
unsigned count;

text
template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& intervals& count; }

};

__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX;
float bestY;
float bestQ[32];
unsigned dim;

text
template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim; }

}; static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k]) M_by_span[k] = I->M;
if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f);
const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f);
const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f);
float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f;
const float log_arg = dim_f + 5.0f;
const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f);
const float B_dim = fmaf(A_dim, 0.65f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
auto evalAt = [&](float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
const float step_arg = fmaf(-dim_f, 0.825f, 3.3f);
float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f);
if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) {
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f);
const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f));
int no_improve_steps = 0;
float f_at_check_start = f;
int steps_since_check = 0;
const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f);
}
else {
const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement;
eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f);
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_steps;
}
++steps_since_check;
if (steps_since_check == check_interval) {
if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break;
f_at_check_start = f;
steps_since_check = 0;
}
if (no_improve_steps > threshold_iters_grad >> 2) break;
++stepI;
}
}
if (f < bestF) {
if (0.95f < p || dim < 3) {
const int last = n - 1;
const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
float bestLocF = f;
float saved = q_local[last];
float delta = eta;
while (delta > fmaf(delta, 0.15f, 0.0f)) {
int sgn = -1;
while (sgn < 2) {
float cand = fmaf(static_cast<float>(sgn), delta, saved);
if (cand < lo) cand = lo;
else if (cand > hi) cand = hi;
const float backup = q_local[last];
q_local[last] = cand;
float x2;
float y2;
const float f2 = cost(q_local, x2, y2);
if (f2 < bestLocF) {
bestLocF = f2;
x = x2;
y = y2;
saved = cand;
}
q_local[last] = backup;
sgn += 2;
}
delta = fmaf(delta, 0.5f, 0.0f);
}
if (bestLocF < f) {
q_local[last] = saved;
f = bestLocF;
}
}
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f);
const int K = static_cast<int>(Kf);
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
if (t2 > t1) {
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f));
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (!(it % kickEveryDim) || staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f);
adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f);
const float exp_arg = fmaf(B_dim, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff - 1.0f;
const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f;
adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg);
}
const float arg_for_T = fmaf(p, 4.5f, 2.475f);
const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
std::pop_heap(H.begin(), H.end(), ComparePtrND);
IntervalND* cur = H.back();
H.pop_back();
const float x1 = cur->x1;
const float x2 = cur->x2;
const float y1 = cur->y1;
const float y2 = cur->y2;
float m = fmaf(r_eff, Mmax, 0.0f);
float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
const float bestFOld = bestF;
const float fNew = evalAt(tNew);
IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
L->i1 = t_to_idx(x1);
L->i2 = t_to_idx(tNew);
Rv->i1 = t_to_idx(tNew);
Rv->i2 = t_to_idx(x2);
L->diam = map.block_diameter(L->i1, L->i2);
Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
L->compute_span_level(map);
Rv->compute_span_level(map);
L->set_metric(L->diam);
Rv->set_metric(Rv->diam);
const float Mloc = fmaxf(L->M, Rv->M);
update_pockets_and_Mmax(L);
update_pockets_and_Mmax(Rv);
const float prevMmax = Mmax;
if (Mloc > Mmax) Mmax = Mloc;
m = fmaf(r_eff, Mmax, 0.0f);
if (adaptive) {
const float len1 = fmaf(tNew, 1.0f, -x1);
const float len2 = fmaf(x2, 1.0f, -tNew);
if (fmaf(len1, 1.0f, len2) == dmax) {
dmax = fmaxf(len1, len2);
for (auto pI : H) {
const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
if (Ls > dmax) dmax = Ls;
}
}
if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
const float alpha = p * p;
const float beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r_eff, -MULT);
const float GF = beta * global_coeff;
L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
const size_t sz = H.size();
RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
}
else {
if (Mloc > prevMmax) {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) {
const size_t sz = H.size();
RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
std::make_heap(H.begin(), H.end(), ComparePtrND);
}
}
else {
L->ChangeCharacteristic(m);
Rv->ChangeCharacteristic(m);
}
}
H.emplace_back(L);
std::push_heap(H.begin(), H.end(), ComparePtrND);
H.emplace_back(Rv);
std::push_heap(H.begin(), H.end(), ComparePtrND);
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* const top = H.front();
const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) {
out_iterations = static_cast<size_t>(it);
out_achieved_epsilon = interval_len;
return;
}
if (!(it % T) || staganition) {
MultiCrossMsg out;
const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f)));
out.count = intervals_to_send;
float* dest = out.intervals;
unsigned ii = 0;
while (ii < 3) {
IntervalND* Tt = H[ii];
dest[0] = Tt->x1;
dest[1] = 0.0f;
dest[2] = Tt->x2;
dest[3] = 0.0f;
dest[4] = Tt->R;
dest += 5;
++ii;
}
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 0, out);
active = false;
}
}
++ii2;
}
++exchange_counter_T;
}
const float improvement_threshold = 2.03f - adaptive_coeff;
if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) {
BestSolutionMsg out;
out.bestF = bestF;
out.bestX = bestX;
out.bestY = bestY;
out.dim = static_cast<unsigned>(bestQ.size());
memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
size_t ii2 = 0;
while (ii2 < iterations && active) {
const size_t step = 1ULL << ii2;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
if (am_sender) {
g_world->isend(partner, 2, out);
active = false;
}
}
++ii2;
}
++exchange_counter;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d = &mX.intervals[ii * 5];
float sx = d[0];
float ex = d[2];
if (ex > sx) {
alignas(16) float tmp[32];
float tx;
float ty;
map.map01ToPoint(sx, tmp);
const float y1i = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp);
const float y2i = cost(tmp, tx, ty);
IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
inj->i1 = t_to_idx(sx);
inj->i2 = t_to_idx(ex);
inj->diam = map.block_diameter(inj->i1, inj->i2);
inj->compute_span_level(map);
inj->set_metric(inj->diam);
update_pockets_and_Mmax(inj);
inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
_mm_prefetch((const char*)H[0], _MM_HINT_T0);
_mm_prefetch((const char*)H[1], _MM_HINT_T0);
IntervalND* topH = H.front();
const float topH_R = topH->R;
const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f);
if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) {
const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}
const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f);
inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f);
H.emplace_back(inj);
std::push_heap(H.begin(), H.end(), ComparePtrND);
}
}
++ii;
}
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg bm;
g_world->recv(boost::mpi::any_source, 2, bm);
if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {
alignas(16) float tmp_q[32];
memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
const float t_best = map.pointToT(tmp_q);
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
const float t1 = fmaf(-adaptive_diameter, 0.2f, t_best);
const float t2 = fmaf(adaptive_diameter, 0.2f, t_best);
alignas(16) float tq1[32];
alignas(16) float tq2[32];
float xx1;
float yy1;
float xx2;
float yy2;
map.map01ToPoint(t1, tq1);
const float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2);
const float f2 = cost(tq2, xx2, yy2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
I->R = fmaf(I->R, adaptive_coeff, 0.0f);
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (bm.bestF < bestF) {
bestF = bm.bestF;
bestX = bm.bestX;
bestY = bm.bestY;
bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
}
}
}
++it;
}
}" - внимательно рассмотри этот код, тут два принципиально отличающихся типа mpi обменов - первый для исследования поискового пространства, второй для уточнения, проблема сейчас в том что при приёме второго типа сообщений создаются интервалы очень малого диаметра что вводит алгоритм глобального поиска в заблуждение так как создаются очень малые интервалы с большой характеристикой - что нужно сделать: условие для отправки сообщений второго типа трогать не нужно, условие внутри цикла приёма сообщений второго типа для рассмотрения точки как кандидата: "if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) {" - также трогать не нужно, но теперь нужно чтобы при входе в это условие не создавался малый интервал вокруг лучшей точки а нужно теперь помимо только лишь лучшей точки в сообщениях второго типа передавать также лучший интервал если размерность меньше 3, и два лучших интервала если размерность больше 2, то есть все условие что в отправке что в приёме при рассмотрении точки как перспективного кандидата в сообщениях второго типа - остаются такими же - но теперь помимо точки передаются 1 или 2 лучших интервала, и что потом должно происходить с этими интервалалами: в текущем коде у вновь созданных интервалов при приёме сообщений второго типа характеристика меняется как I->R = fmaf(I->R, adaptive_coeff, 0.0f);, представим теперь что I это переданный интервал, тогда сначала его характеристика должна меняться аналогично тому как она сначала меняется в сообщениях первого типа inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); затем если передавался только 1 лучший интервал - то его характеристика должна будет измениться по закону I->R = fmaf(I->R, adaptive_coeff, 0.0f); - в случае же если передавалось два лучших интервала то дополнительно характеристика каждого из них должна будет умножиться на adaptive_coeff_clone - который будет вводится в сообщениях второго типа по полной аналогии с сообщениями первого типа с тем отличием что его не надо будет смягчать при размерности меньше 3 как это сейчас делается в сообщениях первого типа, то есть вот этого: " if (dim < 3) {
const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f;
adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg);
}" - делать не надо, зато надо также как и в сообщениях первого типа делать _mm_prefetch((const char*)H[0 или (0 и затем 1 - если 2 интервала пересылается)], _MM_HINT_T0); то есть итоговая формула изменений характеристик в сообщениях второго типа будет: "I->R = fmaf(I->R, fmaf(adaptive_coeff_clone, adaptive_coeff, 0.0f), 0.0f);". Заметь в случае с сообщениями первого типа adaptive_coeff_clone зависит от числа передаваемых интервалов через "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)" - тут же сделай проще, используй вместо этой формулы: "fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f)": 0 для первого интервала, 1 для второго интервала, в случае когда их двое, в случае одного интервала напоминаю домножать на adaptive_coeff_clone не нужно, не добавляй лишних проверок в код по типу проверок на непустоту, эти кучи гарантировано не пусти, не делай никаких проверок на вылет за пределы диапазона и прочее - проверки запрещены, модифицируй мой код в соответствии с указанными требованиями - причём заметь - изменения в код должны быть в моём стиле максимально оптимизированного кода: заботься о скорости работы кода и думай о максимальной производительности, так что инлайнь вычисления, загружай регистры и умножители процессора через вложенные сложные длинные формулы с fmaf, старайся делать все вычисления in_place - минимум обращений к памяти - всё в регистрах, инструкции размещай чтобы обеспечить оптимальную загрузку CPU, минимум кеш-промахов, локальность доступа к данным, если кешируешь переменнные - кешируй их как можно ближе к месту использования, возможно потребуется внести изменения в сериализируемые MPI структуры чтобы сделать это максимально быстро по скорости - мне пришли полную исправленную версию кода без пропусков и комментариев, в полном соответствии со всеми моими требованиями

Pensamento

Modifying MPI message structure for optimization

The user wants to modify an MPI code related to optimization. Specifically, MPI message type 2 should carry not only the best solution but also the best interval(s). Based on the dimension, either 1 or 2 intervals should be sent. I'll need to define the message structure, particularly for handling received intervals and recalculating data using a local map/cost.

Clarifying message structure for interval exchange

For the exchange type 0, they currently use MultiCrossMsg with floating-point intervals and a count. However, the current setup only sends three intervals, even if count can be greater than three — a potential bug I don't need to address right now. For the best solution exchange, the system only sends bestQ and bestF, but now I need to include the best interval(s) as well, possibly in a similar format to MultiCrossMsg. I'll explore using five floats for each interval.

Choosing best intervals for type2 exchange

For type0, the message only includes endpoints and R, while intervals y1 and y2 are computed during reception. For type2, I need to pass the best interval(s), likely from the max-heap H. I'll consider the top two from H (H[0] and H[1]), as they fit the requirement and help optimize the process.

Sending and receiving type2 intervals with correct characteristic updates

For type2, I'll send the BestSolutionMsg along with interval endpoints and its R value. When receiving the interval, its characteristic needs to be updated. If one interval is sent, we'll adjust R based on an adaptive coefficient; for two intervals, we'll apply an additional scaling based on another coefficient. I'll ensure the correct calculations, especially for M, are applied.

Recomputing interval characteristics and updating formula for type2

For type2, instead of sending y1 and y2, we’ll send endpoints x1 and x2 along with R. On the receiving side, we'll reconstruct the interval, recompute y1 and y2, and apply ChangeCharacteristic. For two intervals, I'll adjust the adaptive coefficient by setting a fraction, 0 for the first interval and 1 for the second. This will help simplify the calculation and avoid small intervals.

Updating message structure and handling R value in type2

For type2, we need to change the message structure to include intervals. We'll send x1, x2, and maybe R. However, we can avoid computing R if dim<3 since only one interval is sent. On receiving, we’ll reconstruct the interval, compute y1 and y2, call ChangeCharacteristic (which will set R), and apply scaling. Initial R from the sender isn't necessary.

Handling scaling and message structure for type2

For type2, we'll use the sender's R to scale. Although we don't need to transfer the full IntervalND, we'll only send x1, x2, and R, the minimal fields needed. On receiving, we can recreate the interval, recompute important values, and call ChangeCharacteristic. Afterward, scaling will apply, depending on whether one or two intervals are sent. This approach seems in line with the requirements.

Handling type2 message structure and prefetch

In type2, prefetching wasn’t done before using H, so I’ll add that. I'll push received intervals into H, without triggering recomputation loops. The condition for sending in type2 doesn’t need changing, but the message payload might change to include interval count -- deciding between 1 or 2 intervals based on the dimension (dim). The top interval is likely the one with the highest R in H.

Sending and receiving intervals efficiently

In sending, we can access H[0] and H[1] (if dim > 2) and send the bestF, bestX, bestY, bestQ, but they’re global bests, not related to best intervals. I’ll include intervals with best point information to guide the search. In receiving, we need to continue to update best values and use transmitted intervals instead of creating new ones. We’ll keep performance a priority and only send essential data, like x1, x2, and possibly y1, y2 while avoiding unnecessary checks.

Optimizing interval transmission with y1/y2 values

Sending y1 and y2 along with x1 and x2 would help avoid additional cost evaluations. This could improve performance without significantly increasing message size. Using the existing packing scheme, we can send a representation of intervals with both x1, y1, x2, and y2. Though the specifics aren't fully outlined, transmitting y1 and y2 fits the approach.

Designing interval message for type2 with optimization in mind

For type2, we should send the essential fields for each interval: x1, y1, x2, and y2, plus an optional R. This will minimize costly evaluations. The structure should include up to 2 intervals, best point details, and a count. For performance, we avoid extra checks and use fixed-size arrays to maintain structure alignment and serialization. We'll maintain the existing logic but adapt it for efficiency.

Deciding interval structure for efficient message handling

To optimize the message structure for type2, I’ll pack intervals with 4 floats (x1, y1, x2, y2) per interval. This supports efficient memory usage and alignment. We won't include R, as it's not necessary for computation, and aligning messages to 16 bytes should help with serialization. Prefetching H[0] and H[1] will ensure smooth integration of intervals with minimal computation.

Optimizing message structure and interval handling

For type2, I’ll prefetch H[0] before using intervals, even though it’s not critical for the logic. For sending intervals, I’ll store 4 floats per interval (x1, y1, x2, y2) to reduce size and maintain efficiency. The receiver will create intervals directly from this packed format, ensuring proper calculations and handling alignment. Using eight floats for both intervals ensures 16-byte alignment, which matches the style and improves performance.

Handling interval selection and adaptive scaling

For selecting the two best intervals, I'll rely on H[0] and H[1] for simplicity, adhering to the instructions despite the internal heap structure. We’ll avoid additional checks but use a simple pointer comparison to select the second best interval. In the receive block, I'll integrate the received intervals, update the necessary values, and apply adaptive scaling per the specified conditions. This includes calculating the adaptive coefficient for multiple intervals as required.

Implementing interval handling and message updates

We’ll implement updates by storing intervals from H[0] and H[1], following the conditions for dimensionality. We modify the "send" and "receive" processes to integrate calculated coefficients and scaling for each interval. Prefetching will be done on receiving, and we’ll ensure proper message structure updates without extra checks. The final adjustments align with the user's request for no omitted steps.

Updating interval handling and serialization

I’m updating the process to handle intervals efficiently, ensuring proper packing and serialization of the best interval. I’ll compute adaptive coefficients and apply scaling as needed. I need to avoid unnecessary checks and unroll loops for performance, ensuring minimal memory access. The modifications must align with the user’s request for high performance using fmaf and prefetching where necessary. Let’s prepare the final code with updates for serialization, including handling two intervals for higher dimensions.

Thought for 4m 1s

Resposta

cpp
__declspec(align(16)) struct CrossMsg final { float s_x1; float s_x2; float e_x1; float e_x2; float Rtop; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; } }; __declspec(align(16)) struct MultiCrossMsg final { float intervals[35]; unsigned count; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& intervals& count; } }; __declspec(align(16)) struct BestSolutionMsg final { float bestF; float bestX; float bestY; float bestQ[32]; float bestI[8]; unsigned dim; template <typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& bestI& dim; } }; static __declspec(noalias) void agp_run_branch_mpi( const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H, std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ, float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, float M_prior) noexcept { const int n = cost.n; const int dim = n + (cost.variableLen ? n : 0); const float dim_f = static_cast<float>(dim); unsigned exchange_counter = 0; unsigned exchange_counter_T = 0; alignas(16) float M_by_span[12]; int msi = 0; while (msi < 12) M_by_span[msi++] = M_prior; float Mmax = M_prior; alignas(16) float q_local[32]; alignas(16) float phi[32]; alignas(16) float s_arr[32]; alignas(16) float c_arr[32]; alignas(16) float sum_s[32]; alignas(16) float sum_c[32]; alignas(16) float q_try[32]; bestQ.reserve(static_cast<size_t>(dim)); float x = 0.0f; float y = 0.0f; int no_improve = 0; auto t_to_idx = [&](float t) -> unsigned long long { unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f)); return idx; }; auto update_pockets_and_Mmax = [&](IntervalND* I) { const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k]; }; const float a = 0.0f; const float b = 1.0f; float p = 0.0f; float dmax = fmaf(b, 1.0f, -a); const float initial_len = dmax; float exp_arg_threshold = fmaf(1.0f / fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, fmaf(dim_f, -0.2f, 0.25f), -0.33333333f), 0.5f), -1.0f), dim_f), 0.415888308336f, 0.0f); const float log_arg_threshold = fmaf(dim_f, 0.1f, -0.105f); const float adaptive_coeff_addition_threshold = fmaf(fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, fmaf(exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f / fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, fmaf(log_arg_threshold, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg_threshold), 0.17814618538f); float adaptive_coeff_threshold = adaptive_coeff_addition_threshold + 1.0f; const float log_arg = dim_f + 5.0f; const float A_dim = fmaf(1.0f / fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), 3.0f, 0.0f); const float B_dim = fmaf(A_dim, 0.65f, 0.0f); const float log_argument = A_dim - 2.03f; const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim; const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.55f), 1.0f); float adaptive_coeff = A_dim - adaptive_coeff_addition; const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f); int it = 0; auto evalAt = [&](float t) -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); const float step_arg = fmaf(-dim_f, 0.825f, 3.3f); float eta = fmaf(fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, fmaf(step_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f); if ((((f < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) && p > 0.55f) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && ((p > 0.7f && !(it % 3)) || p > 0.9f))) && dim > 2) || (f < fmaf(bestF, adaptive_coeff, 0.0f) && dim < 3)) { float acc = 0.0f; int ii = 0; while (ii < n) { acc = fmaf(q_local[ii], 1.0f, acc); phi[ii] = acc; ++ii; } FABE13_SINCOS(phi, s_arr, c_arr, n); float as = 0.0f; float ac = 0.0f; int k = n - 1; while (k >= 0) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as = fmaf(Lk, s_arr[k], as); ac = fmaf(Lk, c_arr[k], ac); sum_s[k] = as; sum_c[k] = ac; --k; } const float dx = fmaf(x, 1.0f, -cost.targetX); const float dy = fmaf(y, 1.0f, -cost.targetY); const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f; int stepI = 0; const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff_threshold, 0.0f), fmaf(bestF, adaptive_coeff_threshold, -f), 0.0f); const float curse_dim_arg = fmaf(dim_f, 0.825f, 0.0f); const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, adaptive_window, 1.0f), fmaf(fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, fmaf(curse_dim_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0685f, 0.0f), 0.0f)); int no_improve_steps = 0; float f_at_check_start = f; int steps_since_check = 0; const int check_interval = static_cast<int>(fmaf(1.0f / log2f(fmaf(dim_f, 0.5f, 1.0f)), static_cast<float>(threshold_iters_grad >> 2), 0.0f)); while (stepI < threshold_iters_grad) { int i = 0; #pragma loop ivdep while (i < n) { float gpen = 0.0f; { const float ai = fabsf(q_local[i]); const float v = fmaf(ai, 1.0f, -cost.minTheta); if (v > 0.0f) { const float scale_arg = fmaf(2.0f / (cost.minTheta + 1.0e-6f), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f); const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, 0.69314718055994530941723212145818f * (2.0f / (cost.minTheta + 1.0e-6f)), 0.0f), copysignf(1.0f, q_local[i])); gpen = fmaf(dpen_dtheta, 1.0f, gpen); } } { const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f); const float exp_arg = -tsg; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f); gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen); } const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen); q_try[i] = fmaf(-eta, g, q_local[i]); const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; if (q_try[i] < lo) q_try[i] = lo; else if (q_try[i] > hi) q_try[i] = hi; ++i; } if (cost.variableLen) { int j = 0; #pragma loop ivdep while (j < n) { const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f); float v = fmaf(-eta, g, q_local[n + j]); if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f; q_try[n + j] = v; ++j; } } float x2; float y2; const float f2 = cost(q_try, x2, y2); const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f); if (f2 < f) { memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float)); f = f2; x = x2; y = y2; eta = fmaf(eta, fmaf(10.0f / sqrtf(dim_f), 1.0f / (1.0f + rel_improvement) * rel_improvement, 1.0f), 0.0f); } else { const float caution_coeff = 5.0f / sqrtf(dim_f) * rel_improvement; eta = fmaf(eta, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, fmaf(caution_coeff, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.0f); memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float)); ++no_improve_steps; } ++steps_since_check; if (steps_since_check == check_interval) { if (fmaf(-1.0f / f_at_check_start, f, 1.0f) < 1e-3f) break; f_at_check_start = f; steps_since_check = 0; } if (no_improve_steps > threshold_iters_grad >> 2) break; ++stepI; } } if (f < bestF) { if (0.95f < p || dim < 3) { const int last = n - 1; const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f; const float hi = 2.6179938779914943653855361527329f; float bestLocF = f; float saved = q_local[last]; float delta = eta; while (delta > fmaf(delta, 0.15f, 0.0f)) { int sgn = -1; while (sgn < 2) { float cand = fmaf(static_cast<float>(sgn), delta, saved); if (cand < lo) cand = lo; else if (cand > hi) cand = hi; const float backup = q_local[last]; q_local[last] = cand; float x2; float y2; const float f2 = cost(q_local, x2, y2); if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; } q_local[last] = backup; sgn += 2; } delta = fmaf(delta, 0.5f, 0.0f); } if (bestLocF < f) { q_local[last] = saved; f = bestLocF; } } bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else ++no_improve; return f; }; const float f_a = evalAt(a); const float f_b = evalAt(b); const float Kf = fminf(fmaxf(2.0f * dim_f, 8.0f), 128.0f); const int K = static_cast<int>(Kf); H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u); const int rank = g_world->rank(); const int world = g_world->size(); alignas(16) float seeds[256 * 32]; const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed)))); int i = 0; while (i < seedCnt) { const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f)); const float t_seed = map.pointToT(s); const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f); const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed)); const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed)); if (t2 > t1) { alignas(16) float q1[32]; alignas(16) float q2[32]; float x1; float y1; float x2; float y2; map.map01ToPoint(t1, q1); const float f1 = cost(q1, x1, y1); map.map01ToPoint(t2, q2); const float f2 = cost(q2, x2, y2); IntervalND* I = new IntervalND(t1, t2, f1, f2); I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); if (i < 3) I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f); else { const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f); const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f); const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f); I->R = fmaf(I->R, mult, 0.0f); } H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; } if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; } } ++i; } float prev_t = a; float prev_f = f_a; int k = 1; while (k <= K) { const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1))); const float f = evalAt(t); IntervalND* I = new IntervalND(prev_t, t, prev_f, f); I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); prev_t = t; prev_f = f; ++k; } IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b); tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b); tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam); update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f)); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND); const int kickEveryDim = static_cast<int>(fmaf(22.5f, exp2f(-0.295f * dim_f), 0.0f)); const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f)); while (it < maxIter) { p = fmaf(-1.0f / initial_len, dmax, 1.0f); const float p_arg = fmaf(p, 2.3f, -2.9775f); const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f); const bool staganition = no_improve > noImproveThrDim; if (!(it % kickEveryDim) || staganition) { const float t_best = map.pointToT(bestQ.data()); const float log_arg = fmaf(dim_f, 1.0f, -1.0f); const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f); int ii = 0; while (ii < 2) { const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f); const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best))); const float f_seed = evalAt(t_seed); IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed); J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed)); J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed)); J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam); update_pockets_and_Mmax(J); J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); J->R = fmaf(J->R, 2.03f - adaptive_coeff, 0.0f); H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND); ++ii; } no_improve = 0; } const float tmp_exp_arg_threshold = fmaf(-exp_arg_threshold * p, 10.0f, 0.0f); adaptive_coeff_threshold = fmaf(fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, fmaf(tmp_exp_arg_threshold, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_threshold, 1.0f); const float exp_arg = fmaf(B_dim, p, 0.0f); adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff - 1.0f; const float adaptive_coeff_threshold_arg = adaptive_coeff_threshold - 1.0f; adaptive_coeff = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); adaptive_coeff_threshold = fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, fmaf(adaptive_coeff_threshold_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_threshold_arg); } const float arg_for_T = fmaf(p, 4.5f, 2.475f); const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f); const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float A = fmaf(39.995f, exp2_exp_arg, 0.0f); const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.875f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f)); std::pop_heap(H.begin(), H.end(), ComparePtrND); IntervalND* cur = H.back(); H.pop_back(); const float x1 = cur->x1; const float x2 = cur->x2; const float y1 = cur->y1; const float y2 = cur->y2; float m = fmaf(r_eff, Mmax, 0.0f); float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff); const float bestFOld = bestF; const float fNew = evalAt(tNew); IntervalND* L = new IntervalND(x1, tNew, y1, fNew); IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2); L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew); Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2); L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2); L->compute_span_level(map); Rv->compute_span_level(map); L->set_metric(L->diam); Rv->set_metric(Rv->diam); const float Mloc = fmaxf(L->M, Rv->M); update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv); const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc; m = fmaf(r_eff, Mmax, 0.0f); if (adaptive) { const float len1 = fmaf(tNew, 1.0f, -x1); const float len2 = fmaf(x2, 1.0f, -tNew); if (fmaf(len1, 1.0f, len2) == dmax) { dmax = fmaxf(len1, len2); for (auto pI : H) { const float Ls = fmaf(pI->x2, 1.0f, -pI->x1); if (Ls > dmax) dmax = Ls; } } if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) { const float alpha = p * p; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r_eff, -MULT); const float GF = beta * global_coeff; L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f))); Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f))); const size_t sz = H.size(); RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtrND); } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } } else { if (Mloc > prevMmax) { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); if (Mloc > fmaf(adaptive_coeff, prevMmax, -0.03f)) { const size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); } } else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); } } H.emplace_back(L); std::push_heap(H.begin(), H.end(), ComparePtrND); H.emplace_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* const top = H.front(); const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam; if ((dim > 1 ? exp2f((1.0f / dim_f) * log2f(interval_len)) : interval_len) < eps || it == maxIter) { out_iterations = static_cast<size_t>(it); out_achieved_epsilon = interval_len; return; } if (!(it % T) || staganition) { MultiCrossMsg out; const unsigned intervals_to_send = static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))); out.count = intervals_to_send; float* dest = out.intervals; unsigned ii = 0; while (ii < 3) { IntervalND* Tt = H[ii]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; ++ii; } const size_t iterations = std::bit_width(static_cast<size_t>(world - 1)); bool active = true; const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_world->isend(partner, 0, out); active = false; } } ++ii2; } ++exchange_counter_T; } const float improvement_threshold = 2.03f - adaptive_coeff; if (bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 7), 0.0f) && (p > 0.7f && !(it % 3)) || bestF < fmaf(bestFOld, fast_pow_int(improvement_threshold, 5), 0.0f) && p > 0.9f) { BestSolutionMsg out; out.bestF = bestF; out.bestX = bestX; out.bestY = bestY; out.dim = static_cast<unsigned>(bestQ.size()); memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float)); { IntervalND* const I0 = H[0]; out.bestI[0] = I0->x1; out.bestI[1] = I0->x2; out.bestI[2] = I0->y1; out.bestI[3] = I0->y2; if (dim > 2) { IntervalND* const I1 = H[1]; out.bestI[4] = I1->x1; out.bestI[5] = I1->x2; out.bestI[6] = I1->y1; out.bestI[7] = I1->y2; } } const size_t iterations = std::bit_width(static_cast<size_t>(world - 1)); bool active = true; const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1); size_t ii2 = 0; while (ii2 < iterations && active) { const size_t step = 1ULL << ii2; const int partner = rank ^ static_cast<int>(step); if (partner < world) { const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T); if (am_sender) { g_world->isend(partner, 2, out); active = false; } } ++ii2; } ++exchange_counter; } while (g_world->iprobe(boost::mpi::any_source, 0)) { MultiCrossMsg in; g_world->recv(boost::mpi::any_source, 0, in); const MultiCrossMsg& mX = in; unsigned ii = 0; while (ii < mX.count) { const float* d = &mX.intervals[ii * 5]; float sx = d[0]; float ex = d[2]; if (ex > sx) { alignas(16) float tmp[32]; float tx; float ty; map.map01ToPoint(sx, tmp); const float y1i = cost(tmp, tx, ty); map.map01ToPoint(ex, tmp); const float y2i = cost(tmp, tx, ty); IntervalND* inj = new IntervalND(sx, ex, y1i, y2i); inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex); inj->diam = map.block_diameter(inj->i1, inj->i2); inj->compute_span_level(map); inj->set_metric(inj->diam); update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); IntervalND* topH = H.front(); const float topH_R = topH->R; const float arg_for_adaptive = fmaf(2.03f - adaptive_coeff, topH_R, 0.0f); if (inj->R > fmaf(fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, fmaf(arg_for_adaptive, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 0.5819767068693264243850020f, -1.0f) || staganition) { const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f; const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f); const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f); float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone); if (dim < 3) { const float adaptive_coeff_arg = adaptive_coeff_clone - 1.0f; adaptive_coeff_clone = fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, fmaf(adaptive_coeff_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), adaptive_coeff_arg); } const float arg_for_inj = fmaf(1.0f / fmaf(adaptive_coeff, topH_R, 0.0f), inj->R - topH_R, 1.0f); inj->R = fmaf(d[4], fmaf(fmaf(fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, fmaf(arg_for_inj, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), kf, 0.0f), adaptive_coeff_clone, 0.0f), 0.0f); H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND); } } ++ii; } } while (g_world->iprobe(boost::mpi::any_source, 2)) { BestSolutionMsg bm; g_world->recv(boost::mpi::any_source, 2, bm); if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff, 0.0f) || staganition) { if (dim > 2) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); _mm_prefetch((const char*)H[1], _MM_HINT_T0); const float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone); const float exp_arg = B_dim; const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); const float adaptive_coeff_clone1 = fmaf(-exp_val, adaptive_coeff_addition, A_dim_clone); const float coeff0 = fmaf(adaptive_coeff_clone0, adaptive_coeff, 0.0f); const float coeff1 = fmaf(adaptive_coeff_clone1, adaptive_coeff, 0.0f); { const float sx = bm.bestI[0]; const float ex = bm.bestI[1]; const float y1i = bm.bestI[2]; const float y2i = bm.bestI[3]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, coeff0, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } { const float sx = bm.bestI[4]; const float ex = bm.bestI[5]; const float y1i = bm.bestI[6]; const float y2i = bm.bestI[7]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, coeff1, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } } else { _mm_prefetch((const char*)H[0], _MM_HINT_T0); const float sx = bm.bestI[0]; const float ex = bm.bestI[1]; const float y1i = bm.bestI[2]; const float y2i = bm.bestI[3]; IntervalND* I = new IntervalND(sx, ex, y1i, y2i); I->i1 = t_to_idx(sx); I->i2 = t_to_idx(ex); I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam); update_pockets_and_Mmax(I); I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f)); I->R = fmaf(I->R, adaptive_coeff, 0.0f); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND); } if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); } } } ++it; } }

Compartilhar esta Q&A