"#include "pch.h" #define XOR_RAND(state, result_...

Question

"#include "pch.h"

#define XOR_RAND(state, result_var)
do {
unsigned s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
float tmp = static_cast<float>(static_cast<double>(s) *
(1.0 / 4294967296.0));
result_var = tmp;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
unsigned s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var =
fmaf(static_cast<float>(static_cast<int>(s)), 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530717958647692f);
if (r > 3.14159265359f)
r = 6.28318530717958647692f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var =
fmaf(t4,
fmaf(t2, -0.0013888889f, 0.0416666667f),
fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var =
-fmaf(t4,
fmaf(t2, -0.0013888889f, 0.041666667f),
fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = x;
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530717958647692f);
bool sfl = r > 3.14159265359f;
if (sfl)
r = 6.28318530717958647692f - r;
bool cfl = r > 1.57079632679f;
if (cfl)
r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2,
fmaf(t2,
fmaf(t2, -0.0001984127f, 0.0083333333f),
-0.16666666f),
1.0f) *
r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = n & ~7;
if (n >= 8) {
static __declspec(align(16)) const __m256 VEC_TWOPI =
_mm256_set1_ps(6.28318530717958647692f);
static __declspec(align(16)) const __m256 VEC_PI =
_mm256_set1_ps(3.14159265359f);
static __declspec(align(16)) const __m256 VEC_PI_2 =
_mm256_set1_ps(1.57079632679f);
static __declspec(align(16)) const __m256 INV_TWOPI =
_mm256_set1_ps(0.15915494309189535f);
static __declspec(align(16)) const __m256 BIAS =
_mm256_set1_ps(12582912.0f);
static __declspec(align(16)) const __m256 VEC_COS_P5 =
_mm256_set1_ps(-0.0013888889f);
static __declspec(align(16)) const __m256 VEC_COS_P3 =
_mm256_set1_ps(0.0416666667f);
static __declspec(align(16)) const __m256 VEC_COS_P1 =
_mm256_set1_ps(-0.5f);
static __declspec(align(16)) const __m256 VEC_COS_P0 =
_mm256_set1_ps(1.0f);
static __declspec(align(16)) const __m256 VEC_SIN_P5 =
_mm256_set1_ps(-0.0001984127f);
static __declspec(align(16)) const __m256 VEC_SIN_P3 =
_mm256_set1_ps(0.0083333333f);
static __declspec(align(16)) const __m256 VEC_SIN_P1 =
_mm256_set1_ps(-0.16666666f);
static __declspec(align(16)) const __m256 VEC_SIN_P0 =
_mm256_set1_ps(1.0f);
static __declspec(align(16)) const __m256 VEC_ZERO =
_mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f),
vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 =
_mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 =
_mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv =
_mm256_fmadd_ps(t2,
_mm256_fmadd_ps(t2,
_mm256_fmadd_ps(t2,
VEC_COS_P5,
VEC_COS_P3),
VEC_COS_P1),
VEC_COS_P0);
const __m256 sinv =
_mm256_mul_ps(_mm256_fmadd_ps(t2,
_mm256_fmadd_ps(t2,
_mm256_fmadd_ps(t2,
VEC_SIN_P5,
VEC_SIN_P3),
VEC_SIN_P1),
VEC_SIN_P0),
r2);
const __m256 cflip =
_mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip =
_mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ),
_mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i],
_mm256_blendv_ps(cosv,
_mm256_sub_ps(VEC_ZERO,
cosv),
cflip));
_mm256_store_ps(&(sin_out)[i],
_mm256_blendv_ps(sinv,
_mm256_sub_ps(VEC_ZERO,
sinv),
sflip));
i += 8;
}
}
while (i < n) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip)
r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip)
r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2,
fmaf(t2,
fmaf(t2, -0.0013888889f, 0.0416666667f),
-0.5f),
1.0f);
const float s = fmaf(t2,
fmaf(t2,
fmaf(t2, -0.0001984127f, 0.0083333333f),
-0.16666666f),
1.0f) *
r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : unsigned {
Top = 0b00u,
Down = 0b01u,
Left = 0b10u,
Right = 0b11u
};

__declspec(align(16)) struct Step final {
const unsigned next;
const unsigned dx;
const unsigned dy;

text
__forceinline Step(unsigned n, unsigned x, unsigned y) noexcept
	: next(n), dx(x), dy(y) {
}

};

__declspec(align(16)) struct InvStep final {
const unsigned q;
const unsigned next;

text
__forceinline InvStep(unsigned q_val, unsigned n) noexcept
	: q(q_val), next(n) {
}

};

__declspec(align(16)) static const Step g_step_tbl[4][4] = {
{ Step(Right, 0u, 0u), Step(Top, 0u, 1u), Step(Top, 1u, 1u), Step(Left, 1u, 0u) },
{ Step(Left, 1u, 1u), Step(Down, 1u, 0u), Step(Down, 0u, 0u), Step(Right, 0u, 1u) },
{ Step(Down, 1u, 1u), Step(Left, 0u, 1u), Step(Left, 0u, 0u), Step(Top, 1u, 0u) },
{ Step(Top, 0u, 0u), Step(Right, 1u, 0u), Step(Right, 1u, 1u), Step(Down, 0u, 1u) }
};

__declspec(align(16)) static const InvStep g_inv_tbl[4][4] = {
{ InvStep(0u, Right), InvStep(1u, Top), InvStep(3u, Left), InvStep(2u, Top) },
{ InvStep(2u, Down), InvStep(3u, Right), InvStep(1u, Down), InvStep(0u, Left) },
{ InvStep(2u, Left), InvStep(1u, Left), InvStep(3u, Top), InvStep(0u, Down) },
{ InvStep(0u, Top), InvStep(3u, Down), InvStep(1u, Right), InvStep(2u, Right) }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;
static const pybind11::scoped_interpreter* g_pyInterpreter;
static const pybind11::module_* g_pyOptimizerBridge;
static std::string g_exeDirCache;

__declspec(align(16)) struct CrossMsg final {
float s_x1;
float s_x2;
float e_x1;
float e_x2;
float Rtop;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& s_x1& s_x2& e_x1& e_x2& Rtop;
}

};

__declspec(align(16)) struct MultiCrossMsg final {
float intervals[35];
unsigned count;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& intervals& count;
}

};

__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX;
float bestY;
float bestQ[32];
float bestI[8];
unsigned dim;

text
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, unsigned int) noexcept {
	ar& bestF& bestX& bestY& bestQ& bestI& dim;
}

};

__declspec(align(16)) struct Slab final sealed{
char* const base;
char* current;
char* const end;

text
	__forceinline Slab(void* memory, size_t usable) noexcept
			: base(static_cast<char*>(memory)), current(base),
				end(base + (usable & ~static_cast<size_t>(63u))) {

}
};

static thread_local tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 16u);
Slab* slab = static_cast<Slab*>(_aligned_malloc(sizeof(Slab), 16u));
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
#pragma loop ivdep
while (p < slab->end) {
*p = 0;
p += 4096u;
}
return slab;
});

__declspec(align(16)) struct Peano2DMap final sealed{
const int levels;
const float a;
const float b;
const float c;
const float d;
const float lenx;
const float leny;
const float inv_lenx;
const unsigned scale;
const unsigned start;

text
	__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, unsigned st) noexcept
			: levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c),
				inv_lenx(1.0f / (_b - _a)),
				scale(static_cast<unsigned>(1u) << (L << 1)), start(st) {

}
};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__declspec(align(16)) struct Interval1D final sealed{
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
	static __declspec(noalias) __forceinline void* operator new(size_t) noexcept {
			Slab* s = tls.local();
			char* r = s->current;
			s->current += 64u;
			return r;
	}

	__declspec(noalias) __forceinline Interval1D(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
			: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1),
				ordinate_factor(-(y1 + y2) * 2.0f),
				N_factor(_N == 1.0f ? _x2 - _x1 : sqrtf(_x2 - _x1)),
				quadratic_term(fmaf((1.0f / N_factor)* delta_y, delta_y, 0.0f)),
				M(fabsf(delta_y) / N_factor) {

}

__declspec(noalias) __forceinline void ChangeCharacteristic(float _m) noexcept {
R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
}
};

static __declspec(noalias) __forceinline bool ComparePtr1D(const Interval1D* a, const Interval1D* b) noexcept {
return a->R < b->R;
}

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2_1D(Interval1D* const* arr, size_t n, float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0;
size_t limit = n & ~static_cast<size_t>(7ull);
alignas(16) float q[8];
alignas(16) float nf[8];
alignas(16) float od[8];
alignas(16) float out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const Interval1D* p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2_1D(Interval1D* const* arr, size_t n, float GF, float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);
size_t i = 0;
size_t limit = n & ~static_cast<size_t>(7ull);
alignas(16) float ln[8];
alignas(16) float Mv[8];
alignas(16) float q[8];
alignas(16) float nf[8];
alignas(16) float od[8];
alignas(16) float out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const Interval1D* p = arr[i + k];
ln[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vln = _mm256_load_ps(ln);
const __m256 vM = _mm256_load_ps(Mv);
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
const Interval1D* p = arr[i];
const float mi = fmaf(GF, fmaf(-p->x1, 1.0f, p->x2), fmaf(p->M, alpha, 0.0f));
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

__declspec(align(16)) struct IntervalND final sealed{
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
float N_factor;
float quadratic_term;
float M;
float R;
unsigned long long i1;
unsigned long long i2;
float diam;
int span_level;

text
	static __declspec(noalias) __forceinline void* operator new(size_t) noexcept {
			Slab* s = tls.local();
			char* r = s->current;
			s->current += 64u;
			return r;
	}

	__declspec(noalias) __forceinline IntervalND(float _x1, float _x2, float _y1, float _y2) noexcept
			: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(fmaf(_y2, 1.0f, -_y1)),
				ordinate_factor(fmaf(fmaf(-y1, 1.0f, -y2), 2.0f, 0.0f)),
				N_factor(0), quadratic_term(0), M(0), R(0),
				i1(0), i2(0), diam(0), span_level(0) {

}

__declspec(noalias) __forceinline void compute_span_level(const struct MortonND& map) noexcept;
__declspec(noalias) __forceinline void set_metric(float d_alpha) noexcept {
N_factor = d_alpha;
quadratic_term = fmaf(1.0f / N_factor, fmaf(delta_y, delta_y, 0.0f), 0.0f);
M = fmaf(1.0f / N_factor, fabsf(delta_y), 0.0f);
}
__declspec(noalias) __forceinline void ChangeCharacteristic(float _m) noexcept {
R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
}
};

static __declspec(noalias) __forceinline bool ComparePtrND(const IntervalND* a, const IntervalND* b) noexcept {
return a->R < b->R;
}

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2_ND(IntervalND* const* arr, size_t n, float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0;
size_t limit = n & ~static_cast<size_t>(7ull);
alignas(16) float q[8];
alignas(16) float nf[8];
alignas(16) float od[8];
alignas(16) float out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const IntervalND* p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2_ND(IntervalND* const* arr, size_t n, float GF, float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);
size_t i = 0;
size_t limit = n & ~static_cast<size_t>(7ull);
alignas(16) float ln[8];
alignas(16) float Mv[8];
alignas(16) float q[8];
alignas(16) float nf[8];
alignas(16) float od[8];
alignas(16) float out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const IntervalND* p = arr[i + k];
ln[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vln = _mm256_load_ps(ln);
const __m256 vM = _mm256_load_ps(Mv);
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
const IntervalND* p = arr[i];
const float mi = fmaf(GF, fmaf(-p->x1, 1.0f, p->x2), fmaf(p->M, alpha, 0.0f));
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

static __declspec(noalias) __forceinline float fast_pow_int(const float v, const int n) noexcept {
float r;
switch (n) {
case 3: {
const float v2 = fmaf(v, v, 0.0f);
r = fmaf(v2, v, 0.0f);
} break;
case 4: {
const float v2 = fmaf(v, v, 0.0f);
r = fmaf(v2, v2, 0.0f);
} break;
case 5: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
r = fmaf(fmaf(v4, v2, 0.0f), v, 0.0f);
} break;
case 6: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
r = fmaf(v4, v2, 0.0f);
} break;
case 7: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
r = fmaf(fmaf(v4, v2, 0.0f), v, 0.0f);
} break;
case 8: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
r = fmaf(v4, v4, 0.0f);
} break;
case 9: {
const float v3 = fmaf(fmaf(v, v, 0.0f), v, 0.0f);
const float v6 = fmaf(v3, v3, 0.0f);
r = fmaf(v6, v3, 0.0f);
} break;
case 10: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
const float v8 = fmaf(v4, v4, 0.0f);
r = fmaf(v8, v2, 0.0f);
} break;
case 11: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
const float v8 = fmaf(v4, v4, 0.0f);
r = fmaf(fmaf(v8, v2, 0.0f), v, 0.0f);
} break;
case 12: {
const float v3 = fmaf(fmaf(v, v, 0.0f), v, 0.0f);
const float v6 = fmaf(v3, v3, 0.0f);
r = fmaf(v6, v6, 0.0f);
} break;
case 13: {
const float v3 = fmaf(fmaf(v, v, 0.0f), v, 0.0f);
const float v6 = fmaf(v3, v3, 0.0f);
r = fmaf(fmaf(v6, v6, 0.0f), v, 0.0f);
} break;
case 14: {
const float v7 = fmaf(fmaf(fmaf(fmaf(fmaf(fmaf(v, v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f);
r = fmaf(v7, v7, 0.0f);
} break;
case 15: {
const float v7 = fmaf(fmaf(fmaf(fmaf(fmaf(fmaf(v, v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f), v, 0.0f);
r = fmaf(fmaf(v7, v7, 0.0f), v, 0.0f);
} break;
default: {
const float v2 = fmaf(v, v, 0.0f);
const float v4 = fmaf(v2, v2, 0.0f);
const float v8 = fmaf(v4, v4, 0.0f);
r = fmaf(v8, v8, 0.0f);
}
}
return r;
}

static __declspec(noalias) __forceinline float step(const float _m, const float x1, const float x2, const float y1, const float y2, const float _N, const float _r) noexcept {
const float diff = fmaf(y2, 1.0f, -y1);
const unsigned sign_mask = ((reinterpret_cast<const unsigned>(&diff)) & 0x80000000u) ^ 0x80000000u;
const float sign_mult = reinterpret_cast<const float>(&sign_mask);
if (_N == 1.0f)
return fmaf(fmaf(-(1.0f / _m), diff, x1 + x2), 0.5f, 0.0f);
if (_N == 2.0f)
return fmaf(fmaf(fmaf(fmaf(1.0f / fmaf(_m, _m, 0.0f), sign_mult, 0.0f), fmaf(fmaf(diff, diff, 0.0f), _r, 0.0f), 0.0f), 1.0f, fmaf(x1, 1.0f, x2)), 0.5f, 0.0f);
return fmaf(fmaf(fmaf(fmaf(1.0f / fast_pow_int(_m, static_cast<int>(_N)), sign_mult, 0.0f), fast_pow_int(fabsf(diff), static_cast<int>(_N)), 0.0f), _r, 0.0f), 1.0f, fmaf(x1, 1.0f, x2)) * 0.5f;
}

__declspec(align(16)) struct MortonCachePerRank final sealed{
std::vector<int> permCache;
std::vector<unsigned long long> invMaskCache;
unsigned baseSeed;
};

static thread_local MortonCachePerRank g_mc;

static __declspec(noalias) __forceinline unsigned long long gray_encode(unsigned long long x) noexcept {
return x ^ (x >> 1);
}

static __declspec(noalias) __forceinline long long gray_decode(unsigned long long g) noexcept {
g ^= g >> 32;
g ^= g >> 16;
g ^= g >> 8;
g ^= g >> 4;
g ^= g >> 2;
g ^= g >> 1;
return static_cast<long long>(g);
}

__declspec(align(16)) struct MortonND final sealed{
const int dim;
const int levels;
const int eff_levels;
const int extra_levels;
const int chunks;
std::vector<int> chunk_bits;
std::vector<unsigned long long> chunk_bases;
unsigned long long scale;
std::vector<float> low;
std::vector<float> high;
std::vector<float> step;
std::vector<float> invStep;
std::vector<float> baseOff;
std::vector<int> perm;
std::vector<unsigned long long> invMask;
std::vector<unsigned long long> pextMask;
std::vector<unsigned long long> pextMaskChunks;
const float invScaleLevel;
const bool use_gray;

text
	static __declspec(noalias) __forceinline unsigned long long make_mask(int dim, int Lc, int d) noexcept {
			unsigned long long m = 0ull;
			unsigned long long bitpos = static_cast<unsigned long long>(d);
			int b = 0;

#pragma loop ivdep
while (b < Lc) {
m |= 1ull << bitpos;
bitpos += static_cast<unsigned long long>(dim);
++b;
}
return m;
}

text
	__declspec(noalias) __forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
			: dim(D), levels(L),
				eff_levels((std::max)(1, static_cast<int>(63 / (D ? D : 1)))),
				extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
				chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
				low(lows, lows + D), high(highs, highs + D), step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f),
				perm(mc.permCache.begin(), mc.permCache.begin() + D),
				invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
				invScaleLevel(1.0f / static_cast<float>(static_cast<unsigned long long>(1) << L)), use_gray(true) {
			int d = 0;

#pragma loop ivdep
while (d < dim) {
const float rng = high[d] - low[d];
const float st = rng * invScaleLevel;
step[d] = st;
invStep[d] = 1.0f / st;
baseOff[d] = fmaf(0.5f, st, low[d]);
++d;
}
chunk_bits.resize(chunks);
pextMaskChunks.resize(static_cast<size_t>(chunks) * static_cast<size_t>(dim));
chunk_bases.resize(chunks);
int remaining = levels;
int c = 0;
while (c < chunks) {
const int Lc = (c == 0) ? (std::min)(eff_levels, remaining) : (std::min)(eff_levels, remaining);
chunk_bits[c] = Lc;
remaining -= Lc;
const unsigned long long baseC = static_cast<unsigned long long>(1) << (dim * Lc);
chunk_bases[c] = baseC;
d = 0;
#pragma loop ivdep
while (d < dim) {
pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)] = make_mask(dim, Lc, d);
++d;
}
++c;
}
pextMask.resize(dim);
d = 0;
#pragma loop ivdep
while (d < dim) {
pextMask[d] = make_mask(dim, chunk_bits[0], d);
++d;
}
scale = static_cast<unsigned long long>(1) << (dim * chunk_bits[0]);
}

text
	__declspec(noalias) __forceinline float block_diameter(unsigned long long i1, unsigned long long i2) const noexcept {
			if (i1 > i2)
					std::swap(i1, i2);
			float s2 = 0.0f;
			int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const unsigned long long varying = (i1 ^ i2) & pextMask[d];
const int nfree_hi = _mm_popcnt_u64(varying);
const int nfree_total = nfree_hi + levels - chunk_bits[0];
const float range = fmaf(step[pd], fmaf(ldexpf(1.0f, nfree_total), 1.0f, -1.0f), 0.0f);
s2 = fmaf(range, range, s2);
++d;
}
return sqrtf(s2);
}

text
	__declspec(noalias) __forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
			unsigned long long accBits[32] = {0ull};
			int accShifted[32] = {0};
			int c = 0;
			while (c < chunks) {
					const int Lc = chunk_bits[c];
					const unsigned long long baseC = chunk_bases[c];
					const float scaled = t * static_cast<float>(baseC);
					unsigned long long idxc = (scaled >= static_cast<float>(baseC)) ? (baseC - 1ull) : static_cast<unsigned long long>(scaled);
					if (use_gray)
							idxc = gray_encode(idxc);
					int shift_from_top = 0;
					int k = 0;
					while (k <= c) {
							shift_from_top += chunk_bits[k];
							++k;
					}
					const int inv_shift = levels - shift_from_top;
					int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const unsigned long long mask = pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)];
unsigned long long bits = _pext_u64(idxc, mask);
if (inv_shift >= 0) {
unsigned long long invMaskSegment = 0ull;
if (chunk_bits[c] < 63) {
const unsigned long long take = (static_cast<unsigned long long>(1) << chunk_bits[c]) - 1ull;
invMaskSegment = (invMask[pd] >> inv_shift) & take;
}
bits ^= invMaskSegment;
}
accBits[pd] = (accBits[pd] << Lc) | bits;
accShifted[pd] += Lc;
++d;
}
++c;
}
int d = 0;
#pragma loop ivdep
while (d < dim) {
out[d] = fmaf(step[d], static_cast<float>(accBits[d]), baseOff[d]);
++d;
}
}

text
	__declspec(noalias) __forceinline float pointToT(const float* __restrict q) const noexcept {
			const int bitsFull = levels;
			const int bitsCoarse = chunk_bits[0];
			unsigned long long idx0 = 0ull;
			int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const float v = (q[pd] - baseOff[pd]) * invStep[pd];
const long long cell = static_cast<long long>(_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
const long long maxv = (static_cast<long long>(1) << bitsFull) - 1;
(void)maxv;
unsigned long long b = static_cast<unsigned long long>(cell) >> (bitsFull - bitsCoarse);
unsigned long long invMask0 = 0ull;
if (bitsCoarse < 63) {
const unsigned long long take = (static_cast<unsigned long long>(1) << bitsCoarse) - 1ull;
invMask0 = (invMask[pd] >> (bitsFull - bitsCoarse)) & take;
}
b ^= invMask0;
idx0 |= _pdep_u64(b, pextMask[d]);
++d;
}
if (use_gray)
idx0 = static_cast<unsigned long long>(gray_decode(idx0));
return fmaf(1.0f / static_cast<float>(scale), fmaf(static_cast<float>(idx0), 1.0f, 0.5f), 0.0f);
}
};

__declspec(noalias) __forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
span_level = 0;
int d = 0;
#pragma loop ivdep
while (d < map.dim) {
const unsigned long long varying = (i1 ^ i2) & map.pextMask[d];
span_level += _mm_popcnt_u64(varying);
++d;
}
span_level += (map.levels - map.chunk_bits[0]) * map.dim;
span_level = (std::min)(span_level, 11);
}

__declspec(align(16)) struct ManipCost final sealed{
const int n;
const bool variableLen;
const float targetX;
const float targetY;
const float minTheta;
const float archBiasW;
const float archBiasK;
const float sharpW;

text
	__declspec(noalias) __forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta) noexcept
			: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta),
				archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f) {

}

__declspec(noalias) __forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
const float* __restrict th = q;
const float* __restrict L = variableLen ? (q + n) : nullptr;
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
float x = 0.0f;
float y = 0.0f;
float phi_acc = 0.0f;
float penC = 0.0f;
float archPen = 0.0f;
int i = 0;
#pragma loop ivdep
while (i < n) {
phi_acc += th[i];
phi[i] = phi_acc;
++i;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
const float sharpScale = 2.0f / (minTheta);
const float Lc = 1.0f;
if (variableLen) {
i = 0;
while (i < n) {
const float Li = L[i];
x = fmaf(Li, c_arr[i], x);
y = fmaf(Li, s_arr[i], y);
++i;
}
}
else {
i = 0;
while (i < n) {
x = fmaf(Lc, c_arr[i], x);
y = fmaf(Lc, s_arr[i], y);
++i;
}
}
i = 0;
#pragma loop ivdep
while (i < n) {
const float theta = th[i];
const float ai = fabsf(theta);
const float v = fmaf(ai, 1.0f, -minTheta);
if (v > 0.0f) {
const float scale = fmaf(sharpScale, v, 0.0f);
const float arg = fmaf(scale, 0.69314718055994530941723212145818f, 0.0f);
const float exp2_val = fmaf(arg, fmaf(arg, fmaf(arg, fmaf(arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
penC = fmaf(sharpW, fmaf(exp2_val, 1.0f, -1.0f), penC);
}
const float t = fmaf(-theta, archBiasK, 0.0f);
float sp;
if (t > 10.0f)
sp = t;
else {
const float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
sp = log1pf(exp_val);
}
archPen = fmaf(archBiasW, sp, archPen);
++i;
}
const float dx = fmaf(x, 1.0f, -targetX);
const float dy = fmaf(y, 1.0f, -targetY);
const float dist = sqrtf(fmaf(dx, dx, dy * dy));
out_x = x;
out_y = y;
return fmaf(dist, 1.0f, fmaf(penC, 1.0f, archPen));
}
};

static __declspec(noalias) __forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) noexcept {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const unsigned scale = gActiveMap.scale;
const unsigned scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const unsigned start = gActiveMap.start;
const int levels = gActiveMap.levels;
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
unsigned idx = static_cast<unsigned>(norm * static_cast<float>(scale));
idx = idx > scale_minus_1 ? scale_minus_1 : idx;
float sx = lenx;
float sy = leny;
float x1 = a;
float x2 = c;
unsigned type = start;
int l = levels - 1;
while (l >= 0) {
const unsigned q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f;
sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

static __declspec(noalias) __forceinline float FindX2D_analytic(float px, float py) noexcept {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const unsigned scale = gActiveMap.scale;
const unsigned start = gActiveMap.start;
const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b);
const float clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx;
float sy = leny;
float x0 = a;
float y0 = c;
unsigned idx = 0u;
unsigned type = start;
int l = 0;
while (l < levels) {
sx *= 0.5f;
sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;
const unsigned tr = static_cast<unsigned>((clamped_px > mx) & (clamped_py > my));
const unsigned tl = static_cast<unsigned>((clamped_px < mx) & (clamped_py > my));
const unsigned dl = static_cast<unsigned>((clamped_px < mx) & (clamped_py < my));
const unsigned none = static_cast<unsigned>(1u ^ (tr | tl | dl));
const unsigned dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next;
idx = (idx << 2) | inv.q;
const unsigned dx = dd >> 1;
const unsigned dy = dd & 1u;
x0 += dx ? sx : 0.0f;
y0 += dy ? sy : 0.0f;
++l;
}
const float scale_recip = 1.0f / static_cast<float>(scale);
return fmaf(static_cast<float>(idx) * scale_recip, lenx, a);
}

static __declspec(noalias) __forceinline int generate_lhs_seeds_lite(const MortonND& map, int dim, float* __restrict S, int stride, unsigned seed) noexcept {
int temp_dim = dim;
const int ns = static_cast<int>(fmaf(static_cast<float>(--temp_dim), fmaf(static_cast<float>(temp_dim), fmaf(static_cast<float>(temp_dim), fmaf(static_cast<float>(temp_dim), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f));
unsigned st = seed;
alignas(16) int permutations[32][256];
int d = 0;
#pragma loop ivdep
while (d < dim) {
int s = 0;
#pragma loop ivdep
while (s < ns) {
permutations[d][s] = s;
++s;
}
s = ns - 1;
while (s > 0) {
st ^= st << 13;
st ^= st >> 17;
st ^= st << 5;
const int j = static_cast<int>(st % static_cast<unsigned>(s + 1));
const int t = permutations[d][s];
permutations[d][s] = permutations[d][j];
permutations[d][j] = t;
--s;
}
++d;
}
int s2 = 0;
#pragma loop ivdep
while (s2 < ns) {
d = 0;
#pragma loop ivdep
while (d < dim) {
st ^= st << 13;
st ^= st >> 17;
st ^= st << 5;
const float u = (st & 0xFFFFFF) * 5.9604645e-8f;
const int stratum = permutations[d][s2];
const float pos = (static_cast<float>(stratum) + u) / static_cast<float>(ns);
const int pd = map.perm[d];
const float lo = map.low[pd];
const float hi = map.high[pd];
S[s2 * stride + d] = fmaf(pos, hi - lo, lo);
++d;
}
++s2;
}
return ns;
}

static __declspec(noalias) __forceinline int generate_heuristic_seeds(const ManipCost& cost, const MortonND& map, int dim, float* __restrict S, int stride, unsigned seed) noexcept {
const int n = cost.n;
const bool VL = cost.variableLen;
const float tx = cost.targetX;
const float ty = cost.targetY;
int total_seeds = 0;
{
float* s0 = S + total_seeds * stride;
float sin_phi;
float cos_phi;
const float rho = sqrtf(fmaf(tx, tx, ty * ty));
FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
const float len = fmaf(1.0f / static_cast<float>(n), rho, 0.0f);
int i = 0;
#pragma loop ivdep
while (i < n) {
s0[i] = (1.0f / static_cast<float>(n)) * phi;
++i;
}
if (VL) {
i = 0;
while (i < n) {
s0[n + i] = len;
++i;
}
}
++total_seeds;
}
{
float* s1 = S + total_seeds * stride;
float sin_phi;
float cos_phi;
FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
int i = 0;
#pragma loop ivdep
while (i < n) {
s1[i] = 0.5f * phi * ((i & 1) ? -1.0f : 1.0f);
++i;
}
if (VL) {
i = 0;
while (i < n) {
s1[n + i] = fmaf(0.4f, static_cast<float>(i) / static_cast<float>(n), 0.8f);
++i;
}
}
++total_seeds;
}
{
float* s2 = S + total_seeds * stride;
const float inv = (n > 1) ? 1.0f / static_cast<float>(n - 1) : 0.0f;
float sin_phi;
float cos_phi;
FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
int i = 0;
#pragma loop ivdep
while (i < n) {
const float pr = static_cast<float>(i) * inv;
s2[i] = fmaf(phi, fmaf(-0.3f, pr, 1.0f), 0.0f);
++i;
}
if (VL) {
int j = 0;
while (j < n) {
float si;
FABE13_SIN(fmaf(1.5f, static_cast<float>(j), 0.0f), si);
s2[n + j] = fmaf(0.2f, si, 1.0f);
++j;
}
}
++total_seeds;
}
const int lhs_count = generate_lhs_seeds_lite(map, dim, S + total_seeds * stride, stride, seed);
total_seeds += lhs_count;
return total_seeds;
}

static declspec(noalias) void agp_run_branch_mpi(
const MortonND& map,
const ManipCost& cost,
int maxIter,
float r,
bool adaptive,
float eps,
unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF,
float& bestX,
float& bestY,
size_t& out_iterations,
float& out_achieved_epsilon,
float M_prior) noexcept {
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter = 0;
unsigned exchange_counter_T = 0;
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12)
M_by_span[msi++] = M_prior;
float Mmax = M_prior;
alignas(16) float q_local[32];
alignas(16) float phi[32];
alignas(16) float s_arr[32];
alignas(16) float c_arr[32];
alignas(16) float sum_s[32];
alignas(16) float sum_c[32];
alignas(16) float q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f;
float y = 0.0f;
int no_improve = 0;
auto t_to_idx = [&](float t) -> unsigned long long {
unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
return idx;
};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
const int k = I->span_level;
if (I->M > M_by_span[k])
M_by_span[k] = I->M;
if (M_by_span[k] > Mmax)
Mmax = M_by_span[k];
};
const float a = 0.0f;
const float b = 1.0f;
float p = 0.0f;
float dmax = b - a;
const float initial_len = dmax;
const float A_dim = fmaf(1.0f / sqrtf(dim_f + 6.75f), 5.535f, 0.0f);
const float A_dim = fmaf(1.0f / sqrtf(dim_f + 7.0f), 3.485f, 0.0f);
const float A_dim_ = fmaf(1.0f / sqrtf(dim_f + 6.75f), 3.425f, 0.0f);
const float B_dim = fmaf(A_dim, 0.7f, 0.0f);
const float B_dim_ = fmaf(A_dim_, 7.0f, 0.0f);
const float B_dim__ = fmaf(A_dim__, 4.325f, 0.0f);
const float log_argument = A_dim - 2.03f;
const float log_argument_ = A_dim_ - 1.9875f;
const float log_argument__ = A_dim__ - 2.03f;
const float C_dim = fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, fmaf(log_argument, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument) - B_dim;
const float C_dim_ = fmaf(log_argument_, fmaf(log_argument_, fmaf(log_argument_, fmaf(log_argument_, fmaf(log_argument_, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument_) - B_dim_;
const float C_dim__ = fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, fmaf(log_argument__, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_argument__) - B_dim__;
const float adaptive_coeff_addition = fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, fmaf(C_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_addition_ = fmaf(C_dim_, fmaf(C_dim_, fmaf(C_dim_, fmaf(C_dim_, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float adaptive_coeff_addition__ = fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, fmaf(C_dim__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
float adaptive_coeff = A_dim - adaptive_coeff_addition;
float adaptive_coeff_ = A_dim_ - adaptive_coeff_addition_;
float adaptive_coeff__ = A_dim__ - adaptive_coeff_addition__;
const float A_dim_clone = fmaf(A_dim - fmaf(-fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, fmaf(B_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim), 0.5f, 0.0f);
int it = 0;
auto evalAt = [&](const float t) -> float {
map.map01ToPoint(t, q_local);
float f = cost(q_local, x, y);
if (f < fmaf(bestF, adaptive_coeff, 0.0f)) {
float eta = fmaf(sqrtf(fmaf(-dim_f, 2.0f, 23.0f)), 0.0215f, 0.0f);
float acc = 0.0f;
int ii = 0;
while (ii < n) {
acc = fmaf(q_local[ii], 1.0f, acc);
phi[ii] = acc;
++ii;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f;
float ac = 0.0f;
int k = n - 1;
while (k >= 0) {
const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
as = fmaf(Lk, s_arr[k], as);
ac = fmaf(Lk, c_arr[k], ac);
sum_s[k] = as;
sum_c[k] = ac;
--k;
}
const float dx = fmaf(x, 1.0f, -cost.targetX);
const float dy = fmaf(y, 1.0f, -cost.targetY);
const float dist = sqrtf(fmaf(dx, dx, fmaf(dy, dy, 0.0f)));
const float inv_dist = 1.0f / dist;
int improve_step = 0;
int no_improve_step = 0;
int stepI = 0;
const float adaptive_window = fmaf(1.0f / fmaf(bestF, adaptive_coeff, 0.0f), fmaf(bestF, adaptive_coeff, -f), -1.0f);
const int threshold_iters_grad = static_cast<int>(fmaf(fmaf(adaptive_window, fmaf(adaptive_window, fmaf(adaptive_window, fmaf(adaptive_window, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), fmaf(sqrtf(dim_f), 5.775f, 0.0f), 0.0f));
while (stepI < threshold_iters_grad) {
int i = 0;
#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = fmaf(ai, 1.0f, -cost.minTheta);
if (v > 0.0f) {
const float scale_arg = fmaf(2.0f / (cost.minTheta), fmaf(v, 0.69314718055994530941723212145818f, 0.0f), 0.0f);
const float exp_val = fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, fmaf(scale_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float dpen_dtheta = fmaf(cost.sharpW, fmaf(exp_val, fmaf(1.0f / (cost.minTheta), 1.3862943611198906188344642429164f, 0.0f), 0.0f), copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float exp_arg = -tsg;
const float exp_val = fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
const float sig = 1.0f / fmaf(exp_val, 1.0f, 1.0f);
gpen = fmaf(-fmaf(cost.archBiasW, cost.archBiasK, 0.0f), sig, gpen);
}
const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), inv_dist, gpen);
q_try[i] = fmaf(-eta, g, q_local[i]);
const float lo = (i == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
const float hi = 2.6179938779914943653855361527329f;
if (q_try[i] < lo) q_try[i] = lo;
else if (q_try[i] > hi) q_try[i] = hi;
++i;
}
if (cost.variableLen) {
int j = 0;
#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), inv_dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2;
float y2;
const float f2 = cost(q_try, x2, y2);
const float rel_improvement = fmaf(-1.0f / f, f2, 1.0f);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
++improve_step;
no_improve_step = 0;
f = f2;
x = x2;
y = y2;
if (improve_step > std::max(threshold_iters_grad >> 1, 2)) eta = dim > 2 ? fmaf(fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), rel_improvement), fmaf(1.0f / sqrtf(dim_f - 1.0f), 2.5285f, 0.0f), 1.0f) : eta = fmaf(fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), rel_improvement), 3.575f, 1.0f);
else { const float eta_arg = dim > 2 ? fmaf(1.0f / sqrtf(dim_f - 1.0f), fmaf(rel_improvement, 1.11375f, 0.0f), 0.0f) : fmaf(rel_improvement, 1.575f, 0.0f); eta = fmaf(eta_arg, fmaf(eta_arg, fmaf(eta_arg, fmaf(eta_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); }
}
else {
memcpy(q_try, q_local, static_cast<size_t>(dim) * sizeof(float));
++no_improve_step;
improve_step = 0;
if (no_improve_step > std::max(threshold_iters_grad >> 2, 2)) { const float eta_arg = fmaf(sqrtf(dim_f), fmaf(rel_improvement, 4.5f, 0.0f), 0.0f); eta = fmaf(eta_arg, fmaf(eta_arg, fmaf(eta_arg, fmaf(eta_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); }
else eta = fmaf(fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, fmaf(rel_improvement, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), rel_improvement), fmaf(sqrtf(dim_f), 1.575f, 0.0f), 1.0f);
if (eta < 0.015f) break;
}
++stepI;
}
if (f < bestF) {
bestF = f;
bestQ.assign(q_local, q_local + dim);
bestX = x;
bestY = y;
no_improve = 0;
}
else ++no_improve;
return f;
}
};
const float f_a = evalAt(a);
const float f_b = evalAt(b);
const int K = static_cast<int>(fmaf(-fmaf(sqrtf(dim_f), dim_f, 0.0f), 0.725f, 10.95f));
H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();
alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, static_cast<unsigned>(fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed))));
int i = 0;
while (i < seedCnt) {
const float* s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
const float t_seed = map.pointToT(s);
const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f) : fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f), exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
const float t1 = fmaf(-interval_size, 1.0f, t_seed);
const float t2 = fmaf(interval_size, 1.0f, t_seed);
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map.map01ToPoint(t1, q1);
const float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(t1, t2, f1, f2);
I->i1 = t_to_idx(t1);
I->i2 = t_to_idx(t2);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
if (i < 3)
I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
else {
const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
const float mult = fmaf(start_mult, exp2f((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)) * static_cast<float>(i - 3)), 0.0f);
I->R = fmaf(I->R, mult, 0.0f);
}
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
if (f1 < bestF) {
bestF = f1;
bestQ.assign(q1, q1 + dim);
bestX = x1;
bestY = y1;
}
if (f2 < bestF) {
bestF = f2;
bestQ.assign(q2, q2 + dim);
bestX = x2;
bestY = y2;
}
++i;
}
float prev_t = a;
float prev_f = f_a;
int k = 1;
while (k <= K) {
const float t = fmaf(fmaf(b - a, static_cast<float>(k) / static_cast<float>(K + 1), a), 1.0f, static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
const float f = evalAt(t);
IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
I->i1 = t_to_idx(prev_t);
I->i2 = t_to_idx(t);
I->diam = map.block_diameter(I->i1, I->i2);
I->compute_span_level(map);
I->set_metric(I->diam);
update_pockets_and_Mmax(I);
I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(I);
std::push_heap(H.begin(), H.end(), ComparePtrND);
prev_t = t;
prev_f = f;
++k;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);
const int noImproveThrDim = static_cast<int>(fmaf(32.5f, exp2f(-0.1f * dim_f), 0.0f));
while (it < maxIter) {
p = fmaf(-1.0f / initial_len, dmax, 1.0f);
const float p_arg = fmaf(p, 2.3f, -2.9775f);
const float r_eff = dim > 2 ? fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, fmaf(sqrtf(dim_f - 1), r, 0.0f), 0.0f) : fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.05f, r, 0.0f);
const bool staganition = no_improve > noImproveThrDim;
if (staganition) {
const float t_best = map.pointToT(bestQ.data());
const float log_arg = fmaf(dim_f, 1.0f, -1.0f);
const float adaptive_diameter = fmaf(fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, fmaf(log_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), log_arg), dim > 2 ? fmaf(adaptive_coeff, 0.00475f, 0.0f) : 0.00545f, 0.0f);
int ii = 0;
while (ii < 2) {
const float off = (ii == 0) ? fmaf(adaptive_diameter, 2.0f, 0.0f) : fmaf(-adaptive_diameter, 2.0f, 0.0f);
const float t_seed = fmaf(off, 1.0f, t_best);
const float f_seed = evalAt(t_seed);
IntervalND* J = new IntervalND(fmaf(-adaptive_diameter, 1.0f, t_seed), fmaf(adaptive_diameter, 1.0f, t_seed), f_seed, f_seed);
J->i1 = t_to_idx(fmaf(-adaptive_diameter, 1.0f, t_seed));
J->i2 = t_to_idx(fmaf(adaptive_diameter, 1.0f, t_seed));
J->diam = map.block_diameter(J->i1, J->i2);
J->compute_span_level(map);
J->set_metric(J->diam);
update_pockets_and_Mmax(J);
J->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
J->R = fmaf(J->R, 2.03f - adaptive_coeff__, 0.0f);
H.emplace_back(J);
std::push_heap(H.begin(), H.end(), ComparePtrND);
++ii;
}
no_improve = 0;
}
const float exp_arg = fmaf(B_dim, p, 0.0f);
//const float exp_arg_ = fmaf(B_dim_, p, 0.0f);
//const float exp_arg_ = fmaf(-p, 1.0f, 1.0f);
const float exp_arg__ = fmaf(B_dim__, p, 0.0f);
adaptive_coeff = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim);
//adaptive_coeff_ = fmaf(-fmaf(exp_arg_, fmaf(exp_arg_, fmaf(exp_arg_, fmaf(exp_arg_, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition_, A_dim_);

text
	float first_sqrt = sqrtf(fmaf(1.0f / dim_f, 2.0f, 0.0f));
	float second_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 5.0f, 0.0f));
	float third_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 9.0f, 0.0f));
	float fourth_sqrt = sqrtf(fmaf(1.0f / (dim_f + 7.0f), 6.5f, 0.0f));
	float rr = sqrtf(fmaf(-p, 1.0f, 1.0f)), xx = p * p, tt = fmaf(500.0f, p, -486.95472f);
	adaptive_coeff_ = (p < 0.95f) ? fmaf(fmaf(first_sqrt, xx, 0.0f), 0.0130349902f, fmaf(-0.04f, p, fmaf(fmaf(first_sqrt, rr, 0.0f), 0.15f, 1.1f))) : (p < 0.97390944f) ? fmaf(second_sqrt, rr, 0.9396f) : (p < 0.97590944f) ? fmaf(fmaf(fmaf(fmaf(third_sqrt, tt, 0.0f), tt, 0.0f), fmaf(-2.0f, tt, 3.0f), 0.0f), fmaf(0.25f, rr, -0.0396f), fmaf(fmaf(third_sqrt, rr, 0.0f), 0.75f, 0.9396f)) : fmaf(fourth_sqrt, rr, 0.925f);
	//adaptive_coeff_ = (p < 0.95f) ? fmaf(1.0761439f, sqrtf(exp_arg_), fmaf(0.7292799f, p, 0.1738561f)) :
		//(p < 0.975f) ? fmaf(0.75f, sqrtf(exp_arg_), 0.9396f) :
		//fmaf(1.0f, sqrtf(exp_arg_), 0.9f);
	adaptive_coeff__ = fmaf(-fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, fmaf(exp_arg__, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition__, A_dim__);
	const float arg_for_T = fmaf(p, 4.5f, 2.475f);
	const float exp_argument = fmaf(-0.2925f, dim_f, 0.0f);
	const float exp2_exp_arg = fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(exp_argument, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
	const float A = fmaf(39.995f, exp2_exp_arg, 0.0f);
	const int T = static_cast<int>(fmaf(fmaf(fmaf(1.0f / fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, fmaf(arg_for_T, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 2.0f), 1.0498f, -0.04978f), fmaf(sqrtf(dim_f), 4.88f, 0.0f), 0.0f), fmaf(-(exp2_exp_arg - 1.0f), A, A), 0.0f));
	std::pop_heap(H.begin(), H.end(), ComparePtrND);
	IntervalND* cur = H.back();
	H.pop_back();
	const float x1 = cur->x1;
	const float x2 = cur->x2;
	const float y1 = cur->y1;
	const float y2 = cur->y2;
	float m = fmaf(r_eff, Mmax, 0.0f);
	float tNew = step(m, x1, x2, y1, y2, dim_f, r_eff);
	const float bestFOld = bestF;
	const float fNew = evalAt(tNew);
	IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
	IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);
	L->i1 = t_to_idx(x1);
	L->i2 = t_to_idx(tNew);
	Rv->i1 = t_to_idx(tNew);
	Rv->i2 = t_to_idx(x2);
	L->diam = map.block_diameter(L->i1, L->i2);
	Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
	L->compute_span_level(map);
	Rv->compute_span_level(map);
	L->set_metric(L->diam);
	Rv->set_metric(Rv->diam);
	const float Mloc = fmaxf(L->M, Rv->M);
	update_pockets_and_Mmax(L);
	update_pockets_and_Mmax(Rv);
	const float prevMmax = Mmax;
	if (Mloc > Mmax)
		Mmax = Mloc;
	m = fmaf(r_eff, Mmax, 0.0f);
	if (adaptive) {
		const float len1 = fmaf(tNew, 1.0f, -x1);
		const float len2 = fmaf(x2, 1.0f, -tNew);
		if (fmaf(len1, 1.0f, len2) == dmax) {
			dmax = fmaxf(len1, len2);
			for (auto pI : H) {
				const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
				if (Ls > dmax)
					dmax = Ls;
			}
		}
		if ((p > 0.7f && !(it % 3) && dmax < 0.7f) || p > 0.9f) {
			const float alpha = p * p;
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r_eff, -MULT);
			const float GF = beta * global_coeff;
			L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
			Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
			const size_t sz = H.size();
			RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtrND);
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
				if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
					const size_t sz = H.size();
					RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (Mloc > prevMmax) {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
			if (Mloc > fmaf(adaptive_coeff_, prevMmax, 0.0f)) {
				const size_t sz = H.size();
				RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
		}
		else {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
		}
	}
	H.emplace_back(L);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	H.emplace_back(Rv);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	_mm_prefetch((const char*)H[0], _MM_HINT_T0);
	_mm_prefetch((const char*)H[1], _MM_HINT_T0);
	IntervalND* const top = H.front();
	const float interval_len = dim > 1 ? fmaf(top->x2, 1.0f, -top->x1) : top->diam;
	if (it == maxIter) {
		memcpy(q_local, bestQ.data(), static_cast<size_t>(dim) * sizeof(float));
		float x_final = bestX;
		float y_final = bestY;
		float f_final = bestF;
		const int last = n - 1;
		const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
		const float hi = 2.6179938779914943653855361527329f;
		float bestLocF = f_final;
		float saved = q_local[last];
		float delta = 0.05f;
		while (delta >= 0.00625f) {
			int sgn = -1;
			while (sgn < 2) {
				float cand = fmaf(static_cast<float>(sgn), delta, saved);
				if (cand < lo) cand = lo;
				else if (cand > hi) cand = hi;
				const float backup = q_local[last];
				q_local[last] = cand;
				float x2;
				float y2;
				const float f2 = cost(q_local, x2, y2);
				if (f2 < bestLocF) {
					bestLocF = f2;
					x_final = x2;
					y_final = y2;
					saved = cand;
				}
				q_local[last] = backup;
				sgn += 2;
			}
			delta *= 0.5f;
		}
		if (bestLocF < f_final) {
			q_local[last] = saved;
			f_final = bestLocF;
			bestF = f_final;
			bestX = x_final;
			bestY = y_final;
			bestQ.assign(q_local, q_local + dim);
		}
		out_iterations = static_cast<size_t>(it);
		out_achieved_epsilon = interval_len;
		return;
	}
	if (!(it % T) || staganition) {
		MultiCrossMsg out;
		const unsigned intervals_to_send = dim < 12 ? static_cast<unsigned>(sqrtf(fmaf(dim_f, 5.5f, 0.0f))) : 7u;
		out.count = intervals_to_send;
		float* dest = out.intervals;
		unsigned ii = 0u;
		while (ii < intervals_to_send) {
			IntervalND* Tt = H[ii];
			dest[0] = Tt->x1;
			dest[1] = 0.0f;
			dest[2] = Tt->x2;
			dest[3] = 0.0f;
			dest[4] = Tt->R;
			dest += 5;
			++ii;
		}
		const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
		bool active = true;
		const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1);
		size_t ii2 = 0u;
		while (ii2 < iterations && active) {
			const size_t step = 1ULL << ii2;
			const int partner = rank ^ static_cast<int>(step);
			if (partner < world) {
				const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
				if (am_sender) {
					g_world->isend(partner, 0, out);
					active = false;
				}
			}
			++ii2;
		}
		++exchange_counter_T;
	}
	if (bestF < fmaf(bestFOld, 2.03f - adaptive_coeff__, 0.0f)) {
		BestSolutionMsg out;
		out.bestF = bestF;
		out.bestX = bestX;
		out.bestY = bestY;
		out.dim = static_cast<unsigned>(bestQ.size());
		memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
		{
			IntervalND* const I0 = H[0];
			out.bestI[0] = I0->x1;
			out.bestI[1] = I0->x2;
			out.bestI[2] = I0->y1;
			out.bestI[3] = I0->y2;
			IntervalND* const I1 = H[1];
			out.bestI[4] = I1->x1;
			out.bestI[5] = I1->x2;
			out.bestI[6] = I1->y1;
			out.bestI[7] = I1->y2;
		}
		const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
		bool active = true;
		const bool invert_T = (static_cast<int>(fmaf(static_cast<float>(exchange_counter), 1.0f, 1.0f)) & 1);
		size_t ii2 = 0u;
		while (ii2 < iterations && active) {
			const size_t step = 1ULL << ii2;
			const int partner = rank ^ static_cast<int>(step);
			if (partner < world) {
				const bool am_sender = ((!!(rank & static_cast<int>(step))) ^ invert_T);
				if (am_sender) {
					g_world->isend(partner, 2, out);
					active = false;
				}
			}
			++ii2;
		}
		++exchange_counter;
	}
	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		MultiCrossMsg in;
		g_world->recv(boost::mpi::any_source, 0, in);
		const MultiCrossMsg& mX = in;
		unsigned ii = 0u;
		while (ii < mX.count) {
			const float* d = &mX.intervals[ii * 5];
			float sx = d[0];
			float ex = d[2];
			alignas(16) float tmp[32];
			float tx;
			float ty;
			map.map01ToPoint(sx, tmp);
			const float y1i = cost(tmp, tx, ty);
			map.map01ToPoint(ex, tmp);
			const float y2i = cost(tmp, tx, ty);
			IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
			inj->i1 = t_to_idx(sx);
			inj->i2 = t_to_idx(ex);
			inj->diam = map.block_diameter(inj->i1, inj->i2);
			inj->compute_span_level(map);
			inj->set_metric(inj->diam);
			update_pockets_and_Mmax(inj);
			inj->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
			_mm_prefetch((const char*)H[0], _MM_HINT_T0);
			_mm_prefetch((const char*)H[1], _MM_HINT_T0);
			if (inj->R > fmaf(adaptive_coeff, H.front()->R, 0.0f) || staganition) {
				const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
				const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
				const float exp_arg = fmaf(B_dim, fmaf(1.0f / static_cast<float>(mX.count - 1u), static_cast<float>(ii), 0.0f), 0.0f);
				float adaptive_coeff_clone = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
				inj->R = fmaf(d[4], fmaf(kf, adaptive_coeff_clone, 0.0f), 0.0f);
				H.emplace_back(inj);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
			++ii;
		}
	}
	while (g_world->iprobe(boost::mpi::any_source, 2)) {
		BestSolutionMsg bm;
		g_world->recv(boost::mpi::any_source, 2, bm);
		if (bm.bestF < fmaf(bestF, 2.03f - adaptive_coeff__, 0.0f) || staganition) {
			_mm_prefetch((const char*)H[0], _MM_HINT_T0);
			_mm_prefetch((const char*)H[1], _MM_HINT_T0);
			const float poly = fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), fmaf(fmaf(p, 0.69314718055994530941723212145818f, 0.0f), 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
			const float kf = staganition ? fmaf(0.5819767068693265f, poly, 0.4f) : fmaf(0.3891860241215959f, poly, 0.5f);
			float adaptive_coeff_clone0 = fmaf(-1.0f, adaptive_coeff_addition, A_dim_clone);
			const float exp_arg = B_dim;
			float adaptive_coeff_clone1 = fmaf(-fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, fmaf(exp_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), adaptive_coeff_addition, A_dim_clone);
			{
				const float sx = bm.bestI[0];
				const float ex = bm.bestI[1];
				const float y1i = bm.bestI[2];
				const float y2i = bm.bestI[3];
				IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
				I->i1 = t_to_idx(sx);
				I->i2 = t_to_idx(ex);
				I->diam = map.block_diameter(I->i1, I->i2);
				I->compute_span_level(map);
				I->set_metric(I->diam);
				update_pockets_and_Mmax(I);
				I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone0, 0.0f), 0.0f);
				H.emplace_back(I);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
			{
				const float sx = bm.bestI[4];
				const float ex = bm.bestI[5];
				const float y1i = bm.bestI[6];
				const float y2i = bm.bestI[7];
				IntervalND* I = new IntervalND(sx, ex, y1i, y2i);
				I->i1 = t_to_idx(sx);
				I->i2 = t_to_idx(ex);
				I->diam = map.block_diameter(I->i1, I->i2);
				I->compute_span_level(map);
				I->set_metric(I->diam);
				update_pockets_and_Mmax(I);
				I->ChangeCharacteristic(fmaf(r_eff, Mmax, 0.0f));
				I->R = fmaf(I->R, fmaf(kf, adaptive_coeff_clone1, 0.0f), 0.0f);
				H.emplace_back(I);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
			if (bm.bestF < bestF) {
				bestF = bm.bestF;
				bestX = bm.bestX;
				bestY = bm.bestY;
				bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
			}
		}
	}
	++it;
}

}

static __declspec(noalias) __forceinline float PivotCalculation(std::vector<IntervalND*>::iterator first, std::vector<IntervalND*>::iterator last) noexcept {
const auto mid = first + ((last - first) >> 1);
float pivot_value = NAN;
if (last - first < 199)
pivot_value = (*mid)->R;
else {
if ((*first)->R < (*mid)->R) {
if ((*mid)->R < (*last)->R)
pivot_value = (*mid)->R;
else
pivot_value = std::max((*first)->R, (*last)->R);
}
else {
if ((*first)->R < (*last)->R)
pivot_value = (*first)->R;
else
pivot_value = std::max((*mid)->R, (*last)->R);
}
}
return pivot_value;
}

static __declspec(noalias) __forceinline void HoaraSort(std::vector<IntervalND*>::iterator first, std::vector<IntervalND*>::iterator last) noexcept {
if (first >= last)
return;
const float pivot_value = PivotCalculation(first, last);
auto left = first;
auto right = last;
do {
while (left < last && (*left)->R < pivot_value)
++left;
while (right > first && (*right)->R > pivot_value)
--right;
if ((*left)->R == (*right)->R && left != right) {
if ((left)->R < ((left + 1))->R)
++left;
else
--right;
}
std::iter_swap(left, right);
} while (left != right);
if (last - first < 199) {
HoaraSort(first, right);
HoaraSort(left + 1, last);
}
else {
oneapi::tbb::parallel_invoke(&first, &right { HoaraSort(first, right); }, &left, &last { HoaraSort(left + 1, last); });
}
}

extern "C" __declspec(dllexport) __declspec(noalias) void AGP_Manip2D(
int nSegments,
bool variableLengths,
float minTheta,
float targetX,
float targetY,
int maxIterPerBranch,
float r,
bool adaptiveMode,
float epsilon,
unsigned int seed,
float baseLength,
float stretchFactor,
float** out_bestQ,
size_t* out_bestQLen,
float* out_bestX,
float* out_bestY,
float* out_bestF,
size_t* out_iterations,
float* out_achieved_epsilon) noexcept {
Slab* slab = tls.local();
slab->current = slab->base;
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg dummy;
g_world->recv(boost::mpi::any_source, 0, dummy);
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg dummy;
g_world->recv(boost::mpi::any_source, 2, dummy);
}
const int dim = nSegments + (variableLengths ? nSegments : 0);
g_mc.permCache.resize(static_cast<size_t>(dim));
int i = 0;
while (i < dim) {
g_mc.permCache[i] = i;
++i;
}
unsigned s = g_mc.baseSeed;
i = dim - 1;
while (i > 0) {
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
const unsigned j = s % static_cast<unsigned>(i + 1);
std::swap(g_mc.permCache[i], g_mc.permCache[j]);
--i;
}
g_mc.invMaskCache.resize(static_cast<size_t>(dim));
int k = 0;
while (k < dim) {
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
g_mc.invMaskCache[k] = static_cast<unsigned long long>(s);
++k;
}
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> low;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> high;
low.reserve(static_cast<size_t>(dim));
high.reserve(static_cast<size_t>(dim));
i = 0;
while (i < nSegments) {
low.emplace_back(i == 0 ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f);
high.emplace_back(2.6179938779914943653855361527329f);
++i;
}
if (variableLengths) {
i = 0;
const float lengthLower = baseLength / stretchFactor;
const float lengthUpper = baseLength * stretchFactor;
while (i < nSegments) {
low.emplace_back(lengthLower);
high.emplace_back(lengthUpper);
++i;
}
}
const ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);
const int rank = g_world->rank();
const int world = g_world->size();
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ;
float bestF = FLT_MAX;
float bx = 0.0f;
float by = 0.0f;
const float exp_arg_lvls = fmaf(-static_cast<float>(dim), 0.455f, 0.0f);
const float exp_arg_lvl0 = fmaf(-static_cast<float>(dim), 0.08f, 0.0f);
const int peanoLevels = static_cast<int>(fminf(fmaf(fmaf(exp_arg_lvls, fmaf(exp_arg_lvls, fmaf(exp_arg_lvls, fmaf(exp_arg_lvls, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 30.0f, 6.125f), 13.0f));
const int levels0 = static_cast<int>(fminf(fmaf(-fminf(fmaf(fmaf(exp_arg_lvl0, fmaf(exp_arg_lvl0, fmaf(exp_arg_lvl0, fmaf(exp_arg_lvl0, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 21.7f, -11.3f), 0.0f), 1.0f, static_cast<float>(peanoLevels)), 8.0f));
const MortonND map0(dim, levels0, low.data(), high.data(), g_mc);
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>> H_coarse;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ_coarse;
float bestF_coarse = FLT_MAX;
float bx_coarse = 0.0f;
float by_coarse = 0.0f;
size_t total_oi = 0u;
float total_oe = 0.0f;
size_t oi = 0u;
float oe = 0.0f;
const float M_arg = ldexpf(1.0f, -levels0);
const float M_prior = fmaf(fmaf(variableLengths ? 2.8284271f : 2.0f, static_cast<float>(nSegments), 0.0f), fmaf(M_arg, fmaf(M_arg, fmaf(M_arg, fmaf(M_arg, fmaf(M_arg, 0.164056f, -0.098462f), 0.240884f), -0.351834f), 0.999996f), M_arg), 0.0f);
agp_run_branch_mpi(map0, cost, maxIterPerBranch >> 1, r, adaptiveMode, epsilon, seed, H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, oi, oe, M_prior);
total_oi += oi;
total_oe = oe;
if (bestF_coarse < bestF) {
bestF = bestF_coarse;
bestQ = std::move(bestQ_coarse);
bx = bx_coarse;
by = by_coarse;
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg dummy;
g_world->recv(boost::mpi::any_source, 0, dummy);
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg dummy;
g_world->recv(boost::mpi::any_source, 2, dummy);
}
const MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>> H_fine;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ_fine = bestQ;
float bestF_fine = bestF;
float bx_fine = bx;
float by_fine = by;
size_t oi_fine = 0u;
float oe_fine = 0.0f;
const float M_prior_fine = fmaf(1.0f / static_cast<float>(peanoLevels), fmaf(static_cast<float>(levels0), M_prior, 0.0f), 0.0f);
HoaraSort(H_coarse.begin(), H_coarse.end() - 1);
const float inv_dim = 1.0f / static_cast<float>(dim + 1);
size_t ci = static_cast<size_t>(fmaf(static_cast<float>(H_coarse.size()), fmaf(inv_dim, fmaf(inv_dim, fmaf(inv_dim, fmaf(inv_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) + 1.0f, -0.7f));
while (ci < H_coarse.size()) {
const IntervalND* C = H_coarse[ci];
alignas(16) float q1[32];
alignas(16) float q2[32];
float x1;
float y1;
float x2;
float y2;
map1.map01ToPoint(C->x1, q1);
const float f1 = cost(q1, x1, y1);
map1.map01ToPoint(C->x2, q2);
const float f2 = cost(q2, x2, y2);
IntervalND* I = new IntervalND(C->x1, C->x2, f1, f2);
I->i1 = static_cast<unsigned long long>(fmaf(C->x1, static_cast<float>(map1.scale), 0.0f));
I->i2 = static_cast<unsigned long long>(fmaf(C->x2, static_cast<float>(map1.scale), 0.0f));
I->diam = map1.block_diameter(I->i1, I->i2);
I->set_metric(I->diam);
H_fine.emplace_back(I);
if (f1 < bestF_fine) {
bestF_fine = f1;
bestQ_fine.assign(q1, q1 + dim);
bx_fine = x1;
by_fine = y1;
}
if (f2 < bestF_fine) {
bestF_fine = f2;
bestQ_fine.assign(q2, q2 + dim);
bx_fine = x2;
by_fine = y2;
}
++ci;
}
std::make_heap(H_fine.begin(), H_fine.end(), ComparePtrND);
agp_run_branch_mpi(map1, cost, maxIterPerBranch >> 1, r, adaptiveMode, epsilon, seed, H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, oi_fine, oe_fine, M_prior_fine);
total_oi += oi_fine;
total_oe = oe_fine;
if (bestF_fine < bestF) {
bestF = bestF_fine;
bestQ = std::move(bestQ_fine);
bx = bx_fine;
by = by_fine;
}
BestSolutionMsg best;
best.bestF = bestF;
best.bestX = bx;
best.bestY = by;
best.dim = static_cast<unsigned>(bestQ.size());
memcpy(best.bestQ, bestQ.data(), static_cast<size_t>(best.dim) * sizeof(float));
const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;
size_t itx = 0;
while (itx < iterations && active) {
const size_t step = 1ULL << itx;
const int partner = rank ^ static_cast<int>(step);
if (partner < world) {
const bool am_sender = (rank & static_cast<int>(step)) != 0;
if (am_sender) {
g_world->isend(partner, 3, best);
active = false;
}
else {
BestSolutionMsg in;
g_world->recv(partner, 3, in);
if (in.bestF < best.bestF)
best = in;
}
}
++itx;
}
if (rank == 0) {
*out_bestQLen = static_cast<size_t>(best.dim);
out_bestQ = static_cast<float>(CoTaskMemAlloc(sizeof(float) * (*out_bestQLen)));
memcpy(*out_bestQ, best.bestQ, sizeof(float) * (*out_bestQLen));
*out_bestX = best.bestX;
*out_bestY = best.bestY;
*out_bestF = best.bestF;
*out_iterations = total_oi;
*out_achieved_epsilon = total_oe;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) _forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) noexcept {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
MM_SET_FLUSH_ZERO_MODE(MM_FLUSH_ZERO_ON);
MM_SET_DENORMALS_ZERO_MODE(MM_DENORMALS_ZERO_ON);
const int rank = g_world->rank();
const int world_size = g_world->size();
if (world_size == 4)
new (&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3);
g_mc.baseSeed = static_cast<unsigned>(fmaf(0x9E3779B9u, static_cast<float>(rank), 0x9E3779B9u));
if (rank == 0) {
wchar_t buf[MAX_PATH]{};
GetModuleFileNameW(nullptr, buf, MAX_PATH);
std::wstring ws(buf);
auto pos = ws.find_last_of(L"\/");
if (pos != std::wstring::npos)
ws.resize(pos);
int n = WideCharToMultiByte(CP_UTF8, 0, ws.c_str(), -1, nullptr, 0, nullptr, nullptr);
g_exeDirCache.resize(n, '\0');
WideCharToMultiByte(CP_UTF8, 0, ws.c_str(), -1, g_exeDirCache.data(), n, nullptr, nullptr);
if (!g_exeDirCache.empty() && g_exeDirCache.back() == '\0')
g_exeDirCache.pop_back();
g_pyInterpreter = new pybind11::scoped_interpreter{};
pybind11::module sys = pybind11::module::import("sys");
pybind11::list path = sys.attr("path");
const std::string& exeDir = g_exeDirCache;
path.attr("insert")(0, pybind11::str(exeDir + "\env\Lib\site-packages"));
path.attr("insert")(0, pybind11::str(exeDir + "\env\Scripts"));
path.attr("insert")(0, pybind11::str(exeDir + "\env"));
path.attr("append")(pybind11::str(exeDir));
pybind11::module::import("warnings").attr("filterwarnings")("ignore");
g_pyOptimizerBridge = new pybind11::module(pybind11::module::import("optimizer_bridge"));
}
return rank;
}

static __declspec(noalias) __forceinline float ShekelFunc(float x, float seed) noexcept {
int i = 0;
float st = seed;
float r1;
float r2;
float res = 0.0f;
#pragma loop ivdep
while (i < 10) {
XOR_RAND(st, r1);
const float xp = fmaf(-r1, 10.0f, x);
XOR_RAND(st, r1);
XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
res -= (1.0f / d) * 1.0f;
++i;
}
return res;
}

static __declspec(noalias) __forceinline float RastriginFunc(float x1, float x2) noexcept {
const float t = fmaf(x1, x1, x2 * x2);
float c1;
float c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(float x, float seed) noexcept {
int j = 0;
alignas(16) float ang[14];
const float st_ang = 6.28318530717958647692f * x;
while (j < 14) {
ang[j] = st_ang * static_cast<float>(j + 1);
++j;
}
alignas(16) float sv[14];
alignas(16) float cv[14];
FABE13_SINCOS(ang, sv, cv, 14);
float state = seed;
float r1;
float r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;
#pragma loop ivdep
while (j >= 0) {
XOR_RAND(state, r1);
XOR_RAND(state, r2);
res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
--j;
}
return res;
}

static __declspec(noalias) __forceinline float GrishaginFunc(float x1, float x2, float seed) noexcept {
int j = 0;
alignas(16) float aj[8];
alignas(16) float ak[8];
#pragma loop ivdep
while (j < 8) {
const float pj = 3.14159265358979323846f * static_cast<float>(j + 1);
aj[j] = pj * x1;
ak[j] = pj * x2;
++j;
}
alignas(16) float sj[8];
alignas(16) float cj[8];
alignas(16) float sk[8];
alignas(16) float ck[8];
FABE13_SINCOS(aj, sj, cj, 8);
FABE13_SINCOS(ak, sk, ck, 8);
--j;
float p1 = 0.0f;
float p2 = 0.0f;
float st = seed;
float r1;
float r2;
#pragma loop ivdep
while (j >= 0) {
size_t k2 = 0u;
while (k2 < 8u) {
const float s = sj[j] * sj[j];
const float c = ck[k2] * ck[k2];
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p2 = fmaf(-r1, c, fmaf(r2, s, p2));
++k2;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias) void AGP_1D(
float global_iterations,
float a,
float b,
float r,
bool mode,
float epsilon,
float seed,
float** out_data,
size_t* out_len) noexcept {
Slab* slab = tls.local();
slab->current = slab->base;
int counter = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D*, boost::alignment::aligned_allocator<Interval1D*, 16u>> R;
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval1D(a, b, start_val, best_f, 1.0f));
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float new_point = step(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
const float new_value = ShekelFunc(new_point, seed);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_point);
}
std::pop_heap(R.begin(), R.end(), ComparePtr1D);
const Interval1D* pro = R.back();
const float new_x1 = pro->x1;
const float new_x2 = pro->x2;
const float len2 = new_x2 - new_point;
const float len1 = new_point - new_x1;
const float interval_len = (len1 < len2 ? len1 : len2);
if (++counter == static_cast<int>(global_iterations) || interval_len < epsilon) {
Extr.emplace_back(static_cast<float>(counter));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = static_cast<float>(CoTaskMemAlloc(sizeof(float) * (out_len)));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
return;
}
Interval1D curr = new Interval1D(new_x1, new_point, pro->y1, new_value, 1.0f);
Interval1D curr1 = new Interval1D(new_point, new_x2, new_value, pro->y2, 1.0f);
const float currM = curr->M > curr1->M ? curr->M : curr1->M;
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) {
dmax = len2 > len1 ? len2 : len1;
for (auto pI : R) {
const float L = pI->x2 - pI->x1;
if (L > dmax)
dmax = L;
}
}
if ((threshold_03 > dmax && !(counter % 3)) || 10.0f * dmax < initial_length) {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
}
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r, -MULT);
const float GF = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GF, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.back() = curr;
std::push_heap(R.begin(), R.end(), ComparePtr1D);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr1D);
const Interval1D top = R.front();
x_Rmax_1 = top->x1;
x_Rmax_2 = top->x2;
y_Rmax_1 = top->y1;
y_Rmax_2 = top->y2;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) void AGP_2D(
float N,
float global_iterations,
float a,
float b,
float c,
float d,
float r,
bool mode,
float epsilon,
float seed,
float** out_data,
size_t* out_len) noexcept {
Slab* slab = tls.local();
slab->current = slab->base;
int counter = 0;
int no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg dummy;
g_world->recv(boost::mpi::any_source, 0, dummy);
}
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider;
const float y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition;
const float true_end = b - x_addition;
float x_Rmax_1 = true_start;
float x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = (rank % 3) ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = (rank % 2) ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D* __restrict, boost::alignment::aligned_allocator<Interval1D* __restrict, 16u>> R;
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval1D(true_start, true_end, start_val, best_f, 2.0f));
const Interval1D* __restrict top_ptr = R.front();
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && counter > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* d2 = &mX.intervals[ii * 5];
float sx = d2[0];
float ex = d2[2];
if (ex > sx) {
Interval1D* __restrict injected = new Interval1D(sx, ex, RastriginFunc(d2[0], d2[1]), RastriginFunc(d2[2], d2[3]), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float poly = fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f;
const float k = stagnation ? fmaf(0.5819767068693265f, poly, 0.3f) : fmaf(0.3491860241215959f, poly, 0.6f);
injected->R = d2[4] * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr1D);
}
}
++ii;
}
}
const int T = static_cast<int>(fmaf(-(fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, fmaf(p * 0.69314718055994530941723212145818f, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f) - 1.0f), 264.0f, 277.0f));
const bool want_term = interval_len < epsilon || counter == static_cast<int>(global_iterations);
if (!(++counter % T) || stagnation) {
if (!want_term) {
MultiCrossMsg out;
float s_x1;
float s_x2;
float e_x1;
float e_x2;
HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.intervals[0] = s_x1;
out.intervals[1] = s_x2;
out.intervals[2] = e_x1;
out.intervals[3] = e_x2;
out.intervals[4] = top_ptr->R;
out.count = 1;
int i2 = 0;
while (i2 < world_size) {
if (i2 != rank)
g_world->isend(i2, 0, out);
++i2;
}
}
}
if (want_term) {
if (!rank) {
Extr.emplace_back(static_cast<float>(counter));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = static_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len)));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
}
return;
}
const float new_point = step(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val;
float new_x2_val;
HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_x1_val);
Extr.emplace_back(new_x2_val);
no_improve = 0;
}
else
++no_improve;
std::pop_heap(R.begin(), R.end(), ComparePtr1D);
Interval1D __restrict intermediate = R.back();
const float segment_x1 = intermediate->x1;
const float segment_x2 = intermediate->x2;
const float len2 = segment_x2 - new_point;
const float len1 = new_point - segment_x1;
Interval1D __restrict curr = new Interval1D(segment_x1, new_point, intermediate->y1, new_value, 2.0f);
Interval1D __restrict curr1 = new Interval1D(new_point, segment_x2, new_value, intermediate->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M);
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) {
dmax = (std::max)(len1, len2);
for (auto pI : R) {
const float L = pI->x2 - pI->x1;
if (L > dmax)
dmax = L;
}
}
if ((threshold_03 > dmax && !(counter % 3)) || 10.0f * dmax < initial_length) {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
}
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.back() = curr;
std::push_heap(R.begin(), R.end(), ComparePtr1D);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr1D);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}

__declspec(align(16)) struct RunParams final sealed{
int nSegments;
unsigned varLen;
float minTheta;
float tx;
float ty;
int maxIter;
float r;
unsigned adaptive;
float eps;
unsigned seed;
float baseLength;
float stretchFactor;

text
	template <typename Archive>
	void serialize(Archive& ar, unsigned int) {
			ar& nSegments& varLen& minTheta& tx& ty& maxIter& r& adaptive& eps& seed& baseLength& stretchFactor;
	}

};

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartManipND(
int nSegments,
bool variableLengths,
float minTheta,
float targetX,
float targetY,
int maxIterPerBranch,
float r,
bool adaptiveMode,
float epsilon,
unsigned int seed,
float baseLength,
float stretchFactor) noexcept {
RunParams p;
p.nSegments = nSegments;
p.varLen = static_cast<unsigned>(variableLengths);
p.minTheta = minTheta;
p.tx = targetX;
p.ty = targetY;
p.maxIter = maxIterPerBranch;
p.r = r;
p.adaptive = static_cast<unsigned>(adaptiveMode);
p.eps = epsilon;
p.seed = seed;
p.baseLength = baseLength;
p.stretchFactor = stretchFactor;
int i = 1;
const int world = g_world->size();
while (i < world) {
g_world->isend(i, 1, p);
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept {
RunParams p;
float* __restrict q;
size_t qlen;
float bx;
float by;
float bf;
size_t oi;
float oa;
while (true) {
if (g_world->iprobe(0, 1)) {
g_world->recv(0, 1, p);
AGP_Manip2D(p.nSegments, static_cast<bool>(p.varLen), p.minTheta, p.tx, p.ty, p.maxIter, p.r, static_cast<bool>(p.adaptive), p.eps, p.seed, p.baseLength, p.stretchFactor, &q, &qlen, &bx, &by, &bf, &oi, &oa);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun2D() noexcept {
int dummy;
float* __restrict buf;
size_t len;
while (true) {
if (g_world->iprobe(0, 1)) {
g_world->recv(0, 1, dummy);
AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, static_cast<float>(GetTickCount()), &buf, &len);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept {
int i = 1;
const int world = g_world->size();
while (i < world) {
g_world->isend(i, 1, 0);
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) noexcept {
CoTaskMemFree(p);
}

extern "C" __declspec(dllexport) __declspec(noalias) void RunPythonOptimization(
const char* backend,
int n_seg,
bool var_len,
float min_theta,
float tx,
float ty,
int levels,
int max_iter,
float r_param,
float eps,
bool adaptive,
float baseLength,
float stretchFactor,
float* __restrict out_bestF,
float* __restrict out_bestX,
float* __restrict out_bestY,
float* __restrict out_angles,
float* __restrict out_lengths,
int* __restrict out_iterations,
float* __restrict out_eps,
float* __restrict out_micros) noexcept {
pybind11::gil_scoped_acquire gil;
pybind11::dict result;
if (backend && strcmp(backend, "optuna") == 0) {
result = (*g_pyOptimizerBridge).attr("run_optuna")(n_seg, var_len, min_theta, tx, ty, max_iter, baseLength, stretchFactor);
}
else {
result = (*g_pyOptimizerBridge).attr("run_iopt")(n_seg, var_len, min_theta, tx, ty, levels, max_iter, r_param, eps, adaptive, baseLength, stretchFactor);
}
*out_bestF = result["BEST_F"].cast<float>();
*out_bestX = result["BEST_X"].cast<float>();
*out_bestY = result["BEST_Y"].cast<float>();
*out_iterations = result["ITERATIONS"].cast<int>();
out_eps = result["EPS"].cast<float>();
out_micros = result["TIME"].cast<float>();
std::vector<float> angles_vec = result["ANGLES"].cast<std::vector<float>>();
std::vector<float> lengths_vec = result["LENGTHS"].cast<std::vector<float>>();
const size_t n = static_cast<size_t>(n_seg);
if (out_angles) {
const size_t m = angles_vec.size();
const size_t limit = m < n ? m : n;
const float __restrict src = angles_vec.data();
size_t i = 0;
#pragma loop ivdep
while (i < limit) {
out_angles[i] = src[i];
++i;
}
}
if (out_lengths) {
if (var_len) {
const size_t mL = lengths_vec.size();
const size_t limitL = mL < n ? mL : n;
const float __restrict srcL = lengths_vec.data();
size_t j = 0;
#pragma loop ivdep
while (j < limitL) {
out_lengths[j] = srcL[j];
++j;
}
}
else {
const float one = 1.0f;
size_t j = 0;
#pragma loop ivdep
while (j < n) {
out_lengths[j] = one;
++j;
}
}
}
}" - я продолжаю работать над этим кодом спустя месяц перерыва и не помню на чём последний раз остановился - найди в этом коде концептуальные ошибки или то что точно работает неправильно или не имеет смысла, рассматривай именно общий многомерный алгоритм и то, что с ним связано, маломерные алгоритмы уже отлажены

Answer 1

Ниже — вещи, которые в многомерной ветке (MortonND + agp_run_branch_mpi + генерация сидов/интервалов) выглядят как гарантированные ошибки / UB / выходы за границы или как концептуально бессмысленные места, которые почти наверняка ломают работу (даже если «иногда вроде едет»).

1) `evalAt()` в `agp_run_branch_mpi` возвращает значение не всегда (UB)

cpp
auto evalAt = [&](const float t) -> float {
    map.map01ToPoint(t, q_local);
    float f = cost(q_local, x, y);
    if (f < fmaf(bestF, adaptive_coeff, 0.0f)) {
        ...
        return f;
    }
};

Если условие не выполнено — нет return. Это undefined behavior: наружный код использует f_a = evalAt(a), f_b = evalAt(b), и т.д., и периодически будет получать мусор/NaN/старое значение регистра.

Фикс: всегда возвращать f в конце.

cpp
auto evalAt = [&](float t) -> float {
    map.map01ToPoint(t, q_local);
    float f = cost(q_local, x, y);
    if (f < fmaf(bestF, adaptive_coeff, 0.0f)) {
        ... // локальный улучшатель
    }
    return f;
};

2) Ветка “последняя итерация” никогда не выполняется (логическая ошибка)

Внутри:

cpp
while (it < maxIter) {
   ...
   if (it == maxIter) { ... return; }
   ...
   ++it;
}

it == maxIter никогда не наступит внутри while(it < maxIter). Если ты рассчитывал в конце сделать “финальный локальный тюнинг” — он сейчас мертвый код.

Фиксы (любой):

if (it == maxIter - 1) { ... }
или вынести “финальную обработку” после цикла.

3) Критично: `t` часто выходит за [0,1], а `map01ToPoint()` это не переживает

Где это происходит точно

Сиды:

cpp
const float t1 = t_seed - interval_size;
const float t2 = t_seed + interval_size;

t_seed может быть около 0 или 1 → t1 < 0 или t2 > 1.

“Stagnation”:

cpp
const float t_seed = t_best + off; // off = ±adaptive_diameter*2

Вообще step(...) теоретически должен давать точку внутри интервала, но у тебя там много спец-случаев/аппроксимаций — и даже небольшой вылет за границу на float возможен.

Почему это ломает всё

map01ToPoint() делает:

cpp
const float scaled = t * baseC;
unsigned long long idxc = (scaled >= baseC) ? baseC-1 : (unsigned long long)scaled;

Если t < 0, то scaled отрицательный, условие scaled >= baseC ложно, и происходит приведение отрицательного float к unsigned long long → гигантское число → _pext_u64/маски → мусорная точка в пространстве. Это не “чуть хуже”, это полный разнос.

Минимальный фикс: в каждом месте, где вычисляешь t, делать clamp.

cpp
auto clamp01 = [](float t){
    return fminf(fmaxf(t, 0.0f), 0x1.fffffep-1f); // чуть меньше 1.0f
};
tNew = clamp01(tNew);
t1   = clamp01(t1);
t2   = clamp01(t2);

Правильнее: clamp внутри evalAt() (как последний барьер), чтобы вообще никакой путь не мог вызвать map01ToPoint() с вылетом.

4) `MortonND` с `chunks>1`: текущая логика разбиения на чанки концептуально неправильна

Это, на мой взгляд, самая “системная” проблема.

Что сейчас делается

В map01ToPoint:

cpp
for c in chunks:
    baseC = 1 << (dim * Lc);
    idxc = floor(t * baseC);   // !!! одинаковая формула для каждого чанка
    (опционально gray)
    извлекаем биты idxc и дописываем accBits[pd] = (accBits[pd] << Lc) | bits;

Если Lc одинаковые для чанков (а у тебя обычно так и будет: eff_levels почти везде одинаковый), то baseC одинаковое → idxc одинаковый для всех чанков → ты фактически дописываешь в каждый следующий блок уровней те же самые биты, только переинтерпретированные масками. Это не извлечение “следующих разрядов” параметра t, это повторение/алиасинг.

Как это должно выглядеть, если t — дробь в [0,1)

Нужно извлекать “цифры” по основанию baseC последовательно через остаток:

cpp
double u = t;
for each chunk:
    u *= baseC;
    idxc = (uint64_t)u;
    if (idxc >= baseC) idxc = baseC-1;
    u -= (double)idxc;   // остаток, который идет в следующий чанк

Тогда каждый чанк получает свои разряды.

Вторая часть проблемы: `pointToT()` игнорирует чанки

pointToT() возвращает t только из первого чанка (bitsCoarse = chunk_bits[0]) и в конце:

cpp
return (idx0 + 0.5f) / scale;

То есть при chunks>1:

map01ToPoint() пытается использовать много уровней,
pointToT() инвертирует только верхушку,
генерация сидов/локализация по t_best и т.п. начинает жить в “урезанном t-пространстве”.

Если ты реально хочешь levels > eff_levels, то нужно чинить пару map01ToPoint/pointToT как согласованный энкодер/декодер. Иначе проще честно ограничить levels <= eff_levels и не делать чанки вообще.

5) Генерация LHS сидов гарантированно может выходить за пределы фиксированных буферов

5.1 `permutations[32][256]` и `seeds[256*32]`

В generate_lhs_seeds_lite:

cpp
alignas(16) int permutations[32][256];
...
while (s < ns) permutations[d][s] = s;

ns у тебя вычисляется полиномом от (dim-1) и не ограничен 256.

Если ns > 256 → OOB write в permutations.
В agp_run_branch_mpi сиды хранятся в float seeds[256*32], а total_seeds = 3 + ns. Если total_seeds > 256 → тоже OOB.

Фикс: жесткий clamp.

cpp
ns = std::min(ns, 256);

И лучше ещё ограничить итоговый total_seeds по факту доступного буфера.

5.2 `dim` может быть > 32, а у тебя везде статические массивы на 32

Ты системно используешь:

alignas(16) float q_local[32]; phi[32]; s_arr[32]; ...
BestSolutionMsg.bestQ[32]
seeds[256*32] со stride 32
в ManipCost тоже массивы на 32

Но dim = n + (varLen ? n : 0).
Если nSegments > 16 при varLen=true → dim > 32 и всё начинает писать мимо.

Если ты гарантируешь nSegments <= 16 при varLen, это должно быть явно проверено в API и в рантайме:

cpp
if (dim > 32) { /* вернуть ошибку / обрезать / перейти на динамические буферы */ }

6) `pointToT()` не клипует координаты в допустимый диапазон клеток

В pointToT:

cpp
const float v = (q[pd] - baseOff[pd]) * invStep[pd];
const long long cell = round(v);
...
unsigned long long b = (unsigned long long)cell >> (bitsFull - bitsCoarse);

Если q чуть вне диапазона (а у сидов/эвристик/локального шага такое реально возможно из-за float), cell может стать отрицательным или > max. Дальше каст в unsigned даёт гигантское значение → мусорный t.

Фикс: clamp cell в [0, (1<<bitsFull)-1] перед кастом.

7) В `generate_heuristic_seeds` есть бессмысленный/вредный тест с `sin(tx)`

Ты делаешь:

cpp
FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;

sin_phi здесь — это sin(tx), и условие “если sin(tx) != 0” вообще не связано с тем, можно ли вычислять atan2(ty,tx).

В результате при tx ≈ k*pi ты внезапно принудительно ставишь phi=0, хотя atan2 корректен почти всегда (кроме случая tx=0 && ty=0, который надо проверять по rho).

Фикс:

cpp
const float rho = hypotf(tx, ty);
const float phi = (rho > 0.0f) ? atan2f(ty, tx) : 0.0f;

8) “достигнутая точность” в ND сейчас измеряется в длине интервала по `t`, а не в диаметре в пространстве

Ты пишешь:

cpp
const float interval_len = dim > 1 ? (top->x2 - top->x1) : top->diam;
out_achieved_epsilon = interval_len;

Для ND смысл “ε” обычно должен быть про геометрию в исходном пространстве, т.е. diam, а не длину параметра на кривой, потому что одна и та же длина по t может давать разные пространственные диаметры (особенно если кривая/мэппинг неравномерны или если чанки сломаны).

Если ты реально хотел ε именно в параметре t — тогда IntervalND::diam и set_metric(diam) начинают быть концептуально “про другое”, и алгоритм становится внутренне несогласованным.

9) Бонус: строгая алиасинг-UB в “битовых трюках” (в ND тоже используется)

Например:

cpp
const unsigned sign_mask = ((*reinterpret_cast<const unsigned*>(&diff)) & 0x80000000u) ^ 0x80000000u;
const float sign_mult = *reinterpret_cast<const float*>(&sign_mask);

Это формально UB в C++ (strict aliasing). На MSVC часто “прокатывает”, но если включены агрессивные оптимизации/векторизация — может стрелять редко и мерзко.

Нормальный фикс: std::bit_cast (C++20) или memcpy.

Что бы я чинил в первую очередь (если цель — “чтобы ND вообще стал корректным”)

Сделать return f; в evalAt() (иначе всё бессмысленно).
Клиповать t в [0,1) в evalAt() + в генерации сидов.
Починить chunk-логику MortonND (или запретить levels > eff_levels до починки).
Жёстко ограничить dim <= 32 (или переписать все статические буферы на динамику).
Ограничить ns <= 256 и общий seedCnt по размеру буфера.
Подправить pointToT() (clamp cell), убрать странный тест с sin(tx).

Если хочешь — я могу предложить конкретный “безболезненный” патч на MortonND::map01ToPoint() под твою chunk-схему (с остатком u), который минимально ломает остальной код, и отдельно — как адаптировать pointToT() так, чтобы он работал хотя бы на уровне тех же чанков (или честно возвращал только coarse и тогда не использовать его для сидов).

"#include "pch.h" #define XOR_RAND(state, result_...

Pergunta

Pensamento

Identifying conceptual errors in user code

Identifying bug in evalAt function

Identifying logical errors and potential issues

Clarifying HoaraSort behavior

Identifying potential bugs in heuristics and seed generation

Identifying issues with series approximation and seed generation

Identifying issues with seed and stack overflow

Identifying bugs with clamping in map01ToPoint

Identifying conceptual bugs with clamping and chunk-bit calculations

Revising resolution handling and chunk separation logic

Analyzing multi-chunk mapping and indexing issues

Identifying critical issues in multi-dimensional algorithm

Resposta

1) `evalAt()` в `agp_run_branch_mpi` возвращает значение не всегда (UB)

2) Ветка “последняя итерация” никогда не выполняется (логическая ошибка)

3) Критично: `t` часто выходит за [0,1], а `map01ToPoint()` это не переживает

Где это происходит точно

Почему это ломает всё

4) `MortonND` с `chunks>1`: текущая логика разбиения на чанки концептуально неправильна

Что сейчас делается

Как это должно выглядеть, если t — дробь в [0,1)

Вторая часть проблемы: `pointToT()` игнорирует чанки

5) Генерация LHS сидов гарантированно может выходить за пределы фиксированных буферов

5.1 `permutations[32][256]` и `seeds[256*32]`

5.2 `dim` может быть > 32, а у тебя везде статические массивы на 32

6) `pointToT()` не клипует координаты в допустимый диапазон клеток

7) В `generate_heuristic_seeds` есть бессмысленный/вредный тест с `sin(tx)`

8) “достигнутая точность” в ND сейчас измеряется в длине интервала по `t`, а не в диаметре в пространстве

9) Бонус: строгая алиасинг-UB в “битовых трюках” (в ND тоже используется)

Что бы я чинил в первую очередь (если цель — “чтобы ND вообще стал корректным”)

Compartilhar esta Q&A

Pergunta

Pensamento

Identifying conceptual errors in user code

Identifying bug in evalAt function

Identifying logical errors and potential issues

Clarifying HoaraSort behavior

Identifying potential bugs in heuristics and seed generation

Identifying issues with series approximation and seed generation

Identifying issues with seed and stack overflow

Identifying bugs with clamping in map01ToPoint

Identifying conceptual bugs with clamping and chunk-bit calculations

Revising resolution handling and chunk separation logic

Analyzing multi-chunk mapping and indexing issues

Identifying critical issues in multi-dimensional algorithm

Resposta

1) evalAt() в agp_run_branch_mpi возвращает значение не всегда (UB)

2) Ветка “последняя итерация” никогда не выполняется (логическая ошибка)

3) Критично: t часто выходит за [0,1], а map01ToPoint() это не переживает

Где это происходит точно

Почему это ломает всё

4) MortonND с chunks>1: текущая логика разбиения на чанки концептуально неправильна

Что сейчас делается

Как это должно выглядеть, если t — дробь в [0,1)

Вторая часть проблемы: pointToT() игнорирует чанки

5) Генерация LHS сидов гарантированно может выходить за пределы фиксированных буферов

5.1 permutations[32][256] и seeds[256*32]

5.2 dim может быть > 32, а у тебя везде статические массивы на 32

6) pointToT() не клипует координаты в допустимый диапазон клеток

7) В generate_heuristic_seeds есть бессмысленный/вредный тест с sin(tx)

8) “достигнутая точность” в ND сейчас измеряется в длине интервала по t, а не в диаметре в пространстве

9) Бонус: строгая алиасинг-UB в “битовых трюках” (в ND тоже используется)

Что бы я чинил в первую очередь (если цель — “чтобы ND вообще стал корректным”)

Compartilhar esta Q&A

1) `evalAt()` в `agp_run_branch_mpi` возвращает значение не всегда (UB)

3) Критично: `t` часто выходит за [0,1], а `map01ToPoint()` это не переживает

4) `MortonND` с `chunks>1`: текущая логика разбиения на чанки концептуально неправильна

Вторая часть проблемы: `pointToT()` игнорирует чанки

5.1 `permutations[32][256]` и `seeds[256*32]`

5.2 `dim` может быть > 32, а у тебя везде статические массивы на 32

6) `pointToT()` не клипует координаты в допустимый диапазон клеток

7) В `generate_heuristic_seeds` есть бессмысленный/вредный тест с `sin(tx)`

8) “достигнутая точность” в ND сейчас измеряется в длине интервала по `t`, а не в диаметре в пространстве