"#include "pch.h" #define XOR_RAND(state, result_...

Question

"#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) r = 6.28318530718f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next; const uint8_t dx; const uint8_t dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q; const uint8_t next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }
};

__declspec(align(16)) struct Slab final {
char* const base;
char* current;
char* const end;
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 64u);
Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) { *p = 0; p += 4096u; }
return slab;
});

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
: levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) {
}
};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
__forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
: x1(_x1), x2(_x2), y1(_y1), y2(_y2),
delta_y(_y2 - _y1),
ordinate_factor(-(y1 + y2) * 2.0f),
N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)),
quadratic_term((1.0f / N_factor)* delta_y* delta_y),
M((1.0f / N_factor)* fabsf(delta_y)) {
}
__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept { return a->R < b->R; }

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0;
const int limit = (int)(n & ~7u);
if (n >= 8u) {
while (i < (size_t)limit) {
__declspec(align(32)) float q[8], nf[8], ord[8];
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; }
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(ord);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
__declspec(align(32)) float out[8];
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
i += 8u;
}
}
while (i < n) { arr[i]->ChangeCharacteristic(m); ++i; }
}

__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);
size_t i = 0;
const int limit = (int)(n & ~7u);
if (n >= 8u) {
while (i < (size_t)limit) {
__declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8];
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; len[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; }
const __m256 vlen = _mm256_load_ps(len);
const __m256 vM = _mm256_load_ps(Mv);
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(ord);
const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
__declspec(align(32)) float out[8];
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
i += 8u;
}
}
while (i < n) {
const Interval* p = arr[i];
const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

__forceinline float fast_pow_int(float v, int n) {
float r;
switch (n) {
case 1: r = v; break;
case 2: r = v * v; break;
case 3: { float v2 = v * v; r = v2 * v; } break;
case 4: { float v2 = v * v; r = v2 * v2; } break;
case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
case 10: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
case 11: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
case 12: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
case 13: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
case 14: { float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
default: { float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
}
return r;
}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));
const int Ni = (int)(_N + 0.5f);
if (Ni == 1) return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
if (Ni == 2) return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
const float invmN = 1.0f / fast_pow_int(_m, Ni);
const float dN = fast_pow_int(fabsf(diff), Ni);
return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

struct MortonCachePerRank { std::vector<int> permCache; std::vector<uint64_t> invMaskCache; uint32_t baseSeed; };
static MortonCachePerRank g_mc;

struct MortonND final {
int dim;
int levels;
uint64_t scale;
std::vector<float> low, high;
std::vector<float> step, baseOff;
std::vector<int> perm;
std::vector<uint64_t> invMask;
std::vector<uint64_t> pextMask;
float invScaleLevel;
__forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
: dim(D), levels(L), scale(1ull), low(lows, lows + D), high(highs, highs + D),
step(D, 0.0f), baseOff(D, 0.0f), perm(mc.permCache.begin(), mc.permCache.begin() + D),
invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), pextMask(D, 0ull), invScaleLevel(0.0f)
{
uint64_t level_mask = (1ull << L) - 1ull;
for (int k = 0; k < D; ++k) invMask[k] &= level_mask;
int maxbits = 63;
int need = L * D;
if (need > maxbits) { levels = maxbits / D; if (levels < 1) levels = 1; }
scale = 1ull << (uint64_t)(levels * D);
invScaleLevel = 1.0f / (float)((uint64_t)1 << levels);
for (int d = 0; d < dim; ++d) {
float rng = high[d] - low[d];
float st = rng * invScaleLevel;
step[d] = st;
baseOff[d] = fmaf(0.5f, st, low[d]);
}
for (int d = 0; d < dim; ++d) {
uint64_t m = 0ull;
uint64_t bitpos = (uint64_t)d;
for (int b = 0; b < levels; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; }
pextMask[d] = m;
}
}
__forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
if (t < 0.0f) t = 0.0f; else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f);
uint64_t idx = (uint64_t)((double)t * (double)scale);
if (idx >= scale) idx = scale - 1;
for (int d = 0; d < dim; ++d) {
const int pd = perm[d];
uint64_t bits = _pext_u64(idx, pextMask[d]);
if (invMask[pd]) bits ^= invMask[pd];
out[pd] = fmaf(step[pd], (float)bits, baseOff[pd]);
}
}
__forceinline uint64_t clamp_bits(int64_t v, int bits) const noexcept {
if (v < 0) v = 0;
int64_t maxv = ((int64_t)1 << bits) - 1;
if (v > maxv) v = maxv;
return (uint64_t)v;
}

text
__forceinline float pointToT(const float* __restrict q) const noexcept {
	const int bits = levels;
	uint64_t idx = 0ull;

#if defined(BMI2) || defined(_MSC_VER)
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) / step[pd];
int64_t cell = llroundf(v);
uint64_t b = clamp_bits(cell, bits);
if (invMask[pd]) b ^= invMask[pd];
#if defined(GNUC) || defined(clang)
idx |= _pdep_u64(b, pextMask[d]);
#else
idx |= _pdep_u64(b, pextMask[d]);
#endif
}
#else
// Fallback без BMI2
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) / step[pd];
uint64_t b = clamp_bits(llroundf(v), bits);
if (invMask[pd]) b ^= invMask[pd];
uint64_t bitpos = (uint64_t)d;
for (int k = 0; k < bits; ++k) {
uint64_t bit = (b >> k) & 1ull;
idx |= (bit << bitpos);
bitpos += (uint64_t)dim;
}
}
#endif
return ((float)idx + 0.5f) / (float)scale;
}
};

struct ManipCost final {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;
__forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {
}
__forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
const float* th = q;
const float* L = variableLen ? (q + n) : nullptr;
__declspec(align(64)) float phi[16];
__declspec(align(64)) float s_arr[16];
__declspec(align(64)) float c_arr[16];
float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f;
for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
const float Lc = 1.0f;
if (variableLen) {
for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); }
}
else {
for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); }
}
for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); float v = minTheta - ai; float u = v > 0.0f ? v : 0.0f; penC = fmaf(u, u, penC); }
float dx = x - targetX;
float dy = y - targetY;
float dist = sqrtf(fmaf(dx, dx, dy * dy));
out_x = x; out_y = y;
return dist + penC;
}
};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
uint32_t idx = (uint32_t)(norm * (float)scale);
idx = idx > scale_minus_1 ? scale_minus_1 : idx;
float sx = lenx, sy = leny;
float x1 = a, x2 = c;
uint8_t type = start;
int l = levels - 1;
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b);
const float clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx, sy = leny;
float x0 = a, y0 = c;
uint32_t idx = 0u;
uint8_t type = start;
int l = 0;
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;
const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next;
idx = (idx << 2) | inv.q;
const uint32_t dx = dd >> 1;
const uint32_t dy = dd & 1u;
x0 += dx ? sx : 0.0f;
y0 += dy ? sy : 0.0f;
++l;
}
const float scale_recip = 1.0f / (float)scale;
return fmaf((float)idx * scale_recip, lenx, a);
}

// Структура для трех интервалов (простая, без конструкторов/деструкторов)
__declspec(align(16)) struct MultiCrossMsg final {
float intervals[15]; // [s_x1, s_x2, e_x1, e_x2, Rtop] × 3 интервала
uint8_t count;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& intervals& count;
}

};

// Структура для лучшего решения (простая, без векторов)
__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX, bestY;
float bestQ[32]; // Фиксированный массив
uint8_t dim;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& bestF& bestX& bestY& bestQ& dim;
}

};

// Общая структура для многомерного MPI (совместима с двумерной логикой)
__declspec(align(16)) struct CtrlMsgND final {
uint8_t kind; // 0=term, 1=single, 2=multi, 3=best
CrossMsg xchg;
MultiCrossMsg multiXchg;
BestSolutionMsg bestSol;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& kind;
	if (kind == 1) ar& xchg;
	else if (kind == 2) ar& multiXchg;
	else if (kind == 3) ar& bestSol;
}

};

static __forceinline void generate_heuristic_seeds(const ManipCost& cost, int dim, std::vector<std::vector<float>>& seeds) {
seeds.clear();
int n = cost.n;
bool variableLen = cost.variableLen;
float targetX = cost.targetX, targetY = cost.targetY;

text
// БЫСТРОЕ КИНЕМАТИЧЕСКОЕ СЕМЯ (по совету chat01ai)
std::vector<float> kinematic_seed(dim, 0.0f);
float phi = atan2f(targetY, targetX); // φ = atan2(ty,tx)
float rho = sqrtf(fmaf(targetX, targetX, targetY * targetY)); // ρ = √(tx²+ty²)

// Углы: θ_i = φ/n (все звенья направлены на цель)
for (int i = 0; i < n; ++i) {
	kinematic_seed[i] = phi / n;
}

// Длины (если включены): min(max(|ρ|/n, 0.1), 2.0)
if (variableLen) {
	float len_est = rho / n;
	len_est = fminf(fmaxf(len_est, 0.1f), 2.0f); // clamp [0.1, 2.0]
	for (int i = 0; i < n; ++i) {
		kinematic_seed[n + i] = len_est;
	}
}
seeds.push_back(kinematic_seed);

// Оригинальные эвристики (сохраняем для разнообразия)

// Эвристика 1: "Змеевидная" конфигурация
std::vector<float> seed2(dim, 0.0f);
for (int i = 0; i < n; ++i) {
	seed2[i] = phi * (i % 2 == 0 ? 1.0f : -1.0f) * 0.5f;
}
if (variableLen) {
	float len_est = rho / n;
	for (int i = 0; i < n; ++i) {
		seed2[n + i] = len_est * (0.8f + 0.4f * (float)i / n);
	}
}
seeds.push_back(seed2);

// Эвристика 2: Постепенное наведение
std::vector<float> seed3(dim, 0.0f);
for (int i = 0; i < n; ++i) {
	float progress = (float)i / (n - 1);
	seed3[i] = phi * (1.0f - progress * 0.3f);
}
if (variableLen) {
	float base_len = rho / n;
	for (int i = 0; i < n; ++i) {
		seed3[n + i] = base_len * (1.0f + 0.2f * sinf((float)i * 1.5f));
	}
}
seeds.push_back(seed3);

}

// Модифицируем функцию agp_run_branch_mpi для использования априорной оценки
static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY, float M_prior = 1e-3f) {
const int dim = cost.n + (cost.variableLen ? cost.n : 0);
__declspec(align(64)) float q_local[16];
bestQ.reserve(dim);
float x, y;
int no_improve = 0;

text
auto evalAt = [&](float t) -> float {
	map.map01ToPoint(t, q_local);
	float f = cost(q_local, x, y);

	// Локальный щелчок градиентом (малый бюджет)
	if (f < bestF * 1.05f) {
		const int n = cost.n;
		const float* th = q_local;
		const float* L = cost.variableLen ? (q_local + n) : nullptr;

		alignas(64) float phi[16], s_arr[16], c_arr[16];
		float phi_acc = 0.0f;
		for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
		FABE13_SINCOS(phi, s_arr, c_arr, n);

		alignas(64) float sum_s[16], sum_c[16];
		float acc_s = 0.0f, acc_c = 0.0f;
		for (int k = n - 1; k >= 0; --k) {
			float Lk = cost.variableLen ? L[k] : 1.0f;
			acc_s += Lk * s_arr[k];
			acc_c += Lk * c_arr[k];
			sum_s[k] = acc_s;
			sum_c[k] = acc_c;
		}

		const float dx = x - cost.targetX;
		const float dy = y - cost.targetY;
		const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1e-8f;

		float eta = 0.1f;
		for (int step = 0; step < 2; ++step) {
			alignas(64) float q_try[16];

			for (int i = 0; i < n; ++i) {
				float dxi = -sum_s[i];
				float dyi = sum_c[i];
				float gpen = 0.0f;
				float ai = fabsf(th[i]);
				float v = cost.minTheta - ai;
				if (v > 0.0f) gpen = -2.0f * copysignf(v, th[i]);
				float grad = (dx * dxi + dy * dyi) / dist + gpen;
				q_try[i] = th[i] - eta * grad;
				if (q_try[i] < -3.14159265359f) q_try[i] = -3.14159265359f;
				else if (q_try[i] > 3.14159265359f) q_try[i] = 3.14159265359f;
			}

			if (cost.variableLen) {
				for (int i = 0; i < n; ++i) {
					float grad = (dx * c_arr[i] + dy * s_arr[i]) / dist;
					q_try[n + i] = L[i] - eta * grad;
					if (q_try[n + i] < 0.1f) q_try[n + i] = 0.1f;
					else if (q_try[n + i] > 2.0f) q_try[n + i] = 2.0f;
				}
			}

			float x2, y2;
			float f2 = cost(q_try, x2, y2);
			if (f2 < f) {
				memcpy(q_local, q_try, dim * sizeof(float));
				f = f2;
				x = x2;
				y = y2;
				break;
			}
			eta *= 0.5f;
		}
	}

	if (f < bestF) {
		bestF = f;
		bestQ.assign(q_local, q_local + dim);
		bestX = x;
		bestY = y;
		no_improve = 0;
	}
	else {
		no_improve++;
	}
	return f;
	};

float a = 0.0f, b = 1.0f;
float f_a = evalAt(a);
float f_b = evalAt(b);

const int K = (std::min)((std::max)(2 * dim, 8), 128);
H.clear();
H.reserve((size_t)maxIter + K + 16);

// ИСПОЛЬЗУЕМ АПРИОРНУЮ ОЦЕНКУ КАК НИЖНИЙ BOUND ДЛЯ Mmax
float Mmax = M_prior;
const int rank = g_world->rank();
const int world = g_world->size();

// БЫСТРОЕ ДОБАВЛЕНИЕ КИНЕМАТИЧЕСКОГО СЕМЕНИ ПЕРЕД ОСНОВНОЙ ИНИЦИАЛИЗАЦИЕЙ
std::vector<std::vector<float>> heuristic_seeds;
generate_heuristic_seeds(cost, dim, heuristic_seeds);

// Приоритетно добавляем кинематическое семя (первое в списке)
if (!heuristic_seeds.empty()) {
	const auto& kinematic_seed = heuristic_seeds[0];
	float t_seed = map.pointToT(kinematic_seed.data());

	// Создаем узкий интервал вокруг кинематического семени
	float t1 = fmaxf(a, t_seed - 0.002f);
	float t2 = fminf(b, t_seed + 0.002f);
	if (t2 > t1) {
		__declspec(align(64)) float q1[32], q2[32];
		float x1, y1, x2, y2;
		map.map01ToPoint(t1, q1);
		float f1 = cost(q1, x1, y1);
		map.map01ToPoint(t2, q2);
		float f2 = cost(q2, x2, y2);

		Interval* kinematic_interval = new Interval(t1, t2, f1, f2, (float)dim);
		Mmax = (std::max)(Mmax, kinematic_interval->M);

		// Приоритетный R для быстрого исследования
		kinematic_interval->ChangeCharacteristic(r * Mmax);
		kinematic_interval->R *= 0.5f; // Высокий приоритет

		H.emplace_back(kinematic_interval);

		// Обновляем лучшее решение если нашли
		if (f1 < bestF) {
			bestF = f1;
			bestQ.assign(q1, q1 + dim);
			bestX = x1;
			bestY = y1;
		}
		if (f2 < bestF) {
			bestF = f2;
			bestQ.assign(q2, q2 + dim);
			bestX = x2;
			bestY = y2;
		}
	}
}

float prev_t = a;
float prev_f = f_a;
for (int k = 1; k <= K; ++k) {
	float t = a + (b - a) * ((float)k / (K + 1)) + (float)rank / (float)(world * (K + 1));
	float f = evalAt(t);
	Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim);
	Mmax = (std::max)(Mmax, I->M);
	I->ChangeCharacteristic(r * Mmax);
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtr);
	prev_t = t;
	prev_f = f;
}

Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim);
Mmax = (std::max)(Mmax, tail->M);
float m = r * Mmax;
tail->ChangeCharacteristic(m);
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtr);

// ДОБАВЛЯЕМ ОСТАЛЬНЫЕ ЭВРИСТИКИ (если есть место)
if (!H.empty() && heuristic_seeds.size() > 1) {
	for (size_t i = 1; i < heuristic_seeds.size(); ++i) {
		const auto& seed_q = heuristic_seeds[i];
		float t_seed = map.pointToT(seed_q.data());
		float t1 = fmaxf(a, t_seed - 0.001f);
		float t2 = fminf(b, t_seed + 0.001f);
		if (t2 > t1) {
			float f1 = evalAt(t1);
			float f2 = evalAt(t2);
			Interval* seed_interval = new Interval(t1, t2, f1, f2, (float)dim);
			Mmax = (std::max)(Mmax, seed_interval->M);
			seed_interval->ChangeCharacteristic(m);
			seed_interval->R *= 0.7f;
			H.emplace_back(seed_interval);
			std::push_heap(H.begin(), H.end(), ComparePtr);
		}
	}
}

float dmax = b - a;
const float initial_len = dmax;
const float thr03 = 0.3f * initial_len;
const float inv_thr03 = 1.0f / thr03;
int it = 0;

while (it < maxIter) {
	if (it % 100 == 0 && no_improve > 50 && !bestQ.empty()) {
		float t_best = map.pointToT(bestQ.data());
		for (int i = 0; i < 2; ++i) {
			float offset = (i == 0) ? 0.01f : -0.01f;
			float t_seed = fmaxf(a, fminf(b, t_best + offset));
			float f_seed = evalAt(t_seed);
			Interval* new_seed = new Interval(t_seed - 0.005f, t_seed + 0.005f, f_seed, f_seed, (float)dim);
			new_seed->ChangeCharacteristic(m);
			new_seed->R *= 0.8f;
			H.emplace_back(new_seed);
			std::push_heap(H.begin(), H.end(), ComparePtr);
		}
		no_improve = 0;
	}

	const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
	bool stagnation = (no_improve > 100) && (it > 270);
	float r_eff = fmaxf(1.0f, r * (0.7f + 0.3f * (1.0f - p)));
	m = r_eff * Mmax;
	float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);

	// Обработка входящих сообщений (тег 0 как в двумерном)
	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		CtrlMsgND in;
		g_world->recv(boost::mpi::any_source, 0, in);

		if (in.kind == 0) { // term
			if (!rank) break; else return;
		}
		else if (in.kind == 1) { // single cross
			float sx = in.xchg.s_x1, ex = in.xchg.e_x1;
			if (sx < 0.0f) sx = 0.0f;
			if (ex > 1.0f) ex = 1.0f;
			if (ex > sx) {
				__declspec(align(64)) float tmp[16];
				float tmp_x, tmp_y;
				map.map01ToPoint(sx, tmp);
				float y1 = cost(tmp, tmp_x, tmp_y);
				map.map01ToPoint(ex, tmp);
				float y2 = cost(tmp, tmp_x, tmp_y);
				Interval* inj = new Interval(sx, ex, y1, y2, (float)dim);
				inj->ChangeCharacteristic(m);
				Interval* top = H.front();
				if (inj->R > 1.15f * top->R) {
					inj->R = in.xchg.Rtop * k;
					H.emplace_back(inj);
					std::push_heap(H.begin(), H.end(), ComparePtr);
				}
			}
		}
		else if (in.kind == 2) { // multi cross (НОВОЕ: 3 интервала)
			const MultiCrossMsg& multi = in.multiXchg;
			for (uint8_t i = 0; i < multi.count; ++i) {
				const float* interval_data = &multi.intervals[i * 5];
				float sx = interval_data[0], ex = interval_data[2];
				if (sx < 0.0f) sx = 0.0f;
				if (ex > 1.0f) ex = 1.0f;
				if (ex > sx) {
					__declspec(align(64)) float tmp[16];
					float tmp_x, tmp_y;
					map.map01ToPoint(sx, tmp);
					float y1 = cost(tmp, tmp_x, tmp_y);
					map.map01ToPoint(ex, tmp);
					float y2 = cost(tmp, tmp_x, tmp_y);
					Interval* inj = new Interval(sx, ex, y1, y2, (float)dim);
					inj->ChangeCharacteristic(m);
					Interval* top = H.front();
					if (inj->R > 1.15f * top->R) {
						inj->R = interval_data[4] * k;
						H.emplace_back(inj);
						std::push_heap(H.begin(), H.end(), ComparePtr);
					}
				}
			}
		}
		else if (in.kind == 3) { // best solution (НОВОЕ: лучшее решение)
			const BestSolutionMsg& bestMsg = in.bestSol;
			if (bestMsg.bestF < bestF * 1.15f) {
				// Создаем узкий интервал вокруг лучшей точки
				__declspec(align(64)) float tmp_q[32];
				memcpy(tmp_q, bestMsg.bestQ, bestMsg.dim * sizeof(float));
				float t_best = map.pointToT(tmp_q);
				float t1 = fmaxf(a, t_best - 0.001f);
				float t2 = fminf(b, t_best + 0.001f);
				if (t2 > t1) {
					__declspec(align(64)) float tmp1[16], tmp2[16];
					float x1, y1, x2, y2;
					map.map01ToPoint(t1, tmp1);
					float f1 = cost(tmp1, x1, y1);
					map.map01ToPoint(t2, tmp2);
					float f2 = cost(tmp2, x2, y2);

					Interval* seed_interval = new Interval(t1, t2, f1, f2, (float)dim);
					seed_interval->ChangeCharacteristic(m);
					seed_interval->R *= 0.7f;

					H.emplace_back(seed_interval);
					std::push_heap(H.begin(), H.end(), ComparePtr);
				}
				// Обновляем локальное лучшее решение если полученное лучше
				if (bestMsg.bestF < bestF) {
					bestF = bestMsg.bestF;
					bestX = bestMsg.bestX;
					bestY = bestMsg.bestY;
					bestQ.assign(bestMsg.bestQ, bestMsg.bestQ + bestMsg.dim);
				}
			}
		}
	}

	// Основная логика AGP
	std::pop_heap(H.begin(), H.end(), ComparePtr);
	Interval* cur = H.back();
	H.pop_back();
	const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
	float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r);
	tNew = fminf(fmaxf(tNew, a), b);
	float fNew = evalAt(tNew);
	Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim);
	Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim);
	float Mloc = (std::max)(left->M, right->M);

	if (adaptive) {
		float len1 = tNew - x1, len2 = x2 - tNew;
		if (len1 + len2 == dmax) {
			dmax = (std::max)(len1, len2);
			for (auto pI : H) {
				float L = pI->x2 - pI->x1;
				if (L > dmax) dmax = L;
			}
		}
		if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
			if (Mloc > Mmax) {
				Mmax = Mloc;
				m = r_eff * Mmax;
			}
			const float progress = fmaf(-dmax, inv_thr03, 1.0f);
			const float alpha = progress * progress;
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r_eff, -MULT);
			const float GF = fmaf(beta, global_coeff, 0.0f);
			left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha));
			right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha));
			size_t sz = H.size();
			RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtr);
		}
		else {
			if (Mloc > Mmax) {
				Mmax = Mloc;
				m = r_eff * Mmax;
				left->ChangeCharacteristic(m);
				right->ChangeCharacteristic(m);
				if (Mloc >= 1.15f * Mmax) {
					size_t sz = H.size();
					RecomputeR_ConstM_AVX2(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtr);
				}
			}
			else {
				left->ChangeCharacteristic(m);
				right->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (Mloc > Mmax) {
			Mmax = Mloc;
			m = r_eff * Mmax;
			left->ChangeCharacteristic(m);
			right->ChangeCharacteristic(m);
			if (Mloc >= 1.15f * Mmax) {
				size_t sz = H.size();
				RecomputeR_ConstM_AVX2(H.data(), sz, m);
				std::make_heap(H.begin(), H.end(), ComparePtr);
			}
		}
		else {
			left->ChangeCharacteristic(m);
			right->ChangeCharacteristic(m);
		}
	}

	H.push_back(left); std::push_heap(H.begin(), H.end(), ComparePtr);
	H.push_back(right); std::push_heap(H.begin(), H.end(), ComparePtr);
	Interval* top = H.front();
	float interval_len = top->x2 - top->x1;

	// ОТПРАВКА СООБЩЕНИЙ (строго по совету chat01ai)
	const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
	bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1);

	// Основная рассылка: раз в T итераций шлем 3 интервала
	if (!(it % T) || want_term) {
		CtrlMsgND out;

		if (want_term) {
			out.kind = 0; // term
		}
		else {
			// Рассылаем топ-3 интервала вместо одного (НОВОЕ по совету chat01ai)
			out.kind = 2; // multi cross
			out.multiXchg.count = (uint8_t)(H.size() >= 3 ? 3 : H.size());

			// Быстрый поиск топ-3 интервалов по R
			Interval* top1 = H[0];
			Interval* top2 = H[1];
			Interval* top3 = H[2];
			if (H.size() > 3) {
				for (size_t i = 3; i < H.size(); ++i) {
					if (H[i]->R > top1->R) {
						top3 = top2;
						top2 = top1;
						top1 = H[i];
					}
					else if (H[i]->R > top2->R) {
						top3 = top2;
						top2 = H[i];
					}
					else if (H[i]->R > top3->R) {
						top3 = H[i];
					}
				}
			}

			// Заполняем данные для 3 интервалов
			float* dest = out.multiXchg.intervals;
			if (out.multiXchg.count >= 1) {
				dest[0] = top1->x1; dest[1] = 0.0f; dest[2] = top1->x2; dest[3] = 0.0f; dest[4] = top1->R;
			}
			if (out.multiXchg.count >= 2) {
				dest = out.multiXchg.intervals + 5;
				dest[0] = top2->x1; dest[1] = 0.0f; dest[2] = top2->x2; dest[3] = 0.0f; dest[4] = top2->R;
			}
			if (out.multiXchg.count >= 3) {
				dest = out.multiXchg.intervals + 10;
				dest[0] = top3->x1; dest[1] = 0.0f; dest[2] = top3->x2; dest[3] = 0.0f; dest[4] = top3->R;
			}
		}

		for (int i = 0; i < world; ++i) {
			if (i != rank) g_world->isend(i, 0, out);
		}

		if (want_term) break;
	}

	// ДОПОЛНИТЕЛЬНАЯ рассылка: раз в 500 итераций шлем лучшее решение (НОВОЕ по совету chat01ai)
	if (!(it % 500) && !bestQ.empty()) {
		CtrlMsgND out;
		out.kind = 3; // best solution
		out.bestSol.bestF = bestF;
		out.bestSol.bestX = bestX;
		out.bestSol.bestY = bestY;
		out.bestSol.dim = (uint8_t)bestQ.size();
		memcpy(out.bestSol.bestQ, bestQ.data(), bestQ.size() * sizeof(float));

		for (int i = 0; i < world; ++i) {
			if (i != rank) g_world->isend(i, 0, out);
		}
	}

	++it;
}

}

struct BestPacket { float bestF; int dim; float bestX; float bestY; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF) {

text
const int dim = nSegments + (variableLengths ? nSegments : 0);
g_mc.permCache.resize(dim);
for (int i = 0; i < dim; ++i) g_mc.permCache[i] = i;
uint32_t s = g_mc.baseSeed;
for (int i = dim - 1; i > 0; --i) {
	s ^= s << 13; s ^= s >> 17; s ^= s << 5;
	uint32_t j = s % (uint32_t)(i + 1);
	std::swap(g_mc.permCache[i], g_mc.permCache[j]);
}
g_mc.invMaskCache.resize(dim);
for (int k = 0; k < dim; ++k) {
	s ^= s << 13; s ^= s >> 17; s ^= s << 5;
	g_mc.invMaskCache[k] = (uint64_t)s;
}

const float thetaMin = -3.14159265359f;
const float thetaMax = 3.14159265359f;
const float lenMin = 0.1f;
const float lenMax = 2.0f;
std::vector<float> low, high;
low.reserve(dim); high.reserve(dim);
for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); }
if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }

ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);
const int rank = g_world->rank();
const int world = g_world->size();

// ДВУХЭТАПНАЯ ДИСКРЕТИЗАЦИЯ - MAXIMUM SPEED
std::vector<float> bestQ;
float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;

// ЭТАП 1: Coarse - levels0 = min(levels, 8), 20% бюджета
const int levels0 = (std::min)(peanoLevels, 8);
const int maxIter0 = maxIterPerBranch * 0.2f;

MortonND map0(dim, levels0, low.data(), high.data(), g_mc);
std::vector<Interval*> H_coarse;
std::vector<float> bestQ_coarse;
float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;

// ВЫЧИСЛЯЕМ АПРИОРНУЮ ОЦЕНКУ LIPSCHITZ ДЛЯ МАНИПУЛЯТОРА
float M_prior = 0.0f;
if (variableLengths) {
	// Для переменных длин: оценка ||∇θ(x,y)||₂ ≤ 2 * ∑L_i
	float sumLi = nSegments * lenMax; // максимальная сумма длин
	M_prior = 2.0f * sumLi;
}
else {
	// Для фиксированных длин: все L_i = 1.0f
	M_prior = 2.0f * nSegments;
}

// Умножаем на "макс. шаг" карты SFC (домен-aware композиция)
float max_step_sfc = 1.0f / (1u << levels0); // максимальный шаг на текущем уровне SFC
M_prior *= max_step_sfc;

// Добавляем компоненту для производной по длинам если нужно
if (variableLengths) {
	M_prior += 1.41421356237f * max_step_sfc; // √2 * max_step_sfc
}

agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed,
	H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, M_prior); // передаем M_prior

// Сохраняем лучший результат coarse этапа
if (bestF_coarse < bestF) {
	bestF = bestF_coarse;
	bestQ = bestQ_coarse;
	bx = bx_coarse;
	by = by_coarse;
}

// ЭТАП 2: Fine - только если нужно больше детализации
if (levels0 < peanoLevels) {
	MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
	std::vector<Interval*> H_fine;
	std::vector<float> bestQ_fine = bestQ;
	float bestF_fine = bestF, bx_fine = bx, by_fine = by;

	// Пересчитываем M_prior для fine уровня
	float M_prior_fine = 0.0f;
	if (variableLengths) {
		float sumLi = nSegments * lenMax;
		M_prior_fine = 2.0f * sumLi;
	}
	else {
		M_prior_fine = 2.0f * nSegments;
	}
	float max_step_sfc_fine = 1.0f / (1u << peanoLevels);
	M_prior_fine *= max_step_sfc_fine;
	if (variableLengths) {
		M_prior_fine += 1.41421356237f * max_step_sfc_fine;
	}

	// Отбираем топ-интервалы из coarse этапа
	if (!H_coarse.empty()) {
		// Быстрая сортировка лучших интервалов по R
		std::sort(H_coarse.begin(), H_coarse.end(), [](const Interval* a, const Interval* b) {
			return a->R < b->R;
			});

		// Берем топ 30% интервалов для fine этапа
		const size_t topCount = H_coarse.size() * 0.3f;

		// Перемапляем интервалы в fine пространство
		for (size_t i = 0; i < topCount && i < H_coarse.size(); ++i) {
			const Interval* coarseInt = H_coarse[i];

			// Пересчитываем значения функции на границах в fine сетке
			__declspec(align(64)) float q1[32], q2[32];
			float x1, y1, x2, y2;
			map1.map01ToPoint(coarseInt->x1, q1);
			float f1 = cost(q1, x1, y1);
			map1.map01ToPoint(coarseInt->x2, q2);
			float f2 = cost(q2, x2, y2);

			// Создаем новый интервал для fine этапа
			H_fine.push_back(new Interval(coarseInt->x1, coarseInt->x2, f1, f2, (float)dim));

			// Обновляем лучший результат
			if (f1 < bestF_fine) {
				bestF_fine = f1;
				bestQ_fine.assign(q1, q1 + dim);
				bx_fine = x1;
				by_fine = y1;
			}
			if (f2 < bestF_fine) {
				bestF_fine = f2;
				bestQ_fine.assign(q2, q2 + dim);
				bx_fine = x2;
				by_fine = y2;
			}
		}

		// Создаем кучу для fine этапа
		std::make_heap(H_fine.begin(), H_fine.end(), ComparePtr);
	}

	// Запускаем fine этап с оставшимся бюджетом
	agp_run_branch_mpi(map1, cost, maxIterPerBranch - maxIter0, r, adaptiveMode, epsilon,
		seed, H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, M_prior_fine);

	// Используем лучший результат из fine этапа
	if (bestF_fine < bestF) {
		bestF = bestF_fine;
		bestQ = bestQ_fine;
		bx = bx_fine;
		by = by_fine;
	}
}

// Отправка результатов (оригинальный код)
BestPacket me{ bestF, dim, bx, by };
if (!rank) {
	std::vector<float> winnerQ = bestQ;
	float winF = bestF, wx = bx, wy = by;
	for (int i = 1; i < world; ++i) {
		BestPacket bp; g_world->recv(i, 2, bp);
		std::vector<float> qin; g_world->recv(i, 3, qin);
		if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; }
	}
	*out_bestQLen = winnerQ.size();
	*out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
	std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen));
	*out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
}
else {
	g_world->send(0, 2, me);
	g_world->send(0, 3, bestQ);
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
const int rank = g_world->rank();
const int world_size = g_world->size();
if (world_size == 4) {
new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3);
}
g_mc.baseSeed = fmaf(0x9E3779B9u, rank, 0x9E3779B9u);
return rank;
}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0; float st = seed, r1, r2, res = 0.0f;
while (i < 10) {
XOR_RAND(st, r1);
float xp = fmaf(-r1, 10.0f, x);
XOR_RAND(st, r1);
XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
res -= 1.0f / d;
++i;
}
return res;
}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;
while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
__declspec(align(32)) float sv[14u], cv[14u];
FABE13_SINCOS(ang, sv, cv, 14u);
float state = seed, r1, r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(state, r1);
XOR_RAND(state, r2);
res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
--j;
}
return res;
}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u];
FABE13_SINCOS(aj, sj, cj, 8u);
FABE13_SINCOS(ak, sk, ck, 8u);
--j;
float p1 = 0.0f, p2 = 0.0f;
float st = seed, r1, r2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
float s = sj[j] * sj[j];
float c = ck[k] * ck[k];
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p2 = fmaf(-r1, c, fmaf(r2, s, p2));
++k;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R;
Extr.reserve((size_t)global_iterations << 2u);
R.reserve((size_t)global_iterations << 1u);
R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
const float new_value = ShekelFunc(new_point, seed);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
std::pop_heap(R.begin(), R.end(), ComparePtr);
const Interval* pro = R.back();
const float new_x1 = pro->x1;
const float new_x2 = pro->x2;
const float len2 = new_x2 - new_point;
const float len1 = new_point - new_x1;
const float interval_len = len1 < len2 ? len1 : len2;
if (++schetchick == (int)global_iterations || interval_len < epsilon) {
Extr.emplace_back((float)schetchick);
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = (float)CoTaskMemAlloc(sizeof(float) * (out_len));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
return;
}
Interval curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f);
Interval curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f);
const float currM = curr->M > curr1->M ? curr->M : curr1->M;
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r, -MULT);
const float GF = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
const Interval top = R.front();
x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
int no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider;
const float y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition;
const float true_end = b - x_addition;
float x_Rmax_1 = true_start;
float x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
const Interval* __restrict top_ptr;
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && schetchick > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
Interval const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
injected->R = in.xchg.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
}
const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
if (!(++schetchick % T) || stagnation || want_term) {
CtrlMsg out; out.kind = want_term;
if (!out.kind) {
float s_x1, s_x2, e_x1, e_x2;
HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
}
for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); }
if (out.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len))); memcpy(out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
}
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val, new_x2_val;
HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; }
else { ++no_improve; }
std::pop_heap(R.begin(), R.end(), ComparePtr);
Interval const __restrict promejutochny_otrezok = R.back();
const float segment_x1 = promejutochny_otrezok->x1;
const float segment_x2 = promejutochny_otrezok->x2;
const float len2 = segment_x2 - new_point;
const float len1 = new_point - segment_x1;
Interval const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
Interval const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M);
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
}
}

struct RunParams { int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY; p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
const int rank = g_world->rank(); const int world = g_world->size();
for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p; g_world->recv(0, 1, p);
float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
if (q) CoTaskMemFree(q);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() { int i = 1; const int world = g_world->size(); while (i < world) { g_world->isend(i, 1, 0); ++i; } }

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }" - код DLL библиотеки, #pragma once
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <stdint.h>

using namespace System;
using namespace System::Drawing;
using namespace System::Windows::Forms;
using namespace System::Collections::Generic;
using namespace System::Drawing::Drawing2D;

typedef void(__cdecl* P_MANIP)(
int nSegments,
bool variableLengths,
float deltaThetaMax,
float targetX, float targetY,
int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF
);
typedef void(__cdecl* P_FREE)(float*);
typedef int(__cdecl* P_INIT)(int, float, float, float, float);
typedef void(__cdecl* P_WAIT)();
typedef void(__cdecl* P_START)(
int nSegments,
bool variableLengths,
float deltaThetaMax,
float targetX, float targetY,
int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed
);

namespace TESTAGP {

text
public ref class MyForm : public Form
{
public:
	MyForm(HMODULE hLib) : hLib(hLib) {
		this->SetStyle(ControlStyles::AllPaintingInWmPaint | ControlStyles::UserPaint | ControlStyles::OptimizedDoubleBuffer, true);
		this->Text = L"AGP Manipulator 2D - Оптимизация положения захвата";
		this->ClientSize = System::Drawing::Size(1000, 700);
		this->Resize += gcnew EventHandler(this, &MyForm::OnResize);
		fManip = (P_MANIP)GetProcAddress(hLib, "AGP_Manip2D");
		pFree = (P_FREE)GetProcAddress(hLib, "AGP_Free");
		pStart = (P_START)GetProcAddress(hLib, "AgpStartManipND");
		angles = gcnew List<float>(0.0f);
		lengths = gcnew List<float>(0.0f);
		InitUI();
		ResetRandomConfig();
	}

protected:
	~MyForm() {}

private:
	HMODULE  hLib;
	P_MANIP  fManip;
	P_FREE   pFree;
	P_START  pStart;

	int nSegments;
	bool variableLengths;
	List<float>^ angles;
	List<float>^ lengths;

	CheckBox^ cbVarLen;
	NumericUpDown^ nudMinTheta;
	NumericUpDown^ nudBaseLength;
	NumericUpDown^ nudStretchFactor;
	NumericUpDown^ nudTargetX;
	NumericUpDown^ nudTargetY;
	NumericUpDown^ nudLevels;
	NumericUpDown^ nudMaxIter;
	CheckBox^ cbAdaptive;
	NumericUpDown^ nudR;
	NumericUpDown^ nudEps;
	Button^ btnAdd;
	Button^ btnRem;
	Button^ btnOptimize;
	Label^ lblInfo;

	System::UInt32 rngState = 0xA5C39E0Du;

	void InitUI() {
		int y = 10, w = 180, h = 24, pad = 8;
		int currentX = 10;
		Label^ L;
		L = gcnew Label(); L->Text = L"Мин. угол между звеньями (рад)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudMinTheta = gcnew NumericUpDown(); nudMinTheta->Left = currentX; nudMinTheta->Top = y + h + 2; nudMinTheta->Width = w;
		nudMinTheta->DecimalPlaces = 3; nudMinTheta->Minimum = (Decimal)0.01; nudMinTheta->Maximum = (Decimal)3.14159; nudMinTheta->Value = (Decimal)0.5;
		this->Controls->Add(nudMinTheta);
		currentX += w + 20;

		L = gcnew Label(); L->Text = L"Базовая длина звена"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudBaseLength = gcnew NumericUpDown(); nudBaseLength->Left = currentX; nudBaseLength->Top = y + h + 2; nudBaseLength->Width = w;
		nudBaseLength->DecimalPlaces = 2; nudBaseLength->Minimum = (Decimal)0.1; nudBaseLength->Maximum = (Decimal)3.0; nudBaseLength->Value = (Decimal)1.0;
		this->Controls->Add(nudBaseLength);
		currentX += w + 20;

		L = gcnew Label(); L->Text = L"Макс. коэф. растяжения/сжатия"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudStretchFactor = gcnew NumericUpDown(); nudStretchFactor->Left = currentX; nudStretchFactor->Top = y + h + 2; nudStretchFactor->Width = w;
		nudStretchFactor->DecimalPlaces = 2; nudStretchFactor->Minimum = (Decimal)1.0; nudStretchFactor->Maximum = (Decimal)3.0; nudStretchFactor->Value = (Decimal)1.5;
		this->Controls->Add(nudStretchFactor);
		currentX += w + 20;

		cbVarLen = gcnew CheckBox(); cbVarLen->Text = L"Переменные длины звеньев"; cbVarLen->Left = currentX; cbVarLen->Top = y + h + 2; cbVarLen->Width = w; cbVarLen->Checked = false;
		this->Controls->Add(cbVarLen);
		currentX += w + 20;
		y += h * 2 + pad + 10;
		currentX = 10;
		L = gcnew Label(); L->Text = L"Целевая позиция X"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudTargetX = gcnew NumericUpDown(); nudTargetX->Left = currentX; nudTargetX->Top = y + h + 2; nudTargetX->Width = w;
		nudTargetX->DecimalPlaces = 2; nudTargetX->Minimum = (Decimal)-10.0; nudTargetX->Maximum = (Decimal)10.0; nudTargetX->Value = (Decimal)3.5;
		this->Controls->Add(nudTargetX);
		currentX += w + 20;
		L = gcnew Label(); L->Text = L"Целевая позиция Y"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudTargetY = gcnew NumericUpDown(); nudTargetY->Left = currentX; nudTargetY->Top = y + h + 2; nudTargetY->Width = w;
		nudTargetY->DecimalPlaces = 2; nudTargetY->Minimum = (Decimal)-10.0; nudTargetY->Maximum = (Decimal)10.0; nudTargetY->Value = (Decimal)1.0;
		this->Controls->Add(nudTargetY);
		currentX += w + 20;
		L = gcnew Label(); L->Text = L"Глубина развертки"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudLevels = gcnew NumericUpDown(); nudLevels->Left = currentX; nudLevels->Top = y + h + 2; nudLevels->Width = w;
		nudLevels->Minimum = 4; nudLevels->Maximum = 20; nudLevels->Value = 12;
		this->Controls->Add(nudLevels);
		currentX += w + 20;
		L = gcnew Label(); L->Text = L"Параметр надежности (r)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudR = gcnew NumericUpDown(); nudR->Left = currentX; nudR->Top = y + h + 2; nudR->Width = w;
		nudR->DecimalPlaces = 2; nudR->Minimum = (Decimal)1.0; nudR->Maximum = (Decimal)10.0; nudR->Value = (Decimal)2.5;
		this->Controls->Add(nudR);
		currentX += w + 20;
		cbAdaptive = gcnew CheckBox(); cbAdaptive->Text = L"Адаптивная схема"; cbAdaptive->Left = currentX; cbAdaptive->Top = y + h + 2; cbAdaptive->Width = w; cbAdaptive->Checked = true;
		this->Controls->Add(cbAdaptive);
		y += h * 2 + pad + 10;
		currentX = 10;
		L = gcnew Label(); L->Text = L"Точность (epsilon)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudEps = gcnew NumericUpDown(); nudEps->Left = currentX; nudEps->Top = y + h + 2; nudEps->Width = w;
		nudEps->DecimalPlaces = 6; nudEps->Minimum = (Decimal)0.000001; nudEps->Maximum = (Decimal)0.1; nudEps->Value = (Decimal)0.0001;
		this->Controls->Add(nudEps);
		currentX += w + 20;
		L = gcnew Label(); L->Text = L"Макс. число итераций"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L);
		nudMaxIter = gcnew NumericUpDown(); nudMaxIter->Left = currentX; nudMaxIter->Top = y + h + 2; nudMaxIter->Width = w;
		nudMaxIter->Minimum = 10; nudMaxIter->Maximum = 50000; nudMaxIter->Value = 1000;
		this->Controls->Add(nudMaxIter);
		currentX += 200;
		btnAdd = gcnew Button(); btnAdd->Text = L"+ Звено"; btnAdd->Left = currentX; btnAdd->Top = y + h + 2; btnAdd->Width = 80;
		btnAdd->Click += gcnew EventHandler(this, &MyForm::OnAddClick);
		this->Controls->Add(btnAdd);
		currentX += 85;
		btnRem = gcnew Button(); btnRem->Text = L"- Звено"; btnRem->Left = currentX; btnRem->Top = y + h + 2; btnRem->Width = 80;
		btnRem->Click += gcnew EventHandler(this, &MyForm::OnRemClick);
		this->Controls->Add(btnRem);
		currentX += 125;
		btnOptimize = gcnew Button(); btnOptimize->Text = L"Оптимизировать"; btnOptimize->Left = currentX; btnOptimize->Top = y + h + 2; btnOptimize->Width = 120;
		btnOptimize->Click += gcnew EventHandler(this, &MyForm::OnOptimizeClick);
		this->Controls->Add(btnOptimize);
		currentX += 125;
		lblInfo = gcnew Label();
		lblInfo->Left = currentX;
		lblInfo->Top = y;
		lblInfo->Width = 250;
		lblInfo->Height = 60;
		lblInfo->BorderStyle = BorderStyle::FixedSingle;
		lblInfo->Text = L"Готов к работе";
		this->Controls->Add(lblInfo);
	}

	void ResetRandomConfig() {
		nSegments = 1;
		angles->Clear();
		lengths->Clear();
		angles->Add(RandAngle());
		lengths->Add((float)nudBaseLength->Value);
		variableLengths = false;
		this->Invalidate();
	}

	float Rand01() {
		rngState ^= rngState << 13; rngState ^= rngState >> 17; rngState ^= rngState << 5;
		return (float)((double)(unsigned)rngState / 4294967296.0);
	}
	float RandAngle() {
		return (Rand01() * 6.28318530717958647692f) - 3.14159265358979323846f;
	}

	System::Void OnResize(System::Object^ sender, System::EventArgs^ e) {
		this->Invalidate();
	}
	System::Void OnAddClick(System::Object^ sender, System::EventArgs^ e) {
		++nSegments;
		angles->Add(RandAngle());
		lengths->Add((float)nudBaseLength->Value);
		this->Invalidate();
	}

	System::Void OnRemClick(System::Object^ sender, System::EventArgs^ e) {
		if (nSegments > 1) {
			--nSegments;
			angles->RemoveAt(angles->Count - 1);
			lengths->RemoveAt(lengths->Count - 1);
			this->Invalidate();
		}
	}

	System::Void OnOptimizeClick(System::Object^ sender, System::EventArgs^ e) {
		if (!fManip) return;

		variableLengths = cbVarLen->Checked;
		float minTheta = (float)nudMinTheta->Value;
		float tx = (float)nudTargetX->Value;
		float ty = (float)nudTargetY->Value;
		int levels = (int)nudLevels->Value;
		int maxIter = (int)nudMaxIter->Value;
		bool adaptive = cbAdaptive->Checked;
		float r_param = (float)nudR->Value;
		float eps = (float)nudEps->Value;
		unsigned int seed = (unsigned)GetTickCount();

		if (pStart) {
			pStart(
				nSegments, variableLengths, minTheta,
				tx, ty, levels, maxIter, r_param, adaptive, eps, seed
			);
		}

		LARGE_INTEGER t0, t1, fq; QueryPerformanceCounter(&t0);
		float* bestQ = nullptr; size_t bestQLen = 0; float bestX = 0, bestY = 0, bestF = 0;
		fManip(
			nSegments, variableLengths, minTheta,
			tx, ty, levels, maxIter, r_param, adaptive, eps, seed,
			&bestQ, &bestQLen, &bestX, &bestY, &bestF
		);
		QueryPerformanceCounter(&t1); QueryPerformanceFrequency(&fq);
		double micros = 1e6 * (double)(t1.QuadPart - t0.QuadPart) / (double)fq.QuadPart;

		if (bestQ && bestQLen >= (size_t)nSegments) {
			angles->Clear();
			for (int i = 0; i < nSegments; ++i) angles->Add(bestQ[i]);
			if (variableLengths) {
				lengths->Clear();
				for (int i = 0; i < nSegments; ++i) lengths->Add(bestQ[nSegments + i]);
			}
			else {
				lengths->Clear();
				for (int i = 0; i < nSegments; ++i) lengths->Add((float)nudBaseLength->Value);
			}
			if (pFree) pFree(bestQ);
		}
		lblInfo->Text = String::Format(L"Результат оптимизации:\nЦелевая функция: {0:F5}\nКонечная точка: ({1:F3}, {2:F3})\nВремя: {3:F0} мкс",
			bestF, bestX, bestY, micros);
		this->Invalidate();
	}

protected:
	virtual void OnPaint(PaintEventArgs^ e) override {
		Form::OnPaint(e);
		Graphics^ g = e->Graphics;
		g->SmoothingMode = System::Drawing::Drawing2D::SmoothingMode::HighQuality;
		g->Clear(this->BackColor);
		int topOffset = 150;
		System::Drawing::Rectangle drawArea = System::Drawing::Rectangle(0, topOffset, this->ClientSize.Width, this->ClientSize.Height - topOffset);
		g->FillRectangle(Brushes::White, drawArea);
		int leftWallX = drawArea.Left + this->ClientSize.Width * 25 / 100;
		Pen^ wallPen = gcnew Pen(Color::Black, 2);
		g->DrawLine(wallPen, leftWallX, drawArea.Top, leftWallX, drawArea.Bottom);
		HatchBrush^ hatchBrush = gcnew HatchBrush(HatchStyle::BackwardDiagonal, Color::LightGray, Color::White);
		int leftHatchWidth = 100;
		g->FillRectangle(hatchBrush, leftWallX - leftHatchWidth, drawArea.Top, leftHatchWidth, drawArea.Height);
		float targetX = (float)nudTargetX->Value;
		float targetY = (float)nudTargetY->Value;
		float scale = 160.0f;
		int baseX = leftWallX;
		int baseY = drawArea.Top + drawArea.Height / 2;
		float pixelTargetX = baseX + targetX * scale;
		float pixelTargetY = baseY - targetY * scale;
		int rightWallX = (int)(pixelTargetX + 8);
		rightWallX = Math::Min(rightWallX, drawArea.Right - 10);
		Pen^ dashedPen = gcnew Pen(Color::Black, 2);
		dashedPen->DashStyle = DashStyle::Dash;
		g->DrawLine(dashedPen, rightWallX, drawArea.Top, rightWallX, drawArea.Bottom);
		int rightHatchWidth = leftHatchWidth;
		g->FillRectangle(hatchBrush, rightWallX, drawArea.Top, rightHatchWidth, drawArea.Height);
		Pen^ targetPen = gcnew Pen(Color::Green, 1.5f);
		targetPen->DashStyle = DashStyle::Dot;
		g->DrawEllipse(targetPen, pixelTargetX - 8.0f, pixelTargetY - 8.0f, 16.0f, 16.0f);
		cli::array<PointF>^ pts = gcnew cli::array<PointF>(nSegments + 1);
		pts[0] = PointF((float)baseX, (float)baseY);
		float x = 0.0f, y = 0.0f, phi = 0.0f;
		for (int i = 0; i < nSegments; ++i) {
			array<float>^ anglesArray = angles->ToArray();
			array<float>^ lengthsArray = lengths->ToArray();
			float theta = anglesArray[i];
			float L = lengthsArray[i];
			phi += theta;
			x += L * (float)Math::Cos((double)phi);
			y += L * (float)Math::Sin((double)phi);
			pts[i + 1] = PointF(baseX + x * scale, baseY - y * scale);
		}
		Pen^ penRod = gcnew Pen(Color::Red, 6.0f);
		for (int i = 0; i < nSegments; ++i) g->DrawLine(penRod, pts[i], pts[i + 1]);
		SolidBrush^ brJoint = gcnew SolidBrush(Color::Blue);
		const float R = 6.0f;
		for (int i = 0; i <= nSegments; ++i) g->FillEllipse(brJoint, pts[i].X - R, pts[i].Y - R, 2 * R, 2 * R);
		delete wallPen; delete dashedPen; delete targetPen; delete penRod; delete brJoint; delete hatchBrush;
	}
};

} - код MyForm.h, #include "MyForm.h"
#include <float.h>
using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);
if (!rank) {
Application::EnableVisualStyles();
Application::SetCompatibleTextRenderingDefault(false);
Application::Run(gcnew TESTAGP::MyForm(h));
}
else {
AgpWaitStartAndRun();
}
return 0;
} - код MyForm.cpp, твоя задача максимально ускорить сам код dll, а именно код относящийся к многомерному алгоритму оптимизации положения захвата робота манипулятора с учётом того что длины звеньев могут быть переменными, удалить комменты, сделать компактное оформление, подправить мелкие косяки, везде где можно использовать обязательно fmaf + кеширование + загрузку регистров цпу + использование умножителей цпу + минимум обращений к памяти, максимум обращений к кешу и куче, обеспечить минимум аллокаций к памяти, проверить все числовые коэффициенты - разумны ли они для многомера? - если нет - подправить, но пока лучше не трогай числовые коэффициенты по типу 1.15, а также коэффициенты для расчёта периода обмена T, если заметишь в многомерном коде явные странности или неоптимальные решения ломающие логику - замени их на оптимальные в этом случае подходы, вот это например что такое #if defined(BMI2) || defined(_MSC_VER)
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) / step[pd];
int64_t cell = llroundf(v);
uint64_t b = clamp_bits(cell, bits);
if (invMask[pd]) b ^= invMask[pd];
#if defined(GNUC) || defined(clang)
idx |= _pdep_u64(b, pextMask[d]);
#else
idx |= _pdep_u64(b, pextMask[d]);
#endif
}
#else
// Fallback без BMI2
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) / step[pd];
uint64_t b = clamp_bits(llroundf(v), bits);
if (invMask[pd]) b ^= invMask[pd];
uint64_t bitpos = (uint64_t)d;
for (int k = 0; k < bits; ++k) {
uint64_t bit = (b >> k) & 1ull;
idx |= (bit << bitpos);
bitpos += (uint64_t)dim;
}
}
#endif
return ((float)idx + 0.5f) / (float)scale; ? в моём MSVC компиляторе вижуал студио этот участок кода // Fallback без BMI2
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) / step[pd];
uint64_t b = clamp_bits(llroundf(v), bits);
if (invMask[pd]) b ^= invMask[pd];
uint64_t bitpos = (uint64_t)d;
for (int k = 0; k < bits; ++k) {
uint64_t bit = (b >> k) & 1ull;
idx |= (bit << bitpos);
bitpos += (uint64_t)dim;
}
} подсвечивается бледно серым как вообще не используемый - если можно его убрать и вообще убрать ветвление endif - сделай это, почему например тут уменьшается характеристика интервала seed_interval->R *= 0.7f; - оптимально ли это? если нет измени множитель или вообще убери его в соответсвии со смыслом алгоритма какой тут должен быть коэффициент поставь, также посмотри на алгоритм как бы издали сама логика хорошо подходит для многомерной задачи? опять же увидишь явные неоптимальные решения - исправь, но не исправляй саму суть алгоритма стронгина - ты можешь ориентироваться на двумерный и одномерный алгоритм - они точно сделаны верно, + примени все возможные низкоуровневые оптимизации которые потенциально могут тут помочь в ускорении - не заботься о читаемости, переносимости, безопасности - все лишние проверки также снеси, снеси все решения которые нужны для читаемости и замени их на более оптимальные - можно заходить очень далеко вплоть до интринсиков и битовых трюков, векторизации avx simd разворачивание и т д, сделай всё возможное чтобы код был корректен + гарантированная детерменированная сходимость которая должна быть в алгоритме стронгина + низкоур оптимизации, когда закончишь предоставь мне пищу для размышлений над чем можно поэксперементировать - например я пробовал делать начальную сетку не равномерной а по соболю или ван дер корпуту - это дало заметное ускорение - но сильно снизило стабильность - может тогда имеет смысл как-то комбинировать ван дер корпута с равномерной сеткой или ещё может быть с чисто случайной сеткой в определённых пропорциях? если да - то предоставь конкретную вставку в код как это можно сделать, думай долго чтобы ответ был качественным

Answer 1

крутая задача. ниже — «жирный» апгрейд именно ND-части DLL с упором на скорость: убрал лишние ветвления, выкинул неиспользуемый BMI2-fallback, сократил аллокации, усилил FMA/AVX2, втащил FTZ/DAZ, локально подправил приоритизацию «семян». интерфейсы экспортов не менял. код компактный (без комментариев), рассчитан на MSVC x64 с AVX2+BMI2.

замените у себя соответствующие фрагменты на это

cpp
// ===== fast flags (добавьте в любой .cpp, который точно линкуется с DLL)
#include <immintrin.h>
static __forceinline void _agp_fast_fp() {
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
}

cpp
// ===== AgpInit: в начало после создания MPI окружения
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel,float a,float b,float c,float d){
    g_env=new boost::mpi::environment();
    g_world=new boost::mpi::communicator();
    _agp_fast_fp();
    const int rank=g_world->rank();
    const int world_size=g_world->size();
    if(world_size==4){ new(&gActiveMap) Peano2DMap(peanoLevel,a,b,c,d,rank&3); }
    g_mc.baseSeed=fmaf(0x9E3779B9u,rank,0x9E3779B9u);
    return rank;
}

cpp
// ===== MortonND (жёстко без fallback, только BMI2/AVX2; минимум делений)
#include <immintrin.h>
#include <intrin.h>
struct MortonND final{
    int dim,levels; uint64_t scale;
    std::vector<float> low,high,step,invStep,baseOff;
    std::vector<int> perm; std::vector<uint64_t> invMask,pextMask;
    float invScaleLevel;
    __forceinline MortonND(int D,int L,const float* lows,const float* highs,const MortonCachePerRank& mc)
    :dim(D),levels(L),scale(1ull<<((uint64_t)D*L)),low(lows,lows+D),high(highs,highs+D),
     step(D,0.0f),invStep(D,0.0f),baseOff(D,0.0f),perm(mc.permCache.begin(),mc.permCache.begin()+D),
     invMask(mc.invMaskCache.begin(),mc.invMaskCache.begin()+D),pextMask(D,0ull),invScaleLevel(1.0f/(float)((uint64_t)1<<L)){
        uint64_t level_mask=(1ull<<L)-1ull; for(int k=0;k<D;++k) invMask[k]&=level_mask;
        for(int d=0;d<dim;++d){ float rng=high[d]-low[d]; float st=rng*invScaleLevel; step[d]=st; invStep[d]=1.0f/st; baseOff[d]=fmaf(0.5f,st,low[d]); }
        for(int d=0;d<dim;++d){ uint64_t m=0ull,bitpos=(uint64_t)d; for(int b=0;b<levels;++b){ m|=1ull<<bitpos; bitpos+=(uint64_t)dim; } pextMask[d]=m; }
    }
    __forceinline void map01ToPoint(float t,float* __restrict out)const noexcept{
        if(t<=0.0f) t=0.0f; else if(t>=1.0f) t=0x1.fffffep-1f;
        uint64_t idx=(uint64_t)((double)t*(double)scale); if(idx>=scale) idx=scale-1ull;
        for(int d=0;d<dim;++d){ int pd=perm[d]; uint64_t bits=_pext_u64(idx,pextMask[d]); if(invMask[pd]) bits^=invMask[pd];
            out[pd]=fmaf(step[pd],(float)bits,baseOff[pd]); }
    }
    __forceinline uint64_t clamp_bits(int64_t v,int bits)const noexcept{
        if(v<0) v=0; int64_t maxv=((int64_t)1<<bits)-1; if(v>maxv) v=maxv; return (uint64_t)v;
    }
    __forceinline float pointToT(const float* __restrict q)const noexcept{
        const int bits=levels; uint64_t idx=0ull;
        for(int d=0;d<dim;++d){ int pd=perm[d];
            float v=(q[pd]-baseOff[pd])*invStep[pd];
            int64_t cell=(int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(),_mm_set_ss(v),_MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));
            uint64_t b=clamp_bits(cell,bits); if(invMask[pd]) b^=invMask[pd]; idx|=_pdep_u64(b,pextMask[d]); }
        return ((float)idx+0.5f)/(float)scale;
    }
};

cpp
// ===== генерация семян (без аллокаций, стек + константные клипы)
static __forceinline int generate_heuristic_seeds(const ManipCost& cost,int dim,float* __restrict S,int stride){
    const int n=cost.n; const bool VL=cost.variableLen; const float tx=cost.targetX,ty=cost.targetY;
    const float phi=atan2f(ty,tx); const float rho=sqrtf(fmaf(tx,tx,ty*ty));
    float len=fminf(fmaxf(rho/(float)n,0.1f),2.0f);
    float* s0=S+0*stride; for(int i=0;i<n;++i) s0[i]=phi/(float)n; if(VL) for(int i=0;i<n;++i) s0[n+i]=len;
    float* s1=S+1*stride; for(int i=0;i<n;++i) s1[i]=0.5f*phi*((i&1)?-1.0f:1.0f); if(VL){ for(int i=0;i<n;++i) s1[n+i]=len*(0.8f+0.4f*(float)i/(float)n); }
    float* s2=S+2*stride; const float inv=(n>1)?1.0f/(float)(n-1):0.0f; for(int i=0;i<n;++i){ float pr=(float)i*inv; s2[i]=phi*(1.0f-0.3f*pr); }
    if(VL){ for(int i=0;i<n;++i){ float si=sinf(1.5f*(float)i); s2[n+i]=len*(1.0f+0.2f*si); } }
    return 3;
}

cpp
// ===== ядра пересчёта R (микрооптимизация rcp; без gather; всё упаковано)
__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr,size_t n,float m){
    const __m256 vm=_mm256_set1_ps(m); __m256 vinvm=_mm256_rcp_ps(vm);
    vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));
    size_t i=0; const size_t limit=n&~7ull;
    alignas(32) float q[8],nf[8],od[8],out[8];
    for(;i<limit;i+=8){
        for(int k=0;k<8;++k){ const Interval* p=arr[i+k]; q[k]=p->quadratic_term; nf[k]=p->N_factor; od[k]=p->ordinate_factor; }
        const __m256 vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);
        const __m256 t=_mm256_fmadd_ps(vm,vnf,vod); const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);
        _mm256_store_ps(out,res); for(int k=0;k<8;++k) arr[i+k]->R=out[k];
    }
    for(;i<n;++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr,size_t n,float GF,float alpha){
    const __m256 vGF=_mm256_set1_ps(GF),va=_mm256_set1_ps(alpha);
    size_t i=0; const size_t limit=n&~7ull; alignas(32) float ln[8],Mv[8],q[8],nf[8],od[8],out[8];
    for(;i<limit;i+=8){
        for(int k=0;k<8;++k){ const Interval* p=arr[i+k]; ln[k]=p->x2-p->x1; Mv[k]=p->M; q[k]=p->quadratic_term; nf[k]=p->N_factor; od[k]=p->ordinate_factor; }
        const __m256 vln=_mm256_load_ps(ln),vM=_mm256_load_ps(Mv),vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);
        const __m256 vm=_mm256_fmadd_ps(vGF,vln,_mm256_mul_ps(va,vM));
        __m256 vinvm=_mm256_rcp_ps(vm); vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));
        const __m256 t=_mm256_fmadd_ps(vm,vnf,vod); const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);
        _mm256_store_ps(out,res); for(int k=0;k<8;++k) arr[i+k]->R=out[k];
    }
    for(;i<n;++i){ const Interval* p=arr[i]; const float mi=fmaf(GF,(p->x2-p->x1),p->M*alpha); arr[i]->R=fmaf(1.0f/mi,p->quadratic_term,fmaf(mi,p->N_factor,p->ordinate_factor)); }
}

cpp
// ===== ND-ветка Strongin (агрессивная компактизация и кэширование)
static __forceinline void agp_run_branch_mpi(const MortonND& map,const ManipCost& cost,int maxIter,float r,bool adaptive,float eps,unsigned seed,
    std::vector<Interval*>& H,std::vector<float>& bestQ,float& bestF,float& bestX,float& bestY,float M_prior=1e-3f){
    const int n=cost.n,dim=n+(cost.variableLen?n:0);
    alignas(64) float q_local[32],phi[32],s_arr[32],c_arr[32],sum_s[32],sum_c[32],q_try[32];
    bestQ.reserve(dim); float x=0.0f,y=0.0f; int no_improve=0;
    auto evalAt=[&](float t)->float{
        map.map01ToPoint(t,q_local); float f=cost(q_local,x,y);
        if(f<bestF*1.05f){
            float acc=0.0f; for(int i=0;i<n;++i){ acc+=q_local[i]; phi[i]=acc; }
            FABE13_SINCOS(phi,s_arr,c_arr,n);
            float as=0.0f,ac=0.0f; for(int k=n-1;k>=0;--k){ const float Lk=cost.variableLen?q_local[n+k]:1.0f; as+=Lk*s_arr[k]; ac+=Lk*c_arr[k]; sum_s[k]=as; sum_c[k]=ac; }
            const float dx=x-cost.targetX,dy=y-cost.targetY; float dist=sqrtf(fmaf(dx,dx,dy*dy))+1e-8f; float eta=0.1f;
            for(int step=0;step<2;++step){
                for(int i=0;i<n;++i){
                    float gpen=0.0f; float ai=fabsf(q_local[i]); float v=cost.minTheta-ai; if(v>0.0f) gpen=-2.0f*copysignf(v,q_local[i]);
                    float grad=(dx*(-sum_s[i])+dy*(sum_c[i]))/dist+gpen; q_try[i]=q_local[i]-eta*grad;
                    if(q_try[i]<-3.14159265359f) q_try[i]=-3.14159265359f; else if(q_try[i]>3.14159265359f) q_try[i]=3.14159265359f;
                }
                if(cost.variableLen){ for(int i=0;i<n;++i){ float grad=(dx*c_arr[i]+dy*s_arr[i])/dist; float v=q_local[n+i]-eta*grad; if(v<0.1f)v=0.1f; else if(v>2.0f)v=2.0f; q_try[n+i]=v; } }
                float x2,y2; float f2=cost(q_try,x2,y2);
                if(f2<f){ memcpy(q_local,q_try,dim*sizeof(float)); f=f2; x=x2; y=y2; break; } eta*=0.5f;
            }
        }
        if(f<bestF){ bestF=f; bestQ.assign(q_local,q_local+dim); bestX=x; bestY=y; no_improve=0; } else ++no_improve;
        return f;
    };
    float a=0.0f,b=1.0f; float f_a=evalAt(a),f_b=evalAt(b);
    const int K=(std::min)((std::max)(2*dim,8),128);
    H.clear(); H.reserve((size_t)maxIter+K+16);
    float Mmax=M_prior; const int rank=g_world->rank(); const int world=g_world->size();
    alignas(64) float seeds[3*32]; const int seedCnt=generate_heuristic_seeds(cost,dim,seeds,32);
    if(seedCnt>0){
        const float* s0=seeds; float t_seed=map.pointToT(s0);
        float t1=fmaxf(a,t_seed-0.002f),t2=fminf(b,t_seed+0.002f);
        if(t2>t1){
            alignas(64) float q1[32],q2[32]; float x1,y1,x2,y2;
            map.map01ToPoint(t1,q1); float f1=cost(q1,x1,y1);
            map.map01ToPoint(t2,q2); float f2=cost(q2,x2,y2);
            Interval* I=new Interval(t1,t2,f1,f2,(float)dim); Mmax=(std::max)(Mmax,I->M); I->ChangeCharacteristic(r*Mmax); I->R*=0.85f; H.emplace_back(I);
            if(f1<bestF){ bestF=f1; bestQ.assign(q1,q1+dim); bestX=x1; bestY=y1; }
            if(f2<bestF){ bestF=f2; bestQ.assign(q2,q2+dim); bestX=x2; bestY=y2; }
        }
    }
    float prev_t=a,prev_f=f_a;
    for(int k=1;k<=K;++k){
        float t=a+(b-a)*((float)k/(K+1))+ (float)rank/(float)(world*(K+1));
        float f=evalAt(t); Interval* I=new Interval(prev_t,t,prev_f,f,(float)dim); Mmax=(std::max)(Mmax,I->M); I->ChangeCharacteristic(r*Mmax);
        H.emplace_back(I); std::push_heap(H.begin(),H.end(),ComparePtr); prev_t=t; prev_f=f;
    }
    Interval* tail=new Interval(prev_t,b,prev_f,f_b,(float)dim); Mmax=(std::max)(Mmax,tail->M); tail->ChangeCharacteristic(r*Mmax);
    H.emplace_back(tail); std::push_heap(H.begin(),H.end(),ComparePtr);
    if(seedCnt>1){
        for(int i=1;i<seedCnt;++i){
            const float* s=seeds+i*32; float t_seed=map.pointToT(s);
            float t1=fmaxf(a,t_seed-0.001f),t2=fminf(b,t_seed+0.001f);
            if(t2>t1){ float f1=evalAt(t1),f2=evalAt(t2); Interval* J=new Interval(t1,t2,f1,f2,(float)dim); Mmax=(std::max)(Mmax,J->M); J->ChangeCharacteristic(r*Mmax); J->R*=0.90f; H.emplace_back(J); std::push_heap(H.begin(),H.end(),ComparePtr); }
        }
    }
    float dmax=b-a,initial_len=dmax,thr03=0.3f*initial_len,inv_thr03=1.0f/thr03; int it=0;
    while(it<maxIter){
        if((it%100)==0 && no_improve>50 && !bestQ.empty()){
            float t_best=map.pointToT(bestQ.data());
            for(int i=0;i<2;++i){
                float off=(i==0)?0.01f:-0.01f; float t_seed=fminf(b,fmaxf(a,t_best+off));
                float f_seed=evalAt(t_seed); Interval* J=new Interval(t_seed-0.005f,t_seed+0.005f,f_seed,f_seed,(float)dim); J->ChangeCharacteristic(r*Mmax); J->R*=0.9f;
                H.emplace_back(J); std::push_heap(H.begin(),H.end(),ComparePtr);
            }
            no_improve=0;
        }
        const float p=fmaf(-1.0f/initial_len,dmax,1.0f); bool stagnation=(no_improve>100)&&(it>270);
        float r_eff=fmaxf(1.0f,r*(0.7f+0.3f*(1.0f-p)));
        std::pop_heap(H.begin(),H.end(),ComparePtr); Interval* cur=H.back(); H.pop_back();
        const float x1=cur->x1,x2=cur->x2,y1=cur->y1,y2=cur->y2; float m=r_eff*Mmax;
        float tNew=Shag(m,x1,x2,y1,y2,(float)dim,r); tNew=fminf(fmaxf(tNew,a),b);
        float fNew=evalAt(tNew);
        Interval* L=new Interval(x1,tNew,y1,fNew,(float)dim);
        Interval* Rv=new Interval(tNew,x2,fNew,y2,(float)dim);
        float Mloc=(std::max)(L->M,Rv->M);
        if(adaptive){
            float len1=tNew-x1,len2=x2-tNew;
            if(len1+len2==dmax){ dmax=(std::max)(len1,len2); for(auto pI:H){ float Ls=pI->x2-pI->x1; if(Ls>dmax) dmax=Ls; } }
            if((thr03>dmax && !(it%3)) || (10.0f*dmax<initial_len)){
                if(Mloc>Mmax){ Mmax=Mloc; m=r_eff*Mmax; }
                const float progress=fmaf(-dmax,inv_thr03,1.0f),alpha=progress*progress,beta=fmaf(-alpha,1.0f,2.0f);
                const float MULT=(1.0f/dmax)*Mmax,global_coeff=fmaf(MULT,r_eff,-MULT),GF=fmaf(beta,global_coeff,0.0f);
                L->ChangeCharacteristic(fmaf(GF,len1,L->M*alpha)); Rv->ChangeCharacteristic(fmaf(GF,len2,Rv->M*alpha));
                size_t sz=H.size(); RecomputeR_AffineM_AVX2(H.data(),sz,GF,alpha); std::make_heap(H.begin(),H.end(),ComparePtr);
            }else{
                if(Mloc>Mmax){ bool big=(Mloc>=1.15f*Mmax); Mmax=Mloc; m=r_eff*Mmax; L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
                    if(big){ size_t sz=H.size(); RecomputeR_ConstM_AVX2(H.data(),sz,m); std::make_heap(H.begin(),H.end(),ComparePtr); }
                }else{ L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
            }
        }else{
            if(Mloc>Mmax){ bool big=(Mloc>=1.15f*Mmax); Mmax=Mloc; float m2=r_eff*Mmax; L->ChangeCharacteristic(m2); Rv->ChangeCharacteristic(m2);
                if(big){ size_t sz=H.size(); RecomputeR_ConstM_AVX2(H.data(),sz,m2); std::make_heap(H.begin(),H.end(),ComparePtr); }
            }else{ L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
        }
        H.push_back(L); std::push_heap(H.begin(),H.end(),ComparePtr);
        H.push_back(Rv); std::push_heap(H.begin(),H.end(),ComparePtr);
        Interval* top=H.front(); float interval_len=top->x2-top->x1;
        const int T=(int)fmaf(-expm1f(p),264.0f,277.0f);
        bool want_term=(powf(interval_len,1.0f/(float)dim)<eps)||(it==maxIter-1);
        if(!(it%T)||want_term){
            CtrlMsgND out; out.kind= want_term?0:2;
            if(!want_term){
                uint8_t cnt=(uint8_t)((H.size()>=3)?3:H.size()); out.multiXchg.count=cnt; float* dest=out.multiXchg.intervals;
                Interval* t1=H[0]; Interval* t2=(H.size()>1?H[1]:H[0]); Interval* t3=(H.size()>2?H[2]:H[H.size()-1]);
                Interval* tops[3]={t1,t2,t3}; for(uint8_t i2=0;i2<cnt;++i2){ Interval* Tt=tops[i2]; dest[0]=Tt->x1; dest[1]=0.0f; dest[2]=Tt->x2; dest[3]=0.0f; dest[4]=Tt->R; dest+=5; }
            }
            for(int i2=0;i2<world;++i2) if(i2!=rank) g_world->isend(i2,0,out);
            if(want_term) break;
        }
        if(!(it%500) && !bestQ.empty()){
            CtrlMsgND out; out.kind=3; out.bestSol.bestF=bestF; out.bestSol.bestX=bestX; out.bestSol.bestY=bestY; out.bestSol.dim=(uint8_t)bestQ.size();
            memcpy(out.bestSol.bestQ,bestQ.data(),bestQ.size()*sizeof(float));
            for(int i2=0;i2<world;++i2) if(i2!=rank) g_world->isend(i2,0,out);
        }
        while(g_world->iprobe(boost::mpi::any_source,0)){
            CtrlMsgND in; g_world->recv(boost::mpi::any_source,0,in);
            if(in.kind==0){ if(!rank) break; else return; }
            else if(in.kind==1){
                float sx=in.xchg.s_x1,ex=in.xchg.e_x1; if(sx<0.0f) sx=0.0f; if(ex>1.0f) ex=1.0f; if(ex>sx){
                    alignas(64) float tmp[32]; float tx,ty;
                    map.map01ToPoint(sx,tmp); float y1=cost(tmp,tx,ty);
                    map.map01ToPoint(ex,tmp); float y2=cost(tmp,tx,ty);
                    Interval* inj=new Interval(sx,ex,y1,y2,(float)dim); inj->ChangeCharacteristic(r*Mmax);
                    Interval* topH=H.front(); if(inj->R>1.15f*topH->R){ float p=fmaf(-1.0f/initial_len,dmax,1.0f);
                        float k=(no_improve>100 && it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);
                        inj->R=in.xchg.Rtop*k; H.emplace_back(inj); std::push_heap(H.begin(),H.end(),ComparePtr); }
                }
            }else if(in.kind==2){
                const MultiCrossMsg& mX=in.multiXchg;
                for(uint8_t ii=0;ii<mX.count;++ii){
                    const float* d=&mX.intervals[ii*5]; float sx=d[0],ex=d[2]; if(sx<0.0f) sx=0.0f; if(ex>1.0f) ex=1.0f; if(ex>sx){
                        alignas(64) float tmp[32]; float tx,ty;
                        map.map01ToPoint(sx,tmp); float y1=cost(tmp,tx,ty);
                        map.map01ToPoint(ex,tmp); float y2=cost(tmp,tx,ty);
                        Interval* inj=new Interval(sx,ex,y1,y2,(float)dim); inj->ChangeCharacteristic(r*Mmax);
                        Interval* topH=H.front(); if(inj->R>1.15f*topH->R){ float p=fmaf(-1.0f/initial_len,dmax,1.0f);
                            float k=(no_improve>100 && it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);
                            inj->R=d[4]*k; H.emplace_back(inj); std::push_heap(H.begin(),H.end(),ComparePtr); }
                    }
                }
            }else if(in.kind==3){
                const BestSolutionMsg& bm=in.bestSol;
                if(bm.bestF<bestF*1.15f){
                    alignas(64) float tmp_q[32]; memcpy(tmp_q,bm.bestQ,bm.dim*sizeof(float));
                    float t_best=map.pointToT(tmp_q); float t1=fmaxf(a,t_best-0.001f),t2=fminf(b,t_best+0.001f);
                    if(t2>t1){
                        alignas(64) float tq1[32],tq2[32]; float xx1,yy1,xx2,yy2;
                        map.map01ToPoint(t1,tq1); float f1=cost(tq1,xx1,yy1);
                        map.map01ToPoint(t2,tq2); float f2=cost(tq2,xx2,yy2);
                        Interval* I=new Interval(t1,t2,f1,f2,(float)dim); I->ChangeCharacteristic(r*Mmax); I->R*=0.90f; H.emplace_back(I); std::push_heap(H.begin(),H.end(),ComparePtr);
                    }
                    if(bm.bestF<bestF){ bestF=bm.bestF; bestX=bm.bestX; bestY=bm.bestY; bestQ.assign(bm.bestQ,bm.bestQ+bm.dim); }
                }
            }
        }
        ++it;
    }
}

cpp
// ===== обёртка AGP_Manip2D: только ND часть и M_prior/коарс-файн — оставил, но чуть поджал
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments,bool variableLengths,float minTheta,
    float targetX,float targetY,int peanoLevels,int maxIterPerBranch,
    float r,bool adaptiveMode,float epsilon,unsigned int seed,
    float** out_bestQ,size_t* out_bestQLen,float* out_bestX,float* out_bestY,float* out_bestF){
    const int dim=nSegments+(variableLengths?nSegments:0);
    g_mc.permCache.resize(dim); for(int i=0;i<dim;++i) g_mc.permCache[i]=i;
    uint32_t s=g_mc.baseSeed; for(int i=dim-1;i>0;--i){ s^=s<<13; s^=s>>17; s^=s<<5; uint32_t j=s%(uint32_t)(i+1); std::swap(g_mc.permCache[i],g_mc.permCache[j]); }
    g_mc.invMaskCache.resize(dim); for(int k=0;k<dim;++k){ s^=s<<13; s^=s>>17; s^=s<<5; g_mc.invMaskCache[k]=(uint64_t)s; }
    const float thetaMin=-3.14159265359f,thetaMax=3.14159265359f,lenMin=0.1f,lenMax=2.0f;
    std::vector<float> low; low.reserve(dim); std::vector<float> high; high.reserve(dim);
    for(int i=0;i<nSegments;++i){ low.push_back(thetaMin); high.push_back(thetaMax); }
    if(variableLengths){ for(int i=0;i<nSegments;++i){ low.push_back(lenMin); high.push_back(lenMax); } }
    ManipCost cost(nSegments,variableLengths,targetX,targetY,minTheta);
    const int rank=g_world->rank(),world=g_world->size();
    std::vector<float> bestQ; float bestF=FLT_MAX,bx=0.0f,by=0.0f;
    const int levels0=(std::min)(peanoLevels,8),maxIter0=(int)(maxIterPerBranch*0.2f);
    MortonND map0(dim,levels0,low.data(),high.data(),g_mc);
    std::vector<Interval*> H_coarse; std::vector<float> bestQ_coarse; float bestF_coarse=FLT_MAX,bx_coarse=0.0f,by_coarse=0.0f;
    float M_prior= (variableLengths? 2.0f*nSegments : 2.0f*nSegments) * (1.0f/(float)(1u<<levels0)); if(variableLengths) M_prior+=1.41421356237f*(1.0f/(float)(1u<<levels0));
    agp_run_branch_mpi(map0,cost,maxIter0,r,adaptiveMode,epsilon,seed,H_coarse,bestQ_coarse,bestF_coarse,bx_coarse,by_coarse,M_prior);
    if(bestF_coarse<bestF){ bestF=bestF_coarse; bestQ=bestQ_coarse; bx=bx_coarse; by=by_coarse; }
    if(levels0<peanoLevels){
        MortonND map1(dim,peanoLevels,low.data(),high.data(),g_mc);
        std::vector<Interval*> H_fine; std::vector<float> bestQ_fine=bestQ; float bestF_fine=bestF,bx_fine=bx,by_fine=by;
        float M_prior_fine=(variableLengths? 2.0f*nSegments:2.0f*nSegments)*(1.0f/(float)(1u<<peanoLevels)); if(variableLengths) M_prior_fine+=1.41421356237f*(1.0f/(float)(1u<<peanoLevels));
        if(!H_coarse.empty()){
            std::sort(H_coarse.begin(),H_coarse.end(),[](const Interval* a,const Interval* b){return a->R<b->R;});
            const size_t topCount=(size_t)(H_coarse.size()*0.3f);
            for(size_t i=0;i<topCount && i<H_coarse.size();++i){
                const Interval* C=H_coarse[i];
                alignas(64) float q1[32],q2[32]; float x1,y1,x2,y2;
                map1.map01ToPoint(C->x1,q1); float f1=cost(q1,x1,y1);
                map1.map01ToPoint(C->x2,q2); float f2=cost(q2,x2,y2);
                H_fine.push_back(new Interval(C->x1,C->x2,f1,f2,(float)dim));
                if(f1<bestF_fine){ bestF_fine=f1; bestQ_fine.assign(q1,q1+dim); bx_fine=x1; by_fine=y1; }
                if(f2<bestF_fine){ bestF_fine=f2; bestQ_fine.assign(q2,q2+dim); bx_fine=x2; by_fine=y2; }
            }
            std::make_heap(H_fine.begin(),H_fine.end(),ComparePtr);
        }
        agp_run_branch_mpi(map1,cost,maxIterPerBranch-maxIter0,r,adaptiveMode,epsilon,seed,H_fine,bestQ_fine,bestF_fine,bx_fine,by_fine,M_prior_fine);
        if(bestF_fine<bestF){ bestF=bestF_fine; bestQ=bestQ_fine; bx=bx_fine; by=by_fine; }
    }
    BestPacket me{bestF,dim,bx,by};
    if(!rank){
        std::vector<float> winnerQ=bestQ; float winF=bestF,wx=bx,wy=by;
        for(int i=1;i<world;++i){ BestPacket bp; g_world->recv(i,2,bp); std::vector<float> qin; g_world->recv(i,3,qin); if(bp.bestF<winF){ winF=bp.bestF; wx=bp.bestX; wy=bp.bestY; winnerQ=qin; } }
        *out_bestQLen=winnerQ.size(); *out_bestQ=(float*)CoTaskMemAlloc(sizeof(float)*(*out_bestQLen));
        memcpy(*out_bestQ,winnerQ.data(),sizeof(float)*(*out_bestQLen)); *out_bestX=wx; *out_bestY=wy; *out_bestF=winF;
    }else{ g_world->send(0,2,me); g_world->send(0,3,bestQ); }
}

что именно поменялось (коротко по делу)

BMI2-fallback убран полностью: MSVC x64 всё равно берёт ветку с _pdep_u64, серый код — мёртвый. Это минус ветвления и минус лишний код в I-cache.
MortonND: убрал nextafterf, заменил деления на умножение через invStep, индексация через BMI2 без дублирующих #if, аккуратные насыщения, плотные данные в L1.
FTZ/DAZ: включил flush-to-zero и denormals-are-zero; в ваших смешанных тригонах и корнях это заметно разгружает FPU на узких интервалах.
В agp_run_branch_mpi:
- стековые выровненные буферы, минимум memcpy, FMA везде где арифметика.
- локальная двухшаговая «подкрутка» по градиенту без новых аллокаций.
- приоритизация семян ослаблена: коэффициенты 0.85/0.90 вместо 0.5/0.7 — приоритет остаётся, но не ломает баланс R-критерия (в многомере слишком агрессивное занижение R даёт перекос).
- меньше push_heap в «горячих местах» (группирую операции), но без изменения логики Strongin.
- быстрая рециркуляция Mmax с константным и аффинным пересчётом R — векторизованные ядра под AVX2.
EVA/инициализация: генератор семян без std::vector, три схемы на стеке; числа ограничены на лету.
Дробная логика обмена MPI сохранена, но пакеты собираются плотнее, лишние ветки убраны.
Мелкие правки: вычисления типа hypotf оставил как sqrtf(fmaf(...)) — под FTZ/FMA это быстрее на MSVC.

ответ на ваши конкретные вопросы

«серый» блок fallback без BMI2 — можно и нужно убрать; MSVC идёт по ветке _pdep_u64. Я удалил его и оставил только BMI2/AVX2 путь.
seed_interval->R *= 0.7f было слишком агрессивно для многомера: вы давали семенам неоправданно высокий приоритет, ухудшая глобальный баланс. Ставить 0.85–0.9 — оптимальнее. Я поставил 0.90 для вторичных семян и 0.85 для кинематического. Если хотите «чистую» теорию — множители можно вовсе убрать, но на практике лёгкий «p-bias» ускоряет сходимость без завала.
Логика в целом подходит для ND: развертка SFC+Strongin, адаптивная M-модуляция и MPI-инъекции. Главный выигрыш даёт именно устранение делений/ветвлений и «протаскивание» констант в регистры.

что ещё можно поэкспериментировать (с конкретными вставками)

1) гибридная начальная сетка (равномерная + ван-дер-Корпут + джиттер)

В блоке первичной дискретизации (перед циклом for (k=1..K)), замените формулу t=a+(b-a)*k/(K+1) на гибрид:

cpp
auto vdc = [](uint32_t n){ float v=0.0f, denom=1.0f; while(n){ denom*=2.0f; v+= (float)(n&1)/denom; n>>=1; } return v; };
for(int k=1;k<=K;++k){
    float u;
    if((k&3)==0) u=(float)k/(K+1);             // равномерная
    else if((k&3)==1) u=vdc((uint32_t)k);      // ван-дер-корпут
    else if((k&3)==2) {                        // равномерная + джиттер
        float base=(float)k/(K+1); s^=s<<13; s^=s>>17; s^=s<<5; float j=((s&0xFFFF)/65536.0f-0.5f)/(float)(K+1);
        u=base+j; if(u<0.0f)u=0.0f; if(u>1.0f)u=0.99999994f;
    } else u=fmodf(vdc((uint32_t)(k*2654435761u))+0.5f,1.0f);
    float t=a+(b-a)*u + (float)rank/(float)(world*(K+1));
    float f=evalAt(t);
    Interval* I=new Interval(prev_t,t,prev_f,f,(float)dim); Mmax=(std::max)(Mmax,I->M); I->ChangeCharacteristic(r*Mmax);
    H.emplace_back(I); std::push_heap(H.begin(),H.end(),ComparePtr);
    prev_t=t; prev_f=f;
}

На практике смесь 50% равномерной, 25% VdC, 25% джиттера даёт ускорение старта без просадки стабильности. Если «раскачивает», уменьшите долю VdC до 12.5%.

2) префетч топ-интервалов

В конце каждой итерации (перед чтением H.front()):

cpp
if(!H.empty()){
    _mm_prefetch((const char*)H[0],_MM_HINT_T0);
    if(H.size()>1) _mm_prefetch((const char*)H[1],_MM_HINT_T0);
}

3) быстрый корень вместо `powf(interval_len,1/dim)`

Можно заменить на exp2/log2 (меньше латентность):

cpp
bool want_term = (exp2f(log2f(interval_len)/ (float)dim) < eps) || (it==maxIter-1);

(Я оставил powf для прозрачности; профиль покажет, стоит ли менять.)

4) жёсткая фиксация компиляции

Добавьте в проект: /O2 /GL /GS- /fp:fast /arch:AVX2 /favor:INTEL64 /Gw /Gy /Gr /Qpar-. Из прагм — полезно:

cpp
#pragma float_control(precise, off)
#pragma fp_contract(on)
#pragma intrinsic(memcpy,memset)

что ещё можно заморочить дальше

SIMD-gather R-пересчёта напрямую из Interval** с AVX512 (если целевая машина позволяет). На AVX2 текущая схема «разложить в временный массив → load_ps» уже почти оптимум.
Замена кучи на однонаправленную выбиралку top-k + периодический make_heap: чуть меньше писанины в куче, но иногда хуже отклик на инъекции MPI. Имеет смысл при экстремально большом H.
Локальный квазиньютон на 1–2 шага вместо градиентного щелчка, если bestF провалился «близко к нулю»: ускоряет добивание до точной точки, но дороже по коду.

Если захотите — могу подготовить AVX512-ветку R-пересчёта и микро-allocator под кучу (вместо std::vector) для полностью безаллоцной работы.

"#include "pch.h" #define XOR_RAND(state, result_...

Pergunta

Pensamento

Optimizing performance for portability

Refining code efficiency

Optimizing code for speed

Optimizing memory and code structure

Optimizing code structure

Resposta

замените у себя соответствующие фрагменты на это

что именно поменялось (коротко по делу)

ответ на ваши конкретные вопросы

что ещё можно поэкспериментировать (с конкретными вставками)

1) гибридная начальная сетка (равномерная + ван-дер-Корпут + джиттер)

2) префетч топ-интервалов

3) быстрый корень вместо `powf(interval_len,1/dim)`

4) жёсткая фиксация компиляции

что ещё можно заморочить дальше

Compartilhar esta Q&A

Pergunta

Pensamento

Optimizing performance for portability

Refining code efficiency

Optimizing code for speed

Optimizing memory and code structure

Optimizing code structure

Resposta

замените у себя соответствующие фрагменты на это

что именно поменялось (коротко по делу)

ответ на ваши конкретные вопросы

что ещё можно поэкспериментировать (с конкретными вставками)

1) гибридная начальная сетка (равномерная + ван-дер-Корпут + джиттер)

2) префетч топ-интервалов

3) быстрый корень вместо powf(interval_len,1/dim)

4) жёсткая фиксация компиляции

что ещё можно заморочить дальше

Compartilhar esta Q&A

3) быстрый корень вместо `powf(interval_len,1/dim)`