сейчас я скину тебе длинный код и перед тобой буде...

Question

сейчас я скину тебе длинный код и перед тобой будет стоят следующая задача: 1) - это задача по оформлению - в коде ты найдешь участки кода которые написаны очень компактно плотно упакованы - нужно будет переоформить весь код в таком стиле, не меняя логику, функциональность и т д - только пробелы переносы строк пробелы в строках и т д, удалить из кода все комменты - и все ненужные лишние пробелы мешающие компактности, 2) - тебе нужно будет провести перестановку отдельных крупных функций структур и блоков кода чтобы обеспечить лучшую локальность и лучшую смысловую составляющую - то есть блоки кода которые потом дальше вместе используются или что один блок используется в другом сразу же - такие блоки должны стоять вместе - близко - чтобы обеспечить лучшую локальность данных, то же самое и с объявлением статических переменных и переменных внутри функций - можно переставлять строки кода где происходит объявление переменных или операции с памятью - то есть однотипные операции можно ставить вместе близко, или операции задействующие одинаковые вычислительные устройства процессора чтобы обеспечить лучшую локальность данных или лучшее использование цпу или кеша - в конечном счёте цель одна - ускорение - так что перемещай строки или блоки кода только если это приведёт к ускорению или потенциально может привести к ускорению - но самое важное - все изменения не должны ломать логику, тебе запрещено менять сам код, его функциональную составляющую - ты не можешь влиять на код никаким образом кроме как убирать пробелы отступы комменты, переставлять местами строки и блоки кода, главное чтобы это не отразилось на функциональности кода - чтобы все твои изменения были эквивалентными, 3) последнее требование - это внимательно проверить все структуры массивы векторы и т д на выравнивание и провести глубокий анализ какое выравнивание будет оптимально для каждой структуры с учётом того что код используется в управляемой среде где предпочтительнее выравнивание по 16 а не 64, но это касается не всех мест кода - в некоторых местах где используется векторизация или важно попадание структуры в кеш-линию - возможно оптимальнее будет выравнивание по 32 или 64, в вопросах выравнивания ты должен быть крайне осторожен - так это может существенно повлиять на скорость, так что ты должен долго тщательно взвешивать каждое решение, мне верни полный исправленный код, без пропусков, код: #include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) r = 6.28318530718f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next; const uint8_t dx; const uint8_t dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q; const uint8_t next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }
};

__declspec(align(16)) struct Slab final {
char* const base;
char* current;
char* const end;
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 64u);
Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) { *p = 0; p += 4096u; }
return slab;
});

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
: levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) {
}
};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
__forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
: x1(_x1), x2(_x2), y1(_y1), y2(_y2),
delta_y(_y2 - _y1),
ordinate_factor(-(y1 + y2) * 2.0f),
N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)),
quadratic_term((1.0f / N_factor)* delta_y* delta_y),
M((1.0f / N_factor)* fabsf(delta_y)) {
}
__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept { return a->R < b->R; }

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m); __m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0; const size_t limit = n & ~7ull;
alignas(32) float q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0; const size_t limit = n & ~7ull; alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) { const Interval* p = arr[i]; const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha); arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor)); }
}

__forceinline float fast_pow_int(float v, int n) {
float r;
switch (n) {
case 1: r = v; break;
case 2: r = v * v; break;
case 3: { float v2 = v * v; r = v2 * v; } break;
case 4: { float v2 = v * v; r = v2 * v2; } break;
case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
case 10: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
case 11: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
case 12: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
case 13: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
case 14: { float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
default: { float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
}
return r;
}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));
const int Ni = (int)(_N + 0.5f);
if (Ni == 1) return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
if (Ni == 2) return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
const float invmN = 1.0f / fast_pow_int(_m, Ni);
const float dN = fast_pow_int(fabsf(diff), Ni);
return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

struct MortonCachePerRank { std::vector<int> permCache; std::vector<uint64_t> invMaskCache; uint32_t baseSeed; };
static MortonCachePerRank g_mc;

struct MortonND final {
int dim, levels; uint64_t scale;
std::vector<float> low, high, step, invStep, baseOff;
std::vector<int> perm; std::vector<uint64_t> invMask, pextMask;
float invScaleLevel;
__forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
:dim(D), levels(L), scale(1ull << ((uint64_t)D * L)), low(lows, lows + D), high(highs, highs + D),
step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f), perm(mc.permCache.begin(), mc.permCache.begin() + D),
invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), pextMask(D, 0ull), invScaleLevel(1.0f / (float)((uint64_t)1 << L)) {
uint64_t level_mask = (1ull << L) - 1ull; for (int k = 0; k < D; ++k) invMask[k] &= level_mask;
for (int d = 0; d < dim; ++d) { float rng = high[d] - low[d]; float st = rng * invScaleLevel; step[d] = st; invStep[d] = 1.0f / st; baseOff[d] = fmaf(0.5f, st, low[d]); }
for (int d = 0; d < dim; ++d) { uint64_t m = 0ull, bitpos = (uint64_t)d; for (int b = 0; b < levels; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; } pextMask[d] = m; }
}
__forceinline void map01ToPoint(float t, float* __restrict out)const noexcept {
if (t <= 0.0f) t = 0.0f; else if (t >= 1.0f) t = 0x1.fffffep-1f;
uint64_t idx = (uint64_t)((double)t * (double)scale); if (idx >= scale) idx = scale - 1ull;
for (int d = 0; d < dim; ++d) {
int pd = perm[d]; uint64_t bits = _pext_u64(idx, pextMask[d]); if (invMask[pd]) bits ^= invMask[pd];
out[pd] = fmaf(step[pd], (float)bits, baseOff[pd]);
}
}
__forceinline uint64_t clamp_bits(int64_t v, int bits)const noexcept {
if (v < 0) v = 0; int64_t maxv = ((int64_t)1 << bits) - 1; if (v > maxv) v = maxv; return (uint64_t)v;
}
__forceinline float pointToT(const float* __restrict q)const noexcept {
const int bits = levels; uint64_t idx = 0ull;
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) * invStep[pd];
int64_t cell = (int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
uint64_t b = clamp_bits(cell, bits); if (invMask[pd]) b ^= invMask[pd]; idx |= _pdep_u64(b, pextMask[d]);
}
return ((float)idx + 0.5f) / (float)scale;
}
};

struct ManipCost final {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;
__forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {
}
__forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
const float* th = q;
const float* L = variableLen ? (q + n) : nullptr;
__declspec(align(64)) float phi[16];
__declspec(align(64)) float s_arr[16];
__declspec(align(64)) float c_arr[16];
float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f;
for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
const float Lc = 1.0f;
if (variableLen) {
for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); }
}
else {
for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); }
}
for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); float v = minTheta - ai; float u = v > 0.0f ? v : 0.0f; penC = fmaf(u, u, penC); }
float dx = x - targetX;
float dy = y - targetY;
float dist = sqrtf(fmaf(dx, dx, dy * dy));
out_x = x; out_y = y;
return dist + penC;
}
};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
uint32_t idx = (uint32_t)(norm * (float)scale);
idx = idx > scale_minus_1 ? scale_minus_1 : idx;
float sx = lenx, sy = leny;
float x1 = a, x2 = c;
uint8_t type = start;
int l = levels - 1;
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b);
const float clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx, sy = leny;
float x0 = a, y0 = c;
uint32_t idx = 0u;
uint8_t type = start;
int l = 0;
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;
const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next;
idx = (idx << 2) | inv.q;
const uint32_t dx = dd >> 1;
const uint32_t dy = dd & 1u;
x0 += dx ? sx : 0.0f;
y0 += dy ? sy : 0.0f;
++l;
}
const float scale_recip = 1.0f / (float)scale;
return fmaf((float)idx * scale_recip, lenx, a);
}

// Структура для трех интервалов (простая, без конструкторов/деструкторов)
__declspec(align(16)) struct MultiCrossMsg final {
float intervals[15]; // [s_x1, s_x2, e_x1, e_x2, Rtop] × 3 интервала
uint8_t count;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& intervals& count;
}

};

// Структура для лучшего решения (простая, без векторов)
__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX, bestY;
float bestQ[32]; // Фиксированный массив
uint8_t dim;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& bestF& bestX& bestY& bestQ& dim;
}

};

// Общая структура для многомерного MPI (совместима с двумерной логикой)
__declspec(align(16)) struct CtrlMsgND final {
uint8_t kind; // 0=term, 1=single, 2=multi, 3=best
CrossMsg xchg;
MultiCrossMsg multiXchg;
BestSolutionMsg bestSol;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& kind;
	if (kind == 1) ar& xchg;
	else if (kind == 2) ar& multiXchg;
	else if (kind == 3) ar& bestSol;
}

};

static __forceinline int generate_heuristic_seeds(const ManipCost& cost, int dim, float* __restrict S, int stride) {
const int n = cost.n; const bool VL = cost.variableLen; const float tx = cost.targetX, ty = cost.targetY;
const float phi = atan2f(ty, tx); const float rho = sqrtf(fmaf(tx, tx, ty * ty));
float len = fminf(fmaxf(rho / (float)n, 0.1f), 2.0f);
float* s0 = S + 0 * stride; for (int i = 0; i < n; ++i) s0[i] = phi / (float)n; if (VL) for (int i = 0; i < n; ++i) s0[n + i] = len;
float* s1 = S + 1 * stride; for (int i = 0; i < n; ++i) s1[i] = 0.5f * phi * ((i & 1) ? -1.0f : 1.0f); if (VL) { for (int i = 0; i < n; ++i) s1[n + i] = len * (0.8f + 0.4f * (float)i / (float)n); }
float* s2 = S + 2 * stride; const float inv = (n > 1) ? 1.0f / (float)(n - 1) : 0.0f; for (int i = 0; i < n; ++i) { float pr = (float)i * inv; s2[i] = phi * (1.0f - 0.3f * pr); }
if (VL) { for (int i = 0; i < n; ++i) { float si = sinf(1.5f * (float)i); s2[n + i] = len * (1.0f + 0.2f * si); } }
return 3;
}

// Модифицируем функцию agp_run_branch_mpi для использования априорной оценки
static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed,
std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY, float M_prior = 1e-3f) {
const int n = cost.n, dim = n + (cost.variableLen ? n : 0);
alignas(64) float q_local[32], phi[32], s_arr[32], c_arr[32], sum_s[32], sum_c[32], q_try[32];
bestQ.reserve(dim); float x = 0.0f, y = 0.0f; int no_improve = 0;
auto evalAt = [&](float t)->float {
map.map01ToPoint(t, q_local); float f = cost(q_local, x, y);
if (f < bestF * 1.05f) {
float acc = 0.0f; for (int i = 0; i < n; ++i) { acc += q_local[i]; phi[i] = acc; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f, ac = 0.0f; for (int k = n - 1; k >= 0; --k) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as += Lk * s_arr[k]; ac += Lk * c_arr[k]; sum_s[k] = as; sum_c[k] = ac; }
const float dx = x - cost.targetX, dy = y - cost.targetY; float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1e-8f; float eta = 0.1f;
for (int step = 0; step < 2; ++step) {
for (int i = 0; i < n; ++i) {
float gpen = 0.0f; float ai = fabsf(q_local[i]); float v = cost.minTheta - ai; if (v > 0.0f) gpen = -2.0f * copysignf(v, q_local[i]);
float grad = (dx * (-sum_s[i]) + dy * (sum_c[i])) / dist + gpen; q_try[i] = q_local[i] - eta * grad;
if (q_try[i] < -3.14159265359f) q_try[i] = -3.14159265359f; else if (q_try[i] > 3.14159265359f) q_try[i] = 3.14159265359f;
}
if (cost.variableLen) { for (int i = 0; i < n; ++i) { float grad = (dx * c_arr[i] + dy * s_arr[i]) / dist; float v = q_local[n + i] - eta * grad; if (v < 0.1f)v = 0.1f; else if (v > 2.0f)v = 2.0f; q_try[n + i] = v; } }
float x2, y2; float f2 = cost(q_try, x2, y2);
if (f2 < f) { memcpy(q_local, q_try, dim * sizeof(float)); f = f2; x = x2; y = y2; break; } eta = 0.5f;
}
}
if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; }
else ++no_improve;
return f;
};
float a = 0.0f, b = 1.0f; float f_a = evalAt(a), f_b = evalAt(b);
const int K = (std::min)((std::max)(2 * dim, 8), 128);
H.clear(); H.reserve((size_t)maxIter + K + 16);
float Mmax = M_prior; const int rank = g_world->rank(); const int world = g_world->size();
alignas(64) float seeds[3 * 32]; const int seedCnt = generate_heuristic_seeds(cost, dim, seeds, 32);
if (seedCnt > 0) {
const float s0 = seeds; float t_seed = map.pointToT(s0);
float t1 = fmaxf(a, t_seed - 0.002f), t2 = fminf(b, t_seed + 0.002f);
if (t2 > t1) {
alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
map.map01ToPoint(t1, q1); float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2); float f2 = cost(q2, x2, y2);
Interval* I = new Interval(t1, t2, f1, f2, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax); I->R = 0.85f; H.emplace_back(I);
if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; }
if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; }
}
}
float prev_t = a, prev_f = f_a;
for (int k = 1; k <= K; ++k) {
float t = a + (b - a) * ((float)k / (K + 1)) + (float)rank / (float)(world * (K + 1));
float f = evalAt(t); Interval I = new Interval(prev_t, t, prev_f, f, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax);
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr); prev_t = t; prev_f = f;
}
Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim); Mmax = (std::max)(Mmax, tail->M); tail->ChangeCharacteristic(r * Mmax);
H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtr);
if (seedCnt > 1) {
for (int i = 1; i < seedCnt; ++i) {
const float* s = seeds + i * 32; float t_seed = map.pointToT(s);
float t1 = fmaxf(a, t_seed - 0.001f), t2 = fminf(b, t_seed + 0.001f);
if (t2 > t1) { float f1 = evalAt(t1), f2 = evalAt(t2); Interval* J = new Interval(t1, t2, f1, f2, (float)dim); Mmax = (std::max)(Mmax, J->M); J->ChangeCharacteristic(r * Mmax); J->R = 0.90f; H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtr); }
}
}
float dmax = b - a, initial_len = dmax, thr03 = 0.3f * initial_len, inv_thr03 = 1.0f / thr03; int it = 0;
while (it < maxIter) {
if ((it % 100) == 0 && no_improve > 50 && !bestQ.empty()) {
float t_best = map.pointToT(bestQ.data());
for (int i = 0; i < 2; ++i) {
float off = (i == 0) ? 0.01f : -0.01f; float t_seed = fminf(b, fmaxf(a, t_best + off));
float f_seed = evalAt(t_seed); Interval J = new Interval(t_seed - 0.005f, t_seed + 0.005f, f_seed, f_seed, (float)dim); J->ChangeCharacteristic(r * Mmax); J->R = 0.9f;
H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtr);
}
no_improve = 0;
}
const float p = fmaf(-1.0f / initial_len, dmax, 1.0f); bool stagnation = (no_improve > 100) && (it > 270);
float r_eff = fmaxf(1.0f, r * (0.7f + 0.3f * (1.0f - p)));
std::pop_heap(H.begin(), H.end(), ComparePtr); Interval cur = H.back(); H.pop_back();
const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2; float m = r_eff * Mmax;
float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r); tNew = fminf(fmaxf(tNew, a), b);
float fNew = evalAt(tNew);
Interval* L = new Interval(x1, tNew, y1, fNew, (float)dim);
Interval* Rv = new Interval(tNew, x2, fNew, y2, (float)dim);
float Mloc = (std::max)(L->M, Rv->M);
if (adaptive) {
float len1 = tNew - x1, len2 = x2 - tNew;
if (len1 + len2 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : H) { float Ls = pI->x2 - pI->x1; if (Ls > dmax) dmax = Ls; } }
if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
if (Mloc > Mmax) { Mmax = Mloc; m = r_eff * Mmax; }
const float progress = fmaf(-dmax, inv_thr03, 1.0f), alpha = progress * progress, beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax, global_coeff = fmaf(MULT, r_eff, -MULT), GF = fmaf(beta, global_coeff, 0.0f);
L->ChangeCharacteristic(fmaf(GF, len1, L->M * alpha)); Rv->ChangeCharacteristic(fmaf(GF, len2, Rv->M * alpha));
size_t sz = H.size(); RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtr);
}
else {
if (Mloc > Mmax) {
bool big = (Mloc >= 1.15f * Mmax); Mmax = Mloc; m = r_eff * Mmax; L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
if (big) { size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); }
}
else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
}
}
else {
if (Mloc > Mmax) {
bool big = (Mloc >= 1.15f * Mmax); Mmax = Mloc; float m2 = r_eff * Mmax; L->ChangeCharacteristic(m2); Rv->ChangeCharacteristic(m2);
if (big) { size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m2); std::make_heap(H.begin(), H.end(), ComparePtr); }
}
else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
}
H.push_back(L); std::push_heap(H.begin(), H.end(), ComparePtr);
H.push_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtr);
Interval* top = H.front(); float interval_len = top->x2 - top->x1;
const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1);
if (!(it % T) || want_term) {
CtrlMsgND out; out.kind = want_term ? 0 : 2;
if (!want_term) {
uint8_t cnt = (uint8_t)((H.size() >= 3) ? 3 : H.size()); out.multiXchg.count = cnt; float* dest = out.multiXchg.intervals;
Interval* t1 = H[0]; Interval* t2 = (H.size() > 1 ? H[1] : H[0]); Interval* t3 = (H.size() > 2 ? H[2] : H[H.size() - 1]);
Interval* tops[3] = { t1,t2,t3 }; for (uint8_t i2 = 0; i2 < cnt; ++i2) { Interval* Tt = tops[i2]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; }
}
for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
if (want_term) break;
}
if (!(it % 500) && !bestQ.empty()) {
CtrlMsgND out; out.kind = 3; out.bestSol.bestF = bestF; out.bestSol.bestX = bestX; out.bestSol.bestY = bestY; out.bestSol.dim = (uint8_t)bestQ.size();
memcpy(out.bestSol.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsgND in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind == 0) { if (!rank) break; else return; }
else if (in.kind == 1) {
float sx = in.xchg.s_x1, ex = in.xchg.e_x1; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex > sx) {
alignas(64) float tmp[32]; float tx, ty;
map.map01ToPoint(sx, tmp); float y1 = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp); float y2 = cost(tmp, tx, ty);
Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(r * Mmax);
Interval* topH = H.front(); if (inj->R > 1.15f * topH->R) {
float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
inj->R = in.xchg.Rtop * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr);
}
}
}
else if (in.kind == 2) {
const MultiCrossMsg& mX = in.multiXchg;
for (uint8_t ii = 0; ii < mX.count; ++ii) {
const float* d = &mX.intervals[ii * 5]; float sx = d[0], ex = d[2]; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex > sx) {
alignas(64) float tmp[32]; float tx, ty;
map.map01ToPoint(sx, tmp); float y1 = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp); float y2 = cost(tmp, tx, ty);
Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(r * Mmax);
Interval* topH = H.front(); if (inj->R > 1.15f * topH->R) {
float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
inj->R = d[4] * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr);
}
}
}
}
else if (in.kind == 3) {
const BestSolutionMsg& bm = in.bestSol;
if (bm.bestF < bestF * 1.15f) {
alignas(64) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
float t_best = map.pointToT(tmp_q); float t1 = fmaxf(a, t_best - 0.001f), t2 = fminf(b, t_best + 0.001f);
if (t2 > t1) {
alignas(64) float tq1[32], tq2[32]; float xx1, yy1, xx2, yy2;
map.map01ToPoint(t1, tq1); float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2); float f2 = cost(tq2, xx2, yy2);
Interval* I = new Interval(t1, t2, f1, f2, (float)dim); I->ChangeCharacteristic(r * Mmax); I->R *= 0.90f; H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr);
}
if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); }
}
}
}
++it;
}
}

struct BestPacket { float bestF; int dim; float bestX; float bestY; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen, float* out_bestX, float* out_bestY, float* out_bestF) {
const int dim = nSegments + (variableLengths ? nSegments : 0);
g_mc.permCache.resize(dim); for (int i = 0; i < dim; ++i) g_mc.permCache[i] = i;
uint32_t s = g_mc.baseSeed; for (int i = dim - 1; i > 0; --i) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; uint32_t j = s % (uint32_t)(i + 1); std::swap(g_mc.permCache[i], g_mc.permCache[j]); }
g_mc.invMaskCache.resize(dim); for (int k = 0; k < dim; ++k) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; g_mc.invMaskCache[k] = (uint64_t)s; }
const float thetaMin = -3.14159265359f, thetaMax = 3.14159265359f, lenMin = 0.1f, lenMax = 2.0f;
std::vector<float> low; low.reserve(dim); std::vector<float> high; high.reserve(dim);
for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); }
if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }
ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);
const int rank = g_world->rank(), world = g_world->size();
std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;
const int levels0 = (std::min)(peanoLevels, 8), maxIter0 = (int)(maxIterPerBranch * 0.2f);
MortonND map0(dim, levels0, low.data(), high.data(), g_mc);
std::vector<Interval*> H_coarse; std::vector<float> bestQ_coarse; float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;
float M_prior = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << levels0)); if (variableLengths) M_prior += 1.41421356237f * (1.0f / (float)(1u << levels0));
agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed, H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, M_prior);
if (bestF_coarse < bestF) { bestF = bestF_coarse; bestQ = bestQ_coarse; bx = bx_coarse; by = by_coarse; }
if (levels0 < peanoLevels) {
MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
std::vector<Interval*> H_fine; std::vector<float> bestQ_fine = bestQ; float bestF_fine = bestF, bx_fine = bx, by_fine = by;
float M_prior_fine = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << peanoLevels)); if (variableLengths) M_prior_fine += 1.41421356237f * (1.0f / (float)(1u << peanoLevels));
if (!H_coarse.empty()) {
std::sort(H_coarse.begin(), H_coarse.end(), [](const Interval* a, const Interval* b) {return a->R < b->R; });
const size_t topCount = (size_t)(H_coarse.size() * 0.3f);
for (size_t i = 0; i < topCount && i < H_coarse.size(); ++i) {
const Interval* C = H_coarse[i];
alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
map1.map01ToPoint(C->x1, q1); float f1 = cost(q1, x1, y1);
map1.map01ToPoint(C->x2, q2); float f2 = cost(q2, x2, y2);
H_fine.push_back(new Interval(C->x1, C->x2, f1, f2, (float)dim));
if (f1 < bestF_fine) { bestF_fine = f1; bestQ_fine.assign(q1, q1 + dim); bx_fine = x1; by_fine = y1; }
if (f2 < bestF_fine) { bestF_fine = f2; bestQ_fine.assign(q2, q2 + dim); bx_fine = x2; by_fine = y2; }
}
std::make_heap(H_fine.begin(), H_fine.end(), ComparePtr);
}
agp_run_branch_mpi(map1, cost, maxIterPerBranch - maxIter0, r, adaptiveMode, epsilon, seed, H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, M_prior_fine);
if (bestF_fine < bestF) { bestF = bestF_fine; bestQ = bestQ_fine; bx = bx_fine; by = by_fine; }
}
BestPacket me{ bestF,dim,bx,by };
if (!rank) {
std::vector<float> winnerQ = bestQ; float winF = bestF, wx = bx, wy = by;
for (int i = 1; i < world; ++i) { BestPacket bp; g_world->recv(i, 2, bp); std::vector<float> qin; g_world->recv(i, 3, qin); if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; } }
*out_bestQLen = winnerQ.size(); out_bestQ = (float)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen)); *out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
}
else { g_world->send(0, 2, me); g_world->send(0, 3, bestQ); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
const int rank = g_world->rank();
const int world_size = g_world->size();
if (world_size == 4) {
new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3);
}
g_mc.baseSeed = fmaf(0x9E3779B9u, rank, 0x9E3779B9u);
return rank;
}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0; float st = seed, r1, r2, res = 0.0f;
while (i < 10) {
XOR_RAND(st, r1);
float xp = fmaf(-r1, 10.0f, x);
XOR_RAND(st, r1);
XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
res -= 1.0f / d;
++i;
}
return res;
}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;
while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
__declspec(align(32)) float sv[14u], cv[14u];
FABE13_SINCOS(ang, sv, cv, 14u);
float state = seed, r1, r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(state, r1);
XOR_RAND(state, r2);
res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
--j;
}
return res;
}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u];
FABE13_SINCOS(aj, sj, cj, 8u);
FABE13_SINCOS(ak, sk, ck, 8u);
--j;
float p1 = 0.0f, p2 = 0.0f;
float st = seed, r1, r2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
float s = sj[j] * sj[j];
float c = ck[k] * ck[k];
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p2 = fmaf(-r1, c, fmaf(r2, s, p2));
++k;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R;
Extr.reserve((size_t)global_iterations << 2u);
R.reserve((size_t)global_iterations << 1u);
R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
const float new_value = ShekelFunc(new_point, seed);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
std::pop_heap(R.begin(), R.end(), ComparePtr);
const Interval* pro = R.back();
const float new_x1 = pro->x1;
const float new_x2 = pro->x2;
const float len2 = new_x2 - new_point;
const float len1 = new_point - new_x1;
const float interval_len = len1 < len2 ? len1 : len2;
if (++schetchick == (int)global_iterations || interval_len < epsilon) {
Extr.emplace_back((float)schetchick);
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = (float)CoTaskMemAlloc(sizeof(float) * (out_len));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
return;
}
Interval curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f);
Interval curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f);
const float currM = curr->M > curr1->M ? curr->M : curr1->M;
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r, -MULT);
const float GF = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
const Interval top = R.front();
x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
int no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider;
const float y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition;
const float true_end = b - x_addition;
float x_Rmax_1 = true_start;
float x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
const Interval* __restrict top_ptr;
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && schetchick > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
Interval const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
injected->R = in.xchg.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
}
const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
if (!(++schetchick % T) || stagnation || want_term) {
CtrlMsg out; out.kind = want_term;
if (!out.kind) {
float s_x1, s_x2, e_x1, e_x2;
HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
}
for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); }
if (out.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len))); memcpy(out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
}
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val, new_x2_val;
HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; }
else { ++no_improve; }
std::pop_heap(R.begin(), R.end(), ComparePtr);
Interval const __restrict promejutochny_otrezok = R.back();
const float segment_x1 = promejutochny_otrezok->x1;
const float segment_x2 = promejutochny_otrezok->x2;
const float len2 = segment_x2 - new_point;
const float len1 = new_point - segment_x1;
Interval const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
Interval const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M);
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
}
}

struct RunParams { int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY; p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
const int rank = g_world->rank(); const int world = g_world->size();
for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p; g_world->recv(0, 1, p);
float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
if (q) CoTaskMemFree(q);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() { int i = 1; const int world = g_world->size(); while (i < world) { g_world->isend(i, 1, 0); ++i; } }

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }

Answer 1

cpp
#include "pch.h"

#define XOR_RAND(state,result_var) do{int s=(state);s^=s<<13;s^=s>>17;s^=s<<5;state=s;result_var=state*0x1.0p-32f;}while(0)
#define XOR_RAND_GRSH(state,result_var) do{int s=(state);s^=s<<13;s^=s>>17;s^=s<<5;state=s;result_var=fmaf(state,0x1.0p-31f,-1.0f);}while(0)
#define FABE13_COS(x,result_var) do{const float _ax_=fabsf(x);float _r_=fmodf(_ax_,6.28318530718f);if(_r_>3.14159265359f)_r_=6.28318530718f-_r_;if(_r_<1.57079632679f){const float _t2_=_r_*_r_;const float _t4_=_t2_*_t2_;result_var=fmaf(_t4_,fmaf(_t2_,-0.0013888889f,0.0416666667f),fmaf(_t2_,-0.5f,1.0f));}else{_r_=3.14159265359f-_r_;const float _t2_=_r_*_r_;const float _t4_=_t2_*_t2_;result_var=-fmaf(_t4_,fmaf(_t2_,-0.0013888889f,0.0416666667f),fmaf(_t2_,-0.5f,1.0f));}}while(0)
#define FABE13_SIN(x,result_var) do{const float _x_=(x);const float _ax_=fabsf(_x_);float _r_=fmodf(_ax_,6.28318530718f);bool _sfl_=_r_>3.14159265359f;if(_sfl_)_r_=6.28318530718f-_r_;bool _cfl_=_r_>1.57079632679f;if(_cfl_)_r_=3.14159265359f-_r_;const float _t2_=_r_*_r_;float _s=fmaf(_t2_,fmaf(_t2_,fmaf(_t2_,-0.0001984127f,0.0083333333f),-0.16666666f),1.0f)*_r_;result_var=((_x_<0.0f)^_sfl_)?-_s:_s;}while(0)
#define FABE13_SINCOS(in,sin_out,cos_out,n) do{int i=0;const int limit=(n)&~7;if((n)>=8){static __declspec(align(32)) const __m256 VEC_TWOPI=_mm256_set1_ps(6.28318530718f);static __declspec(align(32)) const __m256 VEC_PI=_mm256_set1_ps(3.14159265359f);static __declspec(align(32)) const __m256 VEC_PI_2=_mm256_set1_ps(1.57079632679f);static __declspec(align(32)) const __m256 INV_TWOPI=_mm256_set1_ps(0.15915494309189535f);static __declspec(align(32)) const __m256 BIAS=_mm256_set1_ps(12582912.0f);static __declspec(align(32)) const __m256 VEC_COS_P5=_mm256_set1_ps(-0.0013888889f);static __declspec(align(32)) const __m256 VEC_COS_P3=_mm256_set1_ps(0.0416666667f);static __declspec(align(32)) const __m256 VEC_COS_P1=_mm256_set1_ps(-0.5f);static __declspec(align(32)) const __m256 VEC_COS_P0=_mm256_set1_ps(1.0f);static __declspec(align(32)) const __m256 VEC_SIN_P5=_mm256_set1_ps(-0.0001984127f);static __declspec(align(32)) const __m256 VEC_SIN_P3=_mm256_set1_ps(0.0083333333f);static __declspec(align(32)) const __m256 VEC_SIN_P1=_mm256_set1_ps(-0.16666666f);static __declspec(align(32)) const __m256 VEC_SIN_P0=_mm256_set1_ps(1.0f);static __declspec(align(32)) const __m256 VEC_ZERO=_mm256_setzero_ps();while(i<limit){const __m256 vx=_mm256_load_ps(&(in)[i]);const __m256 vax=_mm256_andnot_ps(_mm256_set1_ps(-0.0f),vx);__m256 q=_mm256_fmadd_ps(vax,INV_TWOPI,BIAS);q=_mm256_sub_ps(q,BIAS);const __m256 r=_mm256_fnmadd_ps(VEC_TWOPI,q,vax);const __m256 r1=_mm256_min_ps(r,_mm256_sub_ps(VEC_TWOPI,r));const __m256 r2=_mm256_min_ps(r1,_mm256_sub_ps(VEC_PI,r1));const __m256 t2=_mm256_mul_ps(r2,r2);const __m256 cosv=_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,VEC_COS_P5,VEC_COS_P3),VEC_COS_P1),VEC_COS_P0);const __m256 sinv=_mm256_mul_ps(_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,VEC_SIN_P5,VEC_SIN_P3),VEC_SIN_P1),VEC_SIN_P0),r2);const __m256 cflip=_mm256_cmp_ps(r1,VEC_PI_2,_CMP_GT_OQ);const __m256 sflip=_mm256_xor_ps(_mm256_cmp_ps(vx,VEC_ZERO,_CMP_LT_OQ),_mm256_cmp_ps(r,VEC_PI,_CMP_GT_OQ));_mm256_store_ps(&(cos_out)[i],_mm256_blendv_ps(cosv,_mm256_sub_ps(VEC_ZERO,cosv),cflip));_mm256_store_ps(&(sin_out)[i],_mm256_blendv_ps(sinv,_mm256_sub_ps(VEC_ZERO,sinv),sflip));i+=8;}}while(i<(n)){const float x=(in)[i];const float ax=fabsf(x);float q=fmaf(ax,0.15915494309189535f,12582912.0f);q-=12582912.0f;float r=fmaf(-6.28318530718f,q,ax);const bool sflip=r>3.14159265359f;if(sflip)r=6.28318530718f-r;const bool cflip=r>1.57079632679f;if(cflip)r=3.14159265359f-r;const float t2=r*r;const float c=fmaf(t2,fmaf(t2,fmaf(t2,-0.0013888889f,0.0416666667f),-0.5f),1.0f);const float s=fmaf(t2,fmaf(t2,fmaf(t2,-0.0001984127f,0.0083333333f),-0.16666666f),1.0f)*r;(cos_out)[i]=cflip?-c:c;(sin_out)[i]=((x<0.0f)^sflip)?-s:s;++i;}}while(0)

enum List:uint8_t{Top=0b00u,Down=0b01u,Left=0b10u,Right=0b11u};
__declspec(align(4)) struct Step final{const uint8_t next;const uint8_t dx;const uint8_t dy;};
__declspec(align(4)) struct InvStep final{const uint8_t q;const uint8_t next;};

__declspec(align(64)) static const Step g_step_tbl[4][4]={
{{Right,0u,0u},{Top,0u,1u},{Top,1u,1u},{Left,1u,0u}},
{{Left,1u,1u},{Down,1u,0u},{Down,0u,0u},{Right,0u,1u}},
{{Down,1u,1u},{Left,0u,1u},{Left,0u,0u},{Top,1u,0u}},
{{Top,0u,0u},{Right,1u,0u},{Right,1u,1u},{Down,0u,1u}}
};
__declspec(align(64)) static const InvStep g_inv_tbl[4][4]={
{{0u,Right},{1u,Top},{3u,Left},{2u,Top}},
{{2u,Down},{3u,Right},{1u,Down},{0u,Left}},
{{2u,Left},{1u,Left},{3u,Top},{0u,Down}},
{{0u,Top},{3u,Down},{1u,Right},{2u,Right}}
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final{float s_x1,s_x2;float e_x1,e_x2;float Rtop;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&s_x1&s_x2&e_x1&e_x2&Rtop;}};
__declspec(align(16)) struct CtrlMsg final{bool kind;CrossMsg xchg;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&kind&xchg;}};

__declspec(align(16)) struct Slab final{char* const base;char* current;char* const end;__forceinline Slab(void* const memory,const size_t usable):base((char*)memory),current(base),end(base+(usable&~(size_t)63u)){}};
static tbb::enumerable_thread_specific<Slab*> tls([]()noexcept{void* memory=_aligned_malloc(16777216u,64u);Slab* slab=(Slab*)_aligned_malloc(32u,64u);new(slab) Slab(memory,16777216u);char* p=slab->base;while(p<slab->end){*p=0;p+=4096u;}return slab;});

__declspec(align(16)) struct Peano2DMap final{const int levels;const float a,b,c,d;const float lenx,leny;const float inv_lenx;const uint32_t scale;const uint8_t start;__forceinline Peano2DMap(int L,float _a,float _b,float _c,float _d,uint8_t st):levels(L),a(_a),b(_b),c(_c),d(_d),lenx(_b-_a),leny(_d-_c),inv_lenx(1.0f/(_b-_a)),scale((uint32_t)1u<<(L<<1)),start(st){}};
static Peano2DMap gActiveMap(0,0,0,0,0,0);

__forceinline bool ComparePtr(const struct Interval* a,const struct Interval* b)noexcept;

__declspec(align(64)) struct Interval final{
const float x1,x2,y1,y2,delta_y,ordinate_factor,N_factor,quadratic_term,M;float R;
__forceinline void* operator new(size_t)noexcept{Slab* s=tls.local();char* r=s->current;s->current+=64u;return r;}
__forceinline Interval(float _x1,float _x2,float _y1,float _y2,float _N)noexcept:x1(_x1),x2(_x2),y1(_y1),y2(_y2),delta_y(_y2-_y1),ordinate_factor(-(y1+y2)*2.0f),N_factor(_N==1.0f?_x2-_x1:_N==2.0f?sqrtf(_x2-_x1):powf(_x2-_x1,1.0f/_N)),quadratic_term((1.0f/N_factor)*delta_y*delta_y),M((1.0f/N_factor)*fabsf(delta_y)){}
__forceinline void ChangeCharacteristic(float _m)noexcept{R=fmaf(1.0f/_m,quadratic_term,fmaf(_m,N_factor,ordinate_factor));}
};
__forceinline bool ComparePtr(const Interval* a,const Interval* b)noexcept{return a->R<b->R;}

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr,size_t n,float m){const __m256 vm=_mm256_set1_ps(m);__m256 vinvm=_mm256_rcp_ps(vm);vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));size_t i=0;const size_t limit=n&~7ull;alignas(32) float q[8],nf[8],od[8],out[8];for(;i<limit;i+=8){for(int k=0;k<8;++k){const Interval* p=arr[i+k];q[k]=p->quadratic_term;nf[k]=p->N_factor;od[k]=p->ordinate_factor;}const __m256 vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);const __m256 t=_mm256_fmadd_ps(vm,vnf,vod);const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);_mm256_store_ps(out,res);for(int k=0;k<8;++k)arr[i+k]->R=out[k];}for(;i<n;++i)arr[i]->ChangeCharacteristic(m);}
__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr,size_t n,float GF,float alpha){const __m256 vGF=_mm256_set1_ps(GF),va=_mm256_set1_ps(alpha);size_t i=0;const size_t limit=n&~7ull;alignas(32) float ln[8],Mv[8],q[8],nf[8],od[8],out[8];for(;i<limit;i+=8){for(int k=0;k<8;++k){const Interval* p=arr[i+k];ln[k]=p->x2-p->x1;Mv[k]=p->M;q[k]=p->quadratic_term;nf[k]=p->N_factor;od[k]=p->ordinate_factor;}const __m256 vln=_mm256_load_ps(ln),vM=_mm256_load_ps(Mv),vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);const __m256 vm=_mm256_fmadd_ps(vGF,vln,_mm256_mul_ps(va,vM));__m256 vinvm=_mm256_rcp_ps(vm);vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));const __m256 t=_mm256_fmadd_ps(vm,vnf,vod);const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);_mm256_store_ps(out,res);for(int k=0;k<8;++k)arr[i+k]->R=out[k];}for(;i<n;++i){const Interval* p=arr[i];const float mi=fmaf(GF,(p->x2-p->x1),p->M*alpha);arr[i]->R=fmaf(1.0f/mi,p->quadratic_term,fmaf(mi,p->N_factor,p->ordinate_factor));}}

__forceinline float fast_pow_int(float v,int n){float r;switch(n){case 1:r=v;break;case 2:r=v*v;break;case 3:{float v2=v*v;r=v2*v;}break;case 4:{float v2=v*v;r=v2*v2;}break;case 5:{float v2=v*v;r=v2*v2*v;}break;case 6:{float v2=v*v;float v4=v2*v2;r=v4*v2;}break;case 7:{float v2=v*v;float v4=v2*v2;r=v4*v2*v;}break;case 8:{float v2=v*v;float v4=v2*v2;r=v4*v4;}break;case 9:{float v3=v*v*v;float v6=v3*v3;r=v6*v3;}break;case 10:{float v2=v*v;float v4=v2*v2;float v8=v4*v4;r=v8*v2;}break;case 11:{float v2=v*v;float v4=v2*v2;float v8=v4*v4;r=v8*v2*v;}break;case 12:{float v3=v*v*v;float v6=v3*v3;r=v6*v6;}break;case 13:{float v3=v*v*v;float v6=v3*v3;r=v6*v6*v;}break;case 14:{float v7=v*v*v*v*v*v*v;r=v7*v7;}break;default:{float v7=v*v*v*v*v*v*v;r=v7*v7*v;}break;}return r;}
__forceinline float Shag(float _m,float x1,float x2,float y1,float y2,float _N,float _r){const float diff=y2-y1;const float sign_mult=_mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u|(((*((uint32_t*)&diff))&0x80000000u)^0x80000000u))));const int Ni=(int)(_N+0.5f);if(Ni==1)return fmaf(-(1.0f/_m),diff,x1+x2)*0.5f;if(Ni==2)return fmaf(sign_mult/(_m*_m),diff*diff*_r,x1+x2)*0.5f;const float invmN=1.0f/fast_pow_int(_m,Ni);const float dN=fast_pow_int(fabsf(diff),Ni);return fmaf(sign_mult*invmN,dN*_r,x1+x2)*0.5f;}

struct MortonCachePerRank{std::vector<int> permCache;std::vector<uint64_t> invMaskCache;uint32_t baseSeed;};
static MortonCachePerRank g_mc;

struct MortonND final{
int dim,levels;uint64_t scale;std::vector<float> low,high,step,invStep,baseOff;std::vector<int> perm;std::vector<uint64_t> invMask,pextMask;float invScaleLevel;
__forceinline MortonND(int D,int L,const float* lows,const float* highs,const MortonCachePerRank& mc):dim(D),levels(L),scale(1ull<<((uint64_t)D*L)),low(lows,lows+D),high(highs,highs+D),step(D,0.0f),invStep(D,0.0f),baseOff(D,0.0f),perm(mc.permCache.begin(),mc.permCache.begin()+D),invMask(mc.invMaskCache.begin(),mc.invMaskCache.begin()+D),pextMask(D,0ull),invScaleLevel(1.0f/(float)((uint64_t)1<<L)){uint64_t level_mask=(1ull<<L)-1ull;for(int k=0;k<D;++k)invMask[k]&=level_mask;for(int d=0;d<dim;++d){float rng=high[d]-low[d];float st=rng*invScaleLevel;step[d]=st;invStep[d]=1.0f/st;baseOff[d]=fmaf(0.5f,st,low[d]);}for(int d=0;d<dim;++d){uint64_t m=0ull,bitpos=(uint64_t)d;for(int b=0;b<levels;++b){m|=1ull<<bitpos;bitpos+=(uint64_t)dim;}pextMask[d]=m;}}
__forceinline void map01ToPoint(float t,float* __restrict out)const noexcept{if(t<=0.0f)t=0.0f;else if(t>=1.0f)t=0x1.fffffep-1f;uint64_t idx=(uint64_t)((double)t*(double)scale);if(idx>=scale)idx=scale-1ull;for(int d=0;d<dim;++d){int pd=perm[d];uint64_t bits=_pext_u64(idx,pextMask[d]);if(invMask[pd])bits^=invMask[pd];out[pd]=fmaf(step[pd],(float)bits,baseOff[pd]);}}
__forceinline uint64_t clamp_bits(int64_t v,int bits)const noexcept{if(v<0)v=0;int64_t maxv=((int64_t)1<<bits)-1;if(v>maxv)v=maxv;return (uint64_t)v;}
__forceinline float pointToT(const float* __restrict q)const noexcept{const int bits=levels;uint64_t idx=0ull;for(int d=0;d<dim;++d){int pd=perm[d];float v=(q[pd]-baseOff[pd])*invStep[pd];int64_t cell=(int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(),_mm_set_ss(v),_MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));uint64_t b=clamp_bits(cell,bits);if(invMask[pd])b^=invMask[pd];idx|=_pdep_u64(b,pextMask[d]);}return((float)idx+0.5f)/(float)scale;}
};

struct ManipCost final{
int n;bool variableLen;float targetX,targetY;float minTheta;
__forceinline ManipCost(int _n,bool _variableLen,float _targetX,float _targetY,float _minTheta):n(_n),variableLen(_variableLen),targetX(_targetX),targetY(_targetY),minTheta(_minTheta){}
__forceinline float operator()(const float* __restrict q,float& out_x,float& out_y)const noexcept{const float* th=q;const float* L=variableLen?(q+n):nullptr;__declspec(align(64)) float phi[16],s_arr[16],c_arr[16];float x=0.0f,y=0.0f,phi_acc=0.0f,penC=0.0f;for(int i=0;i<n;++i){phi_acc+=th[i];phi[i]=phi_acc;}FABE13_SINCOS(phi,s_arr,c_arr,n);const float Lc=1.0f;if(variableLen){for(int i=0;i<n;++i){float Li=L[i];x=fmaf(Li,c_arr[i],x);y=fmaf(Li,s_arr[i],y);}}else{for(int i=0;i<n;++i){x=fmaf(Lc,c_arr[i],x);y=fmaf(Lc,s_arr[i],y);}}for(int i=0;i<n;++i){float ai=fabsf(th[i]);float v=minTheta-ai;float u=v>0.0f?v:0.0f;penC=fmaf(u,u,penC);}float dx=x-targetX;float dy=y-targetY;float dist=sqrtf(fmaf(dx,dx,dy*dy));out_x=x;out_y=y;return dist+penC;}
};

__forceinline void HitTest2D_analytic(float x_param,float& out_x1,float& out_x2){const float a=gActiveMap.a;const float inv_lenx=gActiveMap.inv_lenx;const uint32_t scale=gActiveMap.scale;const uint32_t scale_minus_1=scale-1u;const float lenx=gActiveMap.lenx;const float leny=gActiveMap.leny;const float c=gActiveMap.c;const uint8_t start=gActiveMap.start;const int levels=gActiveMap.levels;float norm=(x_param-a)*inv_lenx;norm=fminf(fmaxf(norm,0.0f),0x1.fffffep-1f);uint32_t idx=(uint32_t)(norm*(float)scale);idx=idx>scale_minus_1?scale_minus_1:idx;float sx=lenx,sy=leny;float x1=a,x2=c;uint8_t type=start;int l=levels-1;while(l>=0){const uint32_t q=(idx>>(l*2))&3u;const Step s=g_step_tbl[type][q];type=s.next;sx*=0.5f;sy*=0.5f;x1+=s.dx?sx:0.0f;x2+=s.dy?sy:0.0f;--l;}out_x1=x1+sx*0.5f;out_x2=x2+sy*0.5f;}
__forceinline float FindX2D_analytic(float px,float py){const float a=gActiveMap.a;const float b=gActiveMap.b;const float c=gActiveMap.c;const float d=gActiveMap.d;const float lenx=gActiveMap.lenx;const float leny=gActiveMap.leny;const uint32_t scale=gActiveMap.scale;const uint8_t start=gActiveMap.start;const int levels=gActiveMap.levels;const float clamped_px=fminf(fmaxf(px,a),b);const float clamped_py=fminf(fmaxf(py,c),d);float sx=lenx,sy=leny;float x0=a,y0=c;uint32_t idx=0u;uint8_t type=start;int l=0;while(l<levels){sx*=0.5f;sy*=0.5f;const float mx=x0+sx;const float my=y0+sy;const uint32_t tr=(uint32_t)((clamped_px>mx)&(clamped_py>my));const uint32_t tl=(uint32_t)((clamped_px<mx)&(clamped_py>my));const uint32_t dl=(uint32_t)((clamped_px<mx)&(clamped_py<my));const uint32_t none=(uint32_t)(1u^(tr|tl|dl));const uint32_t dd=(tr<<1)|tr|tl|(none<<1);const InvStep inv=g_inv_tbl[type][dd];type=inv.next;idx=(idx<<2)|inv.q;const uint32_t dx=dd>>1;const uint32_t dy=dd&1u;x0+=dx?sx:0.0f;y0+=dy?sy:0.0f;++l;}const float scale_recip=1.0f/(float)scale;return fmaf((float)idx*scale_recip,lenx,a);}

__declspec(align(16)) struct MultiCrossMsg final{float intervals[15];uint8_t count;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&intervals&count;}};
__declspec(align(16)) struct BestSolutionMsg final{float bestF;float bestX,bestY;float bestQ[32];uint8_t dim;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&bestF&bestX&bestY&bestQ&dim;}};
__declspec(align(16)) struct CtrlMsgND final{uint8_t kind;CrossMsg xchg;MultiCrossMsg multiXchg;BestSolutionMsg bestSol;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&kind;if(kind==1)ar&xchg;else if(kind==2)ar&multiXchg;else if(kind==3)ar&bestSol;}};

static __forceinline int generate_heuristic_seeds(const ManipCost& cost,int dim,float* __restrict S,int stride){const int n=cost.n;const bool VL=cost.variableLen;const float tx=cost.targetX,ty=cost.targetY;const float phi=atan2f(ty,tx);const float rho=sqrtf(fmaf(tx,tx,ty*ty));float len=fminf(fmaxf(rho/(float)n,0.1f),2.0f);float* s0=S+0*stride;for(int i=0;i<n;++i)s0[i]=phi/(float)n;if(VL)for(int i=0;i<n;++i)s0[n+i]=len;float* s1=S+1*stride;for(int i=0;i<n;++i)s1[i]=0.5f*phi*((i&1)?-1.0f:1.0f);if(VL){for(int i=0;i<n;++i)s1[n+i]=len*(0.8f+0.4f*(float)i/(float)n);}float* s2=S+2*stride;const float inv=(n>1)?1.0f/(float)(n-1):0.0f;for(int i=0;i<n;++i){float pr=(float)i*inv;s2[i]=phi*(1.0f-0.3f*pr);}if(VL){for(int i=0;i<n;++i){float si=sinf(1.5f*(float)i);s2[n+i]=len*(1.0f+0.2f*si);}}return 3;}

static __forceinline void agp_run_branch_mpi(const MortonND& map,const ManipCost& cost,int maxIter,float r,bool adaptive,float eps,unsigned seed,std::vector<Interval*>& H,std::vector<float>& bestQ,float& bestF,float& bestX,float& bestY,float M_prior=1e-3f){
const int n=cost.n,dim=n+(cost.variableLen?n:0);alignas(64) float q_local[32],phi[32],s_arr[32],c_arr[32],sum_s[32],sum_c[32],q_try[32];bestQ.reserve(dim);float x=0.0f,y=0.0f;int no_improve=0;
auto evalAt=[&](float t)->float{map.map01ToPoint(t,q_local);float f=cost(q_local,x,y);if(f<bestF*1.05f){float acc=0.0f;for(int i=0;i<n;++i){acc+=q_local[i];phi[i]=acc;}FABE13_SINCOS(phi,s_arr,c_arr,n);float as=0.0f,ac=0.0f;for(int k=n-1;k>=0;--k){const float Lk=cost.variableLen?q_local[n+k]:1.0f;as+=Lk*s_arr[k];ac+=Lk*c_arr[k];sum_s[k]=as;sum_c[k]=ac;}const float dx=x-cost.targetX,dy=y-cost.targetY;float dist=sqrtf(fmaf(dx,dx,dy*dy))+1e-8f;float eta=0.1f;for(int step=0;step<2;++step){for(int i=0;i<n;++i){float gpen=0.0f;float ai=fabsf(q_local[i]);float v=cost.minTheta-ai;if(v>0.0f)gpen=-2.0f*copysignf(v,q_local[i]);float grad=(dx*(-sum_s[i])+dy*(sum_c[i]))/dist+gpen;q_try[i]=q_local[i]-eta*grad;if(q_try[i]<-3.14159265359f)q_try[i]=-3.14159265359f;else if(q_try[i]>3.14159265359f)q_try[i]=3.14159265359f;}if(cost.variableLen){for(int i=0;i<n;++i){float grad=(dx*c_arr[i]+dy*s_arr[i])/dist;float v=q_local[n+i]-eta*grad;if(v<0.1f)v=0.1f;else if(v>2.0f)v=2.0f;q_try[n+i]=v;}}float x2,y2;float f2=cost(q_try,x2,y2);if(f2<f){memcpy(q_local,q_try,dim*sizeof(float));f=f2;x=x2;y=y2;break;}eta*=0.5f;}}if(f<bestF){bestF=f;bestQ.assign(q_local,q_local+dim);bestX=x;bestY=y;no_improve=0;}else ++no_improve;return f;};
float a=0.0f,b=1.0f;float f_a=evalAt(a),f_b=evalAt(b);const int K=(std::min)((std::max)(2*dim,8),128);H.clear();H.reserve((size_t)maxIter+K+16);float Mmax=M_prior;const int rank=g_world->rank();const int world=g_world->size();alignas(64) float seeds[3*32];const int seedCnt=generate_heuristic_seeds(cost,dim,seeds,32);
if(seedCnt>0){const float* s0=seeds;float t_seed=map.pointToT(s0);float t1=fmaxf(a,t_seed-0.002f),t2=fminf(b,t_seed+0.002f);if(t2>t1){alignas(64) float q1[32],q2[32];float x1,y1,x2,y2;map.map01ToPoint(t1,q1);float f1=cost(q1,x1,y1);map.map01ToPoint(t2,q2);float f2=cost(q2,x2,y2);Interval* I=new Interval(t1,t2,f1,f2,(float)dim);Mmax=(std::max)(Mmax,I->M);I->ChangeCharacteristic(r*Mmax);I->R*=0.85f;H.emplace_back(I);if(f1<bestF){bestF=f1;bestQ.assign(q1,q1+dim);bestX=x1;bestY=y1;}if(f2<bestF){bestF=f2;bestQ.assign(q2,q2+dim);bestX=x2;bestY=y2;}}}
float prev_t=a,prev_f=f_a;
for(int k=1;k<=K;++k){float t=a+(b-a)*((float)k/(K+1))+(float)rank/(float)(world*(K+1));float f=evalAt(t);Interval* I=new Interval(prev_t,t,prev_f,f,(float)dim);Mmax=(std::max)(Mmax,I->M);I->ChangeCharacteristic(r*Mmax);H.emplace_back(I);std::push_heap(H.begin(),H.end(),ComparePtr);prev_t=t;prev_f=f;}
Interval* tail=new Interval(prev_t,b,prev_f,f_b,(float)dim);Mmax=(std::max)(Mmax,tail->M);tail->ChangeCharacteristic(r*Mmax);H.emplace_back(tail);std::push_heap(H.begin(),H.end(),ComparePtr);
if(seedCnt>1){for(int i=1;i<seedCnt;++i){const float* s=seeds+i*32;float t_seed=map.pointToT(s);float t1=fmaxf(a,t_seed-0.001f),t2=fminf(b,t_seed+0.001f);if(t2>t1){float f1=evalAt(t1),f2=evalAt(t2);Interval* J=new Interval(t1,t2,f1,f2,(float)dim);Mmax=(std::max)(Mmax,J->M);J->ChangeCharacteristic(r*Mmax);J->R*=0.90f;H.emplace_back(J);std::push_heap(H.begin(),H.end(),ComparePtr);}}}
float dmax=b-a,initial_len=dmax,thr03=0.3f*initial_len,inv_thr03=1.0f/thr03;int it=0;
while(it<maxIter){
if((it%100)==0&&no_improve>50&&!bestQ.empty()){float t_best=map.pointToT(bestQ.data());for(int i=0;i<2;++i){float off=(i==0)?0.01f:-0.01f;float t_seed=fminf(b,fmaxf(a,t_best+off));float f_seed=evalAt(t_seed);Interval* J=new Interval(t_seed-0.005f,t_seed+0.005f,f_seed,f_seed,(float)dim);J->ChangeCharacteristic(r*Mmax);J->R*=0.9f;H.emplace_back(J);std::push_heap(H.begin(),H.end(),ComparePtr);}no_improve=0;}
const float p=fmaf(-1.0f/initial_len,dmax,1.0f);bool stagnation=(no_improve>100)&&(it>270);float r_eff=fmaxf(1.0f,r*(0.7f+0.3f*(1.0f-p)));std::pop_heap(H.begin(),H.end(),ComparePtr);Interval* cur=H.back();H.pop_back();const float x1=cur->x1,x2=cur->x2,y1=cur->y1,y2=cur->y2;float m=r_eff*Mmax;float tNew=Shag(m,x1,x2,y1,y2,(float)dim,r);tNew=fminf(fmaxf(tNew,a),b);float fNew=evalAt(tNew);
Interval* L=new Interval(x1,tNew,y1,fNew,(float)dim);Interval* Rv=new Interval(tNew,x2,fNew,y2,(float)dim);float Mloc=(std::max)(L->M,Rv->M);
if(adaptive){float len1=tNew-x1,len2=x2-tNew;if(len1+len2==dmax){dmax=(std::max)(len1,len2);for(auto pI:H){float Ls=pI->x2-pI->x1;if(Ls>dmax)dmax=Ls;}}if((thr03>dmax&&!(it%3))||(10.0f*dmax<initial_len)){if(Mloc>Mmax){Mmax=Mloc;m=r_eff*Mmax;}const float progress=fmaf(-dmax,inv_thr03,1.0f),alpha=progress*progress,beta=fmaf(-alpha,1.0f,2.0f);const float MULT=(1.0f/dmax)*Mmax,global_coeff=fmaf(MULT,r_eff,-MULT),GF=fmaf(beta,global_coeff,0.0f);L->ChangeCharacteristic(fmaf(GF,len1,L->M*alpha));Rv->ChangeCharacteristic(fmaf(GF,len2,Rv->M*alpha));size_t sz=H.size();RecomputeR_AffineM_AVX2(H.data(),sz,GF,alpha);std::make_heap(H.begin(),H.end(),ComparePtr);}else{if(Mloc>Mmax){bool big=(Mloc>=1.15f*Mmax);Mmax=Mloc;m=r_eff*Mmax;L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);if(big){size_t sz=H.size();RecomputeR_ConstM_AVX2(H.data(),sz,m);std::make_heap(H.begin(),H.end(),ComparePtr);}}else{L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);}}}
else{if(Mloc>Mmax){bool big=(Mloc>=1.15f*Mmax);Mmax=Mloc;float m2=r_eff*Mmax;L->ChangeCharacteristic(m2);Rv->ChangeCharacteristic(m2);if(big){size_t sz=H.size();RecomputeR_ConstM_AVX2(H.data(),sz,m2);std::make_heap(H.begin(),H.end(),ComparePtr);}}else{L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);}}
H.push_back(L);std::push_heap(H.begin(),H.end(),ComparePtr);H.push_back(Rv);std::push_heap(H.begin(),H.end(),ComparePtr);Interval* top=H.front();float interval_len=top->x2-top->x1;const int T=(int)fmaf(-expm1f(p),264.0f,277.0f);bool want_term=(powf(interval_len,1.0f/(float)dim)<eps)||(it==maxIter-1);
if(!(it%T)||want_term){CtrlMsgND out;out.kind=want_term?0:2;if(!want_term){uint8_t cnt=(uint8_t)((H.size()>=3)?3:H.size());out.multiXchg.count=cnt;float* dest=out.multiXchg.intervals;Interval* t1=H[0];Interval* t2=(H.size()>1?H[1]:H[0]);Interval* t3=(H.size()>2?H[2]:H[H.size()-1]);Interval* tops[3]={t1,t2,t3};for(uint8_t i2=0;i2<cnt;++i2){Interval* Tt=tops[i2];dest[0]=Tt->x1;dest[1]=0.0f;dest[2]=Tt->x2;dest[3]=0.0f;dest[4]=Tt->R;dest+=5;}}for(int i2=0;i2<world;++i2)if(i2!=rank)g_world->isend(i2,0,out);if(want_term)break;}
if(!(it%500)&&!bestQ.empty()){CtrlMsgND out;out.kind=3;out.bestSol.bestF=bestF;out.bestSol.bestX=bestX;out.bestSol.bestY=bestY;out.bestSol.dim=(uint8_t)bestQ.size();memcpy(out.bestSol.bestQ,bestQ.data(),bestQ.size()*sizeof(float));for(int i2=0;i2<world;++i2)if(i2!=rank)g_world->isend(i2,0,out);}
while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsgND in;g_world->recv(boost::mpi::any_source,0,in);
if(in.kind==0){if(!rank)break;else return;}
else if(in.kind==1){float sx=in.xchg.s_x1,ex=in.xchg.e_x1;if(sx<0.0f)sx=0.0f;if(ex>1.0f)ex=1.0f;if(ex>sx){alignas(64) float tmp[32];float tx,ty;map.map01ToPoint(sx,tmp);float y1=cost(tmp,tx,ty);map.map01ToPoint(ex,tmp);float y2=cost(tmp,tx,ty);Interval* inj=new Interval(sx,ex,y1,y2,(float)dim);inj->ChangeCharacteristic(r*Mmax);Interval* topH=H.front();if(inj->R>1.15f*topH->R){float p=fmaf(-1.0f/initial_len,dmax,1.0f);float k=(no_improve>100&&it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);inj->R=in.xchg.Rtop*k;H.emplace_back(inj);std::push_heap(H.begin(),H.end(),ComparePtr);}}}
else if(in.kind==2){const MultiCrossMsg& mX=in.multiXchg;for(uint8_t ii=0;ii<mX.count;++ii){const float* d=&mX.intervals[ii*5];float sx=d[0],ex=d[2];if(sx<0.0f)sx=0.0f;if(ex>1.0f)ex=1.0f;if(ex>sx){alignas(64) float tmp[32];float tx,ty;map.map01ToPoint(sx,tmp);float y1=cost(tmp,tx,ty);map.map01ToPoint(ex,tmp);float y2=cost(tmp,tx,ty);Interval* inj=new Interval(sx,ex,y1,y2,(float)dim);inj->ChangeCharacteristic(r*Mmax);Interval* topH=H.front();if(inj->R>1.15f*topH->R){float p=fmaf(-1.0f/initial_len,dmax,1.0f);float k=(no_improve>100&&it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);inj->R=d[4]*k;H.emplace_back(inj);std::push_heap(H.begin(),H.end(),ComparePtr);}}}}
else if(in.kind==3){const BestSolutionMsg& bm=in.bestSol;if(bm.bestF<bestF*1.15f){alignas(64) float tmp_q[32];memcpy(tmp_q,bm.bestQ,bm.dim*sizeof(float));float t_best=map.pointToT(tmp_q);float t1=fmaxf(a,t_best-0.001f),t2=fminf(b,t_best+0.001f);if(t2>t1){alignas(64) float tq1[32],tq2[32];float xx1,yy1,xx2,yy2;map.map01ToPoint(t1,tq1);float f1=cost(tq1,xx1,yy1);map.map01ToPoint(t2,tq2);float f2=cost(tq2,xx2,yy2);Interval* I=new Interval(t1,t2,f1,f2,(float)dim);I->ChangeCharacteristic(r*Mmax);I->R*=0.90f;H.emplace_back(I);std::push_heap(H.begin(),H.end(),ComparePtr);}if(bm.bestF<bestF){bestF=bm.bestF;bestX=bm.bestX;bestY=bm.bestY;bestQ.assign(bm.bestQ,bm.bestQ+bm.dim);}}}}
++it;}
}

struct BestPacket{float bestF;int dim;float bestX;float bestY;template<typename Archive> void serialize(Archive& ar,const unsigned int){ar&bestF&dim&bestX&bestY;}};

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments,bool variableLengths,float minTheta,float targetX,float targetY,int peanoLevels,int maxIterPerBranch,float r,bool adaptiveMode,float epsilon,unsigned int seed,float** out_bestQ,size_t* out_bestQLen,float* out_bestX,float* out_bestY,float* out_bestF){
const int dim=nSegments+(variableLengths?nSegments:0);g_mc.permCache.resize(dim);for(int i=0;i<dim;++i)g_mc.permCache[i]=i;uint32_t s=g_mc.baseSeed;for(int i=dim-1;i>0;--i){s^=s<<13;s^=s>>17;s^=s<<5;uint32_t j=s%(uint32_t)(i+1);std::swap(g_mc.permCache[i],g_mc.permCache[j]);}g_mc.invMaskCache.resize(dim);for(int k=0;k<dim;++k){s^=s<<13;s^=s>>17;s^=s<<5;g_mc.invMaskCache[k]=(uint64_t)s;}
const float thetaMin=-3.14159265359f,thetaMax=3.14159265359f,lenMin=0.1f,lenMax=2.0f;std::vector<float> low;low.reserve(dim);std::vector<float> high;high.reserve(dim);for(int i=0;i<nSegments;++i){low.push_back(thetaMin);high.push_back(thetaMax);}if(variableLengths){for(int i=0;i<nSegments;++i){low.push_back(lenMin);high.push_back(lenMax);}}
ManipCost cost(nSegments,variableLengths,targetX,targetY,minTheta);const int rank=g_world->rank(),world=g_world->size();std::vector<float> bestQ;float bestF=FLT_MAX,bx=0.0f,by=0.0f;
const int levels0=(std::min)(peanoLevels,8),maxIter0=(int)(maxIterPerBranch*0.2f);MortonND map0(dim,levels0,low.data(),high.data(),g_mc);std::vector<Interval*> H_coarse;std::vector<float> bestQ_coarse;float bestF_coarse=FLT_MAX,bx_coarse=0.0f,by_coarse=0.0f;
float M_prior=(variableLengths?2.0f*nSegments:2.0f*nSegments)*(1.0f/(float)(1u<<levels0));if(variableLengths)M_prior+=1.41421356237f*(1.0f/(float)(1u<<levels0));
agp_run_branch_mpi(map0,cost,maxIter0,r,adaptiveMode,epsilon,seed,H_coarse,bestQ_coarse,bestF_coarse,bx_coarse,by_coarse,M_prior);
if(bestF_coarse<bestF){bestF=bestF_coarse;bestQ=bestQ_coarse;bx=bx_coarse;by=by_coarse;}
if(levels0<peanoLevels){MortonND map1(dim,peanoLevels,low.data(),high.data(),g_mc);std::vector<Interval*> H_fine;std::vector<float> bestQ_fine=bestQ;float bestF_fine=bestF,bx_fine=bx,by_fine=by;float M_prior_fine=(variableLengths?2.0f*nSegments:2.0f*nSegments)*(1.0f/(float)(1u<<peanoLevels));if(variableLengths)M_prior_fine+=1.41421356237f*(1.0f/(float)(1u<<peanoLevels));
if(!H_coarse.empty()){std::sort(H_coarse.begin(),H_coarse.end(),[](const Interval* a,const Interval* b){return a->R<b->R;});const size_t topCount=(size_t)(H_coarse.size()*0.3f);for(size_t i=0;i<topCount&&i<H_coarse.size();++i){const Interval* C=H_coarse[i];alignas(64) float q1[32],q2[32];float x1,y1,x2,y2;map1.map01ToPoint(C->x1,q1);float f1=cost(q1,x1,y1);map1.map01ToPoint(C->x2,q2);float f2=cost(q2,x2,y2);H_fine.push_back(new Interval(C->x1,C->x2,f1,f2,(float)dim));if(f1<bestF_fine){bestF_fine=f1;bestQ_fine.assign(q1,q1+dim);bx_fine=x1;by_fine=y1;}if(f2<bestF_fine){bestF_fine=f2;bestQ_fine.assign(q2,q2+dim);bx_fine=x2;by_fine=y2;}}std::make_heap(H_fine.begin(),H_fine.end(),ComparePtr);}
agp_run_branch_mpi(map1,cost,maxIterPerBranch-maxIter0,r,adaptiveMode,epsilon,seed,H_fine,bestQ_fine,bestF_fine,bx_fine,by_fine,M_prior_fine);
if(bestF_fine<bestF){bestF=bestF_fine;bestQ=bestQ_fine;bx=bx_fine;by=by_fine;}}
BestPacket me{bestF,dim,bx,by};
if(!rank){std::vector<float> winnerQ=bestQ;float winF=bestF,wx=bx,wy=by;for(int i=1;i<world;++i){BestPacket bp;g_world->recv(i,2,bp);std::vector<float> qin;g_world->recv(i,3,qin);if(bp.bestF<winF){winF=bp.bestF;wx=bp.bestX;wy=bp.bestY;winnerQ=qin;}}*out_bestQLen=winnerQ.size();*out_bestQ=(float*)CoTaskMemAlloc(sizeof(float)*(*out_bestQLen));memcpy(*out_bestQ,winnerQ.data(),sizeof(float)*(*out_bestQLen));*out_bestX=wx;*out_bestY=wy;*out_bestF=winF;}
else{g_world->send(0,2,me);g_world->send(0,3,bestQ);}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel,float a,float b,float c,float d){
g_env=new boost::mpi::environment();g_world=new boost::mpi::communicator();_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);const int rank=g_world->rank();const int world_size=g_world->size();if(world_size==4){new(&gActiveMap) Peano2DMap(peanoLevel,a,b,c,d,rank&3);}g_mc.baseSeed=fmaf(0x9E3779B9u,rank,0x9E3779B9u);return rank;
}

__forceinline float ShekelFunc(float x,float seed){int i=0;float st=seed,r1,r2,res=0.0f;while(i<10){XOR_RAND(st,r1);float xp=fmaf(-r1,10.0f,x);XOR_RAND(st,r1);XOR_RAND(st,r2);float d=fmaf(fmaf(r1,20.0f,5.0f),xp*xp,fmaf(r2,0.2f,1.0f));d=copysignf(fmaxf(fabsf(d),FLT_MIN),d);res-=1.0f/d;++i;}return res;}
__forceinline float RastriginFunc(float x1,float x2){const float t=fmaf(x1,x1,x2*x2);float c1,c2;FABE13_COS(6.28318530717958647692f*x1,c1);FABE13_COS(6.28318530717958647692f*x2,c2);return (t-fmaf(c1+c2,10.0f,-14.6f))*fmaf(-t,0.25f,18.42f);}
__forceinline float HillFunc(float x,float seed){int j=0;__declspec(align(32)) float ang[14u];float st=6.28318530717958647692f*x;while(j<14){ang[j]=st*(float)(j+1);++j;}__declspec(align(32)) float sv[14u],cv[14u];FABE13_SINCOS(ang,sv,cv,14u);float state=seed,r1,r2;XOR_RAND(state,r1);float res=fmaf(r1,2.0f,-1.1f);--j;while(j>=0){XOR_RAND(state,r1);XOR_RAND(state,r2);res+=fmaf(fmaf(r1,2.0f,-1.1f),sv[j],fmaf(r2,2.0f,-1.1f)*cv[j]);--j;}return res;}
__forceinline float GrishaginFunc(float x1,float x2,float seed){int j=0;__declspec(align(32)) float aj[8u],ak[8u];while(j<8){float pj=3.14159265358979323846f*(float)(j+1);aj[j]=pj*x1;ak[j]=pj*x2;++j;}__declspec(align(32)) float sj[8u],cj[8u],sk[8u],ck[8u];FABE13_SINCOS(aj,sj,cj,8u);FABE13_SINCOS(ak,sk,ck,8u);--j;float p1=0.0f,p2=0.0f;float st=seed,r1,r2;while(j>=0){size_t k=0u;while(k<8u){float s=sj[j]*sj[j];float c=ck[k]*ck[k];XOR_RAND_GRSH(st,r1);XOR_RAND_GRSH(st,r2);p1=fmaf(r1,s,fmaf(r2,c,p1));XOR_RAND_GRSH(st,r1);XOR_RAND_GRSH(st,r2);p2=fmaf(-r1,c,fmaf(r2,s,p2));++k;}--j;}return -sqrtf(fmaf(p1,p1,p2*p2));}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations,float a,float b,float r,bool mode,float epsilon,float seed,float** out_data,size_t* out_len){
Slab* slab=tls.local();slab->current=slab->base;int schetchick=0;const float initial_length=b-a;float dmax=initial_length;const float threshold_03=0.3f*initial_length;const float inv_threshold_03=1.0f/threshold_03;const float start_val=ShekelFunc(a,seed);float best_f=ShekelFunc(b,seed);float x_Rmax_1=a;float x_Rmax_2=b;float y_Rmax_1=start_val;float y_Rmax_2=best_f;std::vector<float,boost::alignment::aligned_allocator<float,16u>> Extr;std::vector<Interval*,boost::alignment::aligned_allocator<Interval*,64u>> R;Extr.reserve((size_t)global_iterations<<2u);R.reserve((size_t)global_iterations<<1u);R.emplace_back(new Interval(a,b,start_val,best_f,1.0f));float Mmax=R.front()->M;float m=r*Mmax;
while(true){
const float new_point=Shag(m,x_Rmax_1,x_Rmax_2,y_Rmax_1,y_Rmax_2,1.0f,r);const float new_value=ShekelFunc(new_point,seed);if(new_value<best_f){best_f=new_value;Extr.emplace_back(best_f);Extr.emplace_back(new_point);}
std::pop_heap(R.begin(),R.end(),ComparePtr);const Interval* pro=R.back();const float new_x1=pro->x1;const float new_x2=pro->x2;const float len2=new_x2-new_point;const float len1=new_point-new_x1;const float interval_len=len1<len2?len1:len2;
if(++schetchick==(int)global_iterations||interval_len<epsilon){Extr.emplace_back((float)schetchick);Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=(float*)CoTaskMemAlloc(sizeof(float)*(*out_len));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));return;}
Interval* curr=new Interval(new_x1,new_point,pro->y1,new_value,1.0f);Interval* curr1=new Interval(new_point,new_x2,new_value,pro->y2,1.0f);const float currM=curr->M>curr1->M?curr->M:curr1->M;const size_t r_size=R.size();
if(mode){if(len2+len1==dmax){dmax=len2>len1?len2:len1;for(auto p:R){float L=p->x2-p->x1;if(L>dmax)dmax=L;}}if(threshold_03>dmax&&!(schetchick%3)||10.0f*dmax<initial_length){if(currM>Mmax){Mmax=currM;m=r*Mmax;}const float progress=fmaf(-inv_threshold_03,dmax,1.0f);const float alpha=progress*progress;const float betta=2.0f-alpha;const float MULT=(1.0f/dmax)*Mmax;const float global_coeff=fmaf(MULT,r,-MULT);const float GF=betta*global_coeff;curr->ChangeCharacteristic(fmaf(GF,len1,curr->M*alpha));curr1->ChangeCharacteristic(fmaf(GF,len2,curr1->M*alpha));RecomputeR_AffineM_AVX2(R.data(),r_size,GF,alpha);std::make_heap(R.begin(),R.end(),ComparePtr);}else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}}
else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}
R.back()=curr;std::push_heap(R.begin(),R.end(),ComparePtr);R.emplace_back(curr1);std::push_heap(R.begin(),R.end(),ComparePtr);const Interval* top=R.front();x_Rmax_1=top->x1;x_Rmax_2=top->x2;y_Rmax_1=top->y1;y_Rmax_2=top->y2;}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(const float N,const float global_iterations,const float a,const float b,const float c,const float d,const float r,const bool mode,const float epsilon,const float seed,float** const __restrict out_data,size_t* const __restrict out_len)noexcept{
Slab* const __restrict slab=tls.local();slab->current=slab->base;int schetchick=0;int no_improve=0;const int rank=g_world->rank();const int world_size=g_world->size();while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsg dummy;g_world->recv(boost::mpi::any_source,0,dummy);}
const float inv_divider=ldexpf(1.0f,-((gActiveMap.levels<<1)+1));const float x_addition=(b-a)*inv_divider;const float y_addition=(d-c)*inv_divider;const float true_start=a+x_addition;const float true_end=b-x_addition;float x_Rmax_1=true_start;float x_Rmax_2=true_end;const float initial_length=x_Rmax_2-x_Rmax_1;float dmax=initial_length;const float threshold_03=0.3f*initial_length;const float inv_threshold_03=1.0f/threshold_03;const float start_val=rank%3?RastriginFunc(true_end,d-y_addition):RastriginFunc(true_start,c+y_addition);float best_f=rank%2?RastriginFunc(true_start,d-y_addition):RastriginFunc(true_end,c+y_addition);float y_Rmax_1=start_val;float y_Rmax_2=best_f;
std::vector<float,boost::alignment::aligned_allocator<float,16u>> Extr;std::vector<Interval* __restrict,boost::alignment::aligned_allocator<Interval* __restrict,64u>> R;Extr.clear();Extr.reserve(static_cast<size_t>(global_iterations)<<2u);R.clear();R.reserve(static_cast<size_t>(global_iterations)<<1u);R.emplace_back(new Interval(true_start,true_end,start_val,best_f,2.0f));const Interval* __restrict top_ptr;float Mmax=R.front()->M;float m=r*Mmax;
while(true){
const float interval_len=x_Rmax_2-x_Rmax_1;const bool stagnation=no_improve>100&&schetchick>270;const float p=fmaf(-1.0f/initial_length,dmax,1.0f);
while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsg in;g_world->recv(boost::mpi::any_source,0,in);if(in.kind){if(!rank){Extr.emplace_back(static_cast<float>(schetchick));Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float)*(*out_len)));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));}return;}const float sx=FindX2D_analytic(in.xchg.s_x1,in.xchg.s_x2);const float ex=FindX2D_analytic(in.xchg.e_x1,in.xchg.e_x2);Interval* const __restrict injected=new Interval(sx,ex,RastriginFunc(in.xchg.s_x1,in.xchg.s_x2),RastriginFunc(in.xchg.e_x1,in.xchg.e_x2),2.0f);injected->ChangeCharacteristic(m);if(injected->R>1.15f*top_ptr->R){const float k=stagnation?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);injected->R=in.xchg.Rtop*k;R.emplace_back(injected);std::push_heap(R.begin(),R.end(),ComparePtr);}}
const int T=static_cast<int>(fmaf(-expm1f(p),264.0f,277.0f));const bool want_term=interval_len<epsilon||schetchick==static_cast<int>(global_iterations);
if(!(++schetchick%T)||stagnation||want_term){CtrlMsg out;out.kind=want_term;if(!out.kind){float s_x1,s_x2,e_x1,e_x2;HitTest2D_analytic(top_ptr->x1,s_x1,s_x2);HitTest2D_analytic(top_ptr->x2,e_x1,e_x2);out.xchg=CrossMsg{s_x1,s_x2,e_x1,e_x2,top_ptr->R};}for(int i=0;i<world_size;++i){if(i!=rank)g_world->isend(i,0,out);}if(out.kind){if(!rank){Extr.emplace_back(static_cast<float>(schetchick));Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float)*(*out_len)));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));}return;}}
const float new_point=Shag(m,x_Rmax_1,x_Rmax_2,y_Rmax_1,y_Rmax_2,2.0f,r);float new_x1_val,new_x2_val;HitTest2D_analytic(new_point,new_x1_val,new_x2_val);const float new_value=RastriginFunc(new_x1_val,new_x2_val);if(new_value<best_f){best_f=new_value;Extr.emplace_back(best_f);Extr.emplace_back(new_x1_val);Extr.emplace_back(new_x2_val);no_improve=0;}else{++no_improve;}
std::pop_heap(R.begin(),R.end(),ComparePtr);Interval* const __restrict promejutochny_otrezok=R.back();const float segment_x1=promejutochny_otrezok->x1;const float segment_x2=promejutochny_otrezok->x2;const float len2=segment_x2-new_point;const float len1=new_point-segment_x1;Interval* const __restrict curr=new Interval(segment_x1,new_point,promejutochny_otrezok->y1,new_value,2.0f);Interval* const __restrict curr1=new Interval(new_point,segment_x2,new_value,promejutochny_otrezok->y2,2.0f);const float currM=(std::max)(curr->M,curr1->M);const size_t r_size=R.size();
if(mode){if(len2+len1==dmax){dmax=(std::max)(len1,len2);for(auto pI:R){float L=pI->x2-pI->x1;if(L>dmax)dmax=L;}}if(threshold_03>dmax&&!(schetchick%3)||10.0f*dmax<initial_length){if(currM>Mmax){Mmax=currM;m=r*Mmax;}const float progress=fmaf(-inv_threshold_03,dmax,1.0f);const float alpha=progress*progress;const float betta=2.0f-alpha;const float MULTIPLIER=(1.0f/dmax)*Mmax;const float global_coeff=fmaf(MULTIPLIER,r,-MULTIPLIER);const float GLOBAL_FACTOR=betta*global_coeff;curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR,len1,curr->M*alpha));curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR,len2,curr1->M*alpha));RecomputeR_AffineM_AVX2(R.data(),r_size,GLOBAL_FACTOR,alpha);std::make_heap(R.begin(),R.end(),ComparePtr);}else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}}
else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}
R.back()=curr;std::push_heap(R.begin(),R.end(),ComparePtr);R.emplace_back(curr1);std::push_heap(R.begin(),R.end(),ComparePtr);top_ptr=R.front();x_Rmax_1=top_ptr->x1;x_Rmax_2=top_ptr->x2;y_Rmax_1=top_ptr->y1;y_Rmax_2=top_ptr->y2;}
}

struct RunParams{int nSegments;uint8_t varLen;float minTheta;float tx,ty;int levels,maxIter;float r;uint8_t adaptive;float eps;unsigned int seed;template<typename Archive> void serialize(Archive& ar,const unsigned int){ar&nSegments&varLen&minTheta&tx&ty&levels&maxIter&r&adaptive&eps&seed;}};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments,bool variableLengths,float minTheta,float targetX,float targetY,int peanoLevels,int maxIterPerBranch,float r,bool adaptiveMode,float epsilon,unsigned int seed){
RunParams p;p.nSegments=nSegments;p.varLen=(uint8_t)variableLengths;p.minTheta=minTheta;p.tx=targetX;p.ty=targetY;p.levels=peanoLevels;p.maxIter=maxIterPerBranch;p.r=r;p.adaptive=(uint8_t)adaptiveMode;p.eps=epsilon;p.seed=seed;const int rank=g_world->rank();const int world=g_world->size();for(int i=1;i<world;++i){if(i!=rank)g_world->isend(i,1,p);}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun(){
while(true){if(g_world->iprobe(0,1)){RunParams p;g_world->recv(0,1,p);float* q=nullptr;size_t qlen=0;float bx=0,by=0,bf=0;AGP_Manip2D(p.nSegments,(bool)p.varLen,p.minTheta,p.tx,p.ty,p.levels,p.maxIter,p.r,(bool)p.adaptive,p.eps,p.seed,&q,&qlen,&bx,&by,&bf);if(q)CoTaskMemFree(q);}Sleep(0);}
}

extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartWorkers(){int i=1;const int world=g_world->size();while(i<world){g_world->isend(i,1,0);++i;}}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p){CoTaskMemFree(p);}

сейчас я скину тебе длинный код и перед тобой буде...

سؤال

الفكرة

Refactoring user code

Optimizing struct alignment and layout

Finalizing alignment choices

Adjusting code layout

Reordering and formatting code

الإجابة

مشاركة هذا السؤال والجواب