сейчас я скину тебе длинный код и перед тобой буде...

تم الإنشاء في: ٢ نوفمبر ٢٠٢٥

تمت الإجابة باستخدام GPT-5 Thinking بواسطة Chat01

سؤال

сейчас я скину тебе длинный код и перед тобой будет стоят следующая задача: 1) - это задача по оформлению - в коде ты найдешь участки кода которые написаны очень компактно плотно упакованы - нужно будет переоформить весь код в таком стиле, не меняя логику, функциональность и т д - только пробелы переносы строк пробелы в строках и т д, удалить из кода все комменты - и все ненужные лишние пробелы мешающие компактности, 2) - тебе нужно будет провести перестановку отдельных крупных функций структур и блоков кода чтобы обеспечить лучшую локальность и лучшую смысловую составляющую - то есть блоки кода которые потом дальше вместе используются или что один блок используется в другом сразу же - такие блоки должны стоять вместе - близко - чтобы обеспечить лучшую локальность данных, то же самое и с объявлением статических переменных и переменных внутри функций - можно переставлять строки кода где происходит объявление переменных или операции с памятью - то есть однотипные операции можно ставить вместе близко, или операции задействующие одинаковые вычислительные устройства процессора чтобы обеспечить лучшую локальность данных или лучшее использование цпу или кеша - в конечном счёте цель одна - ускорение - так что перемещай строки или блоки кода только если это приведёт к ускорению или потенциально может привести к ускорению - но самое важное - все изменения не должны ломать логику, тебе запрещено менять сам код, его функциональную составляющую - ты не можешь влиять на код никаким образом кроме как убирать пробелы отступы комменты, переставлять местами строки и блоки кода, главное чтобы это не отразилось на функциональности кода - чтобы все твои изменения были эквивалентными, 3) последнее требование - это внимательно проверить все структуры массивы векторы и т д на выравнивание и провести глубокий анализ какое выравнивание будет оптимально для каждой структуры с учётом того что код используется в управляемой среде где предпочтительнее выравнивание по 16 а не 64, но это касается не всех мест кода - в некоторых местах где используется векторизация или важно попадание структуры в кеш-линию - возможно оптимальнее будет выравнивание по 32 или 64, в вопросах выравнивания ты должен быть крайне осторожен - так это может существенно повлиять на скорость, так что ты должен долго тщательно взвешивать каждое решение, мне верни полный исправленный код, без пропусков, код: #include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) r = 6.28318530718f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next; const uint8_t dx; const uint8_t dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q; const uint8_t next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }
};

__declspec(align(16)) struct Slab final {
char* const base;
char* current;
char* const end;
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 64u);
Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) { *p = 0; p += 4096u; }
return slab;
});

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
: levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) {
}
};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
__forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
: x1(_x1), x2(_x2), y1(_y1), y2(_y2),
delta_y(_y2 - _y1),
ordinate_factor(-(y1 + y2) * 2.0f),
N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)),
quadratic_term((1.0f / N_factor)* delta_y* delta_y),
M((1.0f / N_factor)* fabsf(delta_y)) {
}
__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept { return a->R < b->R; }

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m); __m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0; const size_t limit = n & ~7ull;
alignas(32) float q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0; const size_t limit = n & ~7ull; alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) { const Interval* p = arr[i]; const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha); arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor)); }
}

__forceinline float fast_pow_int(float v, int n) {
float r;
switch (n) {
case 1: r = v; break;
case 2: r = v * v; break;
case 3: { float v2 = v * v; r = v2 * v; } break;
case 4: { float v2 = v * v; r = v2 * v2; } break;
case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
case 10: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
case 11: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
case 12: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
case 13: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
case 14: { float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
default: { float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
}
return r;
}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));
const int Ni = (int)(_N + 0.5f);
if (Ni == 1) return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
if (Ni == 2) return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
const float invmN = 1.0f / fast_pow_int(_m, Ni);
const float dN = fast_pow_int(fabsf(diff), Ni);
return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

struct MortonCachePerRank { std::vector<int> permCache; std::vector<uint64_t> invMaskCache; uint32_t baseSeed; };
static MortonCachePerRank g_mc;

struct MortonND final {
int dim, levels; uint64_t scale;
std::vector<float> low, high, step, invStep, baseOff;
std::vector<int> perm; std::vector<uint64_t> invMask, pextMask;
float invScaleLevel;
__forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
:dim(D), levels(L), scale(1ull << ((uint64_t)D * L)), low(lows, lows + D), high(highs, highs + D),
step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f), perm(mc.permCache.begin(), mc.permCache.begin() + D),
invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), pextMask(D, 0ull), invScaleLevel(1.0f / (float)((uint64_t)1 << L)) {
uint64_t level_mask = (1ull << L) - 1ull; for (int k = 0; k < D; ++k) invMask[k] &= level_mask;
for (int d = 0; d < dim; ++d) { float rng = high[d] - low[d]; float st = rng * invScaleLevel; step[d] = st; invStep[d] = 1.0f / st; baseOff[d] = fmaf(0.5f, st, low[d]); }
for (int d = 0; d < dim; ++d) { uint64_t m = 0ull, bitpos = (uint64_t)d; for (int b = 0; b < levels; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; } pextMask[d] = m; }
}
__forceinline void map01ToPoint(float t, float* __restrict out)const noexcept {
if (t <= 0.0f) t = 0.0f; else if (t >= 1.0f) t = 0x1.fffffep-1f;
uint64_t idx = (uint64_t)((double)t * (double)scale); if (idx >= scale) idx = scale - 1ull;
for (int d = 0; d < dim; ++d) {
int pd = perm[d]; uint64_t bits = _pext_u64(idx, pextMask[d]); if (invMask[pd]) bits ^= invMask[pd];
out[pd] = fmaf(step[pd], (float)bits, baseOff[pd]);
}
}
__forceinline uint64_t clamp_bits(int64_t v, int bits)const noexcept {
if (v < 0) v = 0; int64_t maxv = ((int64_t)1 << bits) - 1; if (v > maxv) v = maxv; return (uint64_t)v;
}
__forceinline float pointToT(const float* __restrict q)const noexcept {
const int bits = levels; uint64_t idx = 0ull;
for (int d = 0; d < dim; ++d) {
int pd = perm[d];
float v = (q[pd] - baseOff[pd]) * invStep[pd];
int64_t cell = (int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
uint64_t b = clamp_bits(cell, bits); if (invMask[pd]) b ^= invMask[pd]; idx |= _pdep_u64(b, pextMask[d]);
}
return ((float)idx + 0.5f) / (float)scale;
}
};

struct ManipCost final {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;
__forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {
}
__forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
const float* th = q;
const float* L = variableLen ? (q + n) : nullptr;
__declspec(align(64)) float phi[16];
__declspec(align(64)) float s_arr[16];
__declspec(align(64)) float c_arr[16];
float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f;
for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
const float Lc = 1.0f;
if (variableLen) {
for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); }
}
else {
for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); }
}
for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); float v = minTheta - ai; float u = v > 0.0f ? v : 0.0f; penC = fmaf(u, u, penC); }
float dx = x - targetX;
float dy = y - targetY;
float dist = sqrtf(fmaf(dx, dx, dy * dy));
out_x = x; out_y = y;
return dist + penC;
}
};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
uint32_t idx = (uint32_t)(norm * (float)scale);
idx = idx > scale_minus_1 ? scale_minus_1 : idx;
float sx = lenx, sy = leny;
float x1 = a, x2 = c;
uint8_t type = start;
int l = levels - 1;
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b);
const float clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx, sy = leny;
float x0 = a, y0 = c;
uint32_t idx = 0u;
uint8_t type = start;
int l = 0;
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;
const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next;
idx = (idx << 2) | inv.q;
const uint32_t dx = dd >> 1;
const uint32_t dy = dd & 1u;
x0 += dx ? sx : 0.0f;
y0 += dy ? sy : 0.0f;
++l;
}
const float scale_recip = 1.0f / (float)scale;
return fmaf((float)idx * scale_recip, lenx, a);
}

// Структура для трех интервалов (простая, без конструкторов/деструкторов)
__declspec(align(16)) struct MultiCrossMsg final {
float intervals[15]; // [s_x1, s_x2, e_x1, e_x2, Rtop] × 3 интервала
uint8_t count;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& intervals& count; }

};

// Структура для лучшего решения (простая, без векторов)
__declspec(align(16)) struct BestSolutionMsg final {
float bestF;
float bestX, bestY;
float bestQ[32]; // Фиксированный массив
uint8_t dim;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim; }

};

// Общая структура для многомерного MPI (совместима с двумерной логикой)
__declspec(align(16)) struct CtrlMsgND final {
uint8_t kind; // 0=term, 1=single, 2=multi, 3=best
CrossMsg xchg;
MultiCrossMsg multiXchg;
BestSolutionMsg bestSol;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind; if (kind == 1) ar& xchg; else if (kind == 2) ar& multiXchg; else if (kind == 3) ar& bestSol; }

};

static __forceinline int generate_heuristic_seeds(const ManipCost& cost, int dim, float* __restrict S, int stride) {
const int n = cost.n; const bool VL = cost.variableLen; const float tx = cost.targetX, ty = cost.targetY;
const float phi = atan2f(ty, tx); const float rho = sqrtf(fmaf(tx, tx, ty * ty));
float len = fminf(fmaxf(rho / (float)n, 0.1f), 2.0f);
float* s0 = S + 0 * stride; for (int i = 0; i < n; ++i) s0[i] = phi / (float)n; if (VL) for (int i = 0; i < n; ++i) s0[n + i] = len;
float* s1 = S + 1 * stride; for (int i = 0; i < n; ++i) s1[i] = 0.5f * phi * ((i & 1) ? -1.0f : 1.0f); if (VL) { for (int i = 0; i < n; ++i) s1[n + i] = len * (0.8f + 0.4f * (float)i / (float)n); }
float* s2 = S + 2 * stride; const float inv = (n > 1) ? 1.0f / (float)(n - 1) : 0.0f; for (int i = 0; i < n; ++i) { float pr = (float)i * inv; s2[i] = phi * (1.0f - 0.3f * pr); }
if (VL) { for (int i = 0; i < n; ++i) { float si = sinf(1.5f * (float)i); s2[n + i] = len * (1.0f + 0.2f * si); } }
return 3;
}

// Модифицируем функцию agp_run_branch_mpi для использования априорной оценки
static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed,
std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY, float M_prior = 1e-3f) {
const int n = cost.n, dim = n + (cost.variableLen ? n : 0);
alignas(64) float q_local[32], phi[32], s_arr[32], c_arr[32], sum_s[32], sum_c[32], q_try[32];
bestQ.reserve(dim); float x = 0.0f, y = 0.0f; int no_improve = 0;
auto evalAt = [&](float t)->float {
map.map01ToPoint(t, q_local); float f = cost(q_local, x, y);
if (f < bestF * 1.05f) {
float acc = 0.0f; for (int i = 0; i < n; ++i) { acc += q_local[i]; phi[i] = acc; }
FABE13_SINCOS(phi, s_arr, c_arr, n);
float as = 0.0f, ac = 0.0f; for (int k = n - 1; k >= 0; --k) { const float Lk = cost.variableLen ? q_local[n + k] : 1.0f; as += Lk * s_arr[k]; ac += Lk * c_arr[k]; sum_s[k] = as; sum_c[k] = ac; }
const float dx = x - cost.targetX, dy = y - cost.targetY; float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1e-8f; float eta = 0.1f;
for (int step = 0; step < 2; ++step) {
for (int i = 0; i < n; ++i) {
float gpen = 0.0f; float ai = fabsf(q_local[i]); float v = cost.minTheta - ai; if (v > 0.0f) gpen = -2.0f * copysignf(v, q_local[i]);
float grad = (dx * (-sum_s[i]) + dy * (sum_c[i])) / dist + gpen; q_try[i] = q_local[i] - eta * grad;
if (q_try[i] < -3.14159265359f) q_try[i] = -3.14159265359f; else if (q_try[i] > 3.14159265359f) q_try[i] = 3.14159265359f;
}
if (cost.variableLen) { for (int i = 0; i < n; ++i) { float grad = (dx * c_arr[i] + dy * s_arr[i]) / dist; float v = q_local[n + i] - eta * grad; if (v < 0.1f)v = 0.1f; else if (v > 2.0f)v = 2.0f; q_try[n + i] = v; } }
float x2, y2; float f2 = cost(q_try, x2, y2);
if (f2 < f) { memcpy(q_local, q_try, dim * sizeof(float)); f = f2; x = x2; y = y2; break; } eta = 0.5f;
}
}
if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; }
else ++no_improve;
return f;
};
float a = 0.0f, b = 1.0f; float f_a = evalAt(a), f_b = evalAt(b);
const int K = (std::min)((std::max)(2 * dim, 8), 128);
H.clear(); H.reserve((size_t)maxIter + K + 16);
float Mmax = M_prior; const int rank = g_world->rank(); const int world = g_world->size();
alignas(64) float seeds[3 * 32]; const int seedCnt = generate_heuristic_seeds(cost, dim, seeds, 32);
if (seedCnt > 0) {
const float
s0 = seeds; float t_seed = map.pointToT(s0);
float t1 = fmaxf(a, t_seed - 0.002f), t2 = fminf(b, t_seed + 0.002f);
if (t2 > t1) {
alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
map.map01ToPoint(t1, q1); float f1 = cost(q1, x1, y1);
map.map01ToPoint(t2, q2); float f2 = cost(q2, x2, y2);
Interval* I = new Interval(t1, t2, f1, f2, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax); I->R = 0.85f; H.emplace_back(I);
if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; }
if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; }
}
}
float prev_t = a, prev_f = f_a;
for (int k = 1; k <= K; ++k) {
float t = a + (b - a) * ((float)k / (K + 1)) + (float)rank / (float)(world * (K + 1));
float f = evalAt(t); Interval
I = new Interval(prev_t, t, prev_f, f, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax);
H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr); prev_t = t; prev_f = f;
}
Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim); Mmax = (std::max)(Mmax, tail->M); tail->ChangeCharacteristic(r * Mmax);
H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtr);
if (seedCnt > 1) {
for (int i = 1; i < seedCnt; ++i) {
const float* s = seeds + i * 32; float t_seed = map.pointToT(s);
float t1 = fmaxf(a, t_seed - 0.001f), t2 = fminf(b, t_seed + 0.001f);
if (t2 > t1) { float f1 = evalAt(t1), f2 = evalAt(t2); Interval* J = new Interval(t1, t2, f1, f2, (float)dim); Mmax = (std::max)(Mmax, J->M); J->ChangeCharacteristic(r * Mmax); J->R = 0.90f; H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtr); }
}
}
float dmax = b - a, initial_len = dmax, thr03 = 0.3f * initial_len, inv_thr03 = 1.0f / thr03; int it = 0;
while (it < maxIter) {
if ((it % 100) == 0 && no_improve > 50 && !bestQ.empty()) {
float t_best = map.pointToT(bestQ.data());
for (int i = 0; i < 2; ++i) {
float off = (i == 0) ? 0.01f : -0.01f; float t_seed = fminf(b, fmaxf(a, t_best + off));
float f_seed = evalAt(t_seed); Interval
J = new Interval(t_seed - 0.005f, t_seed + 0.005f, f_seed, f_seed, (float)dim); J->ChangeCharacteristic(r * Mmax); J->R = 0.9f;
H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtr);
}
no_improve = 0;
}
const float p = fmaf(-1.0f / initial_len, dmax, 1.0f); bool stagnation = (no_improve > 100) && (it > 270);
float r_eff = fmaxf(1.0f, r * (0.7f + 0.3f * (1.0f - p)));
std::pop_heap(H.begin(), H.end(), ComparePtr); Interval
cur = H.back(); H.pop_back();
const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2; float m = r_eff * Mmax;
float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r); tNew = fminf(fmaxf(tNew, a), b);
float fNew = evalAt(tNew);
Interval* L = new Interval(x1, tNew, y1, fNew, (float)dim);
Interval* Rv = new Interval(tNew, x2, fNew, y2, (float)dim);
float Mloc = (std::max)(L->M, Rv->M);
if (adaptive) {
float len1 = tNew - x1, len2 = x2 - tNew;
if (len1 + len2 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : H) { float Ls = pI->x2 - pI->x1; if (Ls > dmax) dmax = Ls; } }
if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
if (Mloc > Mmax) { Mmax = Mloc; m = r_eff * Mmax; }
const float progress = fmaf(-dmax, inv_thr03, 1.0f), alpha = progress * progress, beta = fmaf(-alpha, 1.0f, 2.0f);
const float MULT = (1.0f / dmax) * Mmax, global_coeff = fmaf(MULT, r_eff, -MULT), GF = fmaf(beta, global_coeff, 0.0f);
L->ChangeCharacteristic(fmaf(GF, len1, L->M * alpha)); Rv->ChangeCharacteristic(fmaf(GF, len2, Rv->M * alpha));
size_t sz = H.size(); RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtr);
}
else {
if (Mloc > Mmax) {
bool big = (Mloc >= 1.15f * Mmax); Mmax = Mloc; m = r_eff * Mmax; L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
if (big) { size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); }
}
else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
}
}
else {
if (Mloc > Mmax) {
bool big = (Mloc >= 1.15f * Mmax); Mmax = Mloc; float m2 = r_eff * Mmax; L->ChangeCharacteristic(m2); Rv->ChangeCharacteristic(m2);
if (big) { size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m2); std::make_heap(H.begin(), H.end(), ComparePtr); }
}
else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
}
H.push_back(L); std::push_heap(H.begin(), H.end(), ComparePtr);
H.push_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtr);
Interval* top = H.front(); float interval_len = top->x2 - top->x1;
const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1);
if (!(it % T) || want_term) {
CtrlMsgND out; out.kind = want_term ? 0 : 2;
if (!want_term) {
uint8_t cnt = (uint8_t)((H.size() >= 3) ? 3 : H.size()); out.multiXchg.count = cnt; float* dest = out.multiXchg.intervals;
Interval* t1 = H[0]; Interval* t2 = (H.size() > 1 ? H[1] : H[0]); Interval* t3 = (H.size() > 2 ? H[2] : H[H.size() - 1]);
Interval* tops[3] = { t1,t2,t3 }; for (uint8_t i2 = 0; i2 < cnt; ++i2) { Interval* Tt = tops[i2]; dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R; dest += 5; }
}
for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
if (want_term) break;
}
if (!(it % 500) && !bestQ.empty()) {
CtrlMsgND out; out.kind = 3; out.bestSol.bestF = bestF; out.bestSol.bestX = bestX; out.bestSol.bestY = bestY; out.bestSol.dim = (uint8_t)bestQ.size();
memcpy(out.bestSol.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
}
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsgND in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind == 0) { if (!rank) break; else return; }
else if (in.kind == 1) {
float sx = in.xchg.s_x1, ex = in.xchg.e_x1; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex > sx) {
alignas(64) float tmp[32]; float tx, ty;
map.map01ToPoint(sx, tmp); float y1 = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp); float y2 = cost(tmp, tx, ty);
Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(r * Mmax);
Interval* topH = H.front(); if (inj->R > 1.15f * topH->R) {
float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
inj->R = in.xchg.Rtop * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr);
}
}
}
else if (in.kind == 2) {
const MultiCrossMsg& mX = in.multiXchg;
for (uint8_t ii = 0; ii < mX.count; ++ii) {
const float* d = &mX.intervals[ii * 5]; float sx = d[0], ex = d[2]; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex > sx) {
alignas(64) float tmp[32]; float tx, ty;
map.map01ToPoint(sx, tmp); float y1 = cost(tmp, tx, ty);
map.map01ToPoint(ex, tmp); float y2 = cost(tmp, tx, ty);
Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(r * Mmax);
Interval* topH = H.front(); if (inj->R > 1.15f * topH->R) {
float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
inj->R = d[4] * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr);
}
}
}
}
else if (in.kind == 3) {
const BestSolutionMsg& bm = in.bestSol;
if (bm.bestF < bestF * 1.15f) {
alignas(64) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
float t_best = map.pointToT(tmp_q); float t1 = fmaxf(a, t_best - 0.001f), t2 = fminf(b, t_best + 0.001f);
if (t2 > t1) {
alignas(64) float tq1[32], tq2[32]; float xx1, yy1, xx2, yy2;
map.map01ToPoint(t1, tq1); float f1 = cost(tq1, xx1, yy1);
map.map01ToPoint(t2, tq2); float f2 = cost(tq2, xx2, yy2);
Interval* I = new Interval(t1, t2, f1, f2, (float)dim); I->ChangeCharacteristic(r * Mmax); I->R *= 0.90f; H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr);
}
if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); }
}
}
}
++it;
}
}

struct BestPacket { float bestF; int dim; float bestX; float bestY; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen, float* out_bestX, float* out_bestY, float* out_bestF) {
const int dim = nSegments + (variableLengths ? nSegments : 0);
g_mc.permCache.resize(dim); for (int i = 0; i < dim; ++i) g_mc.permCache[i] = i;
uint32_t s = g_mc.baseSeed; for (int i = dim - 1; i > 0; --i) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; uint32_t j = s % (uint32_t)(i + 1); std::swap(g_mc.permCache[i], g_mc.permCache[j]); }
g_mc.invMaskCache.resize(dim); for (int k = 0; k < dim; ++k) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; g_mc.invMaskCache[k] = (uint64_t)s; }
const float thetaMin = -3.14159265359f, thetaMax = 3.14159265359f, lenMin = 0.1f, lenMax = 2.0f;
std::vector<float> low; low.reserve(dim); std::vector<float> high; high.reserve(dim);
for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); }
if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }
ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);
const int rank = g_world->rank(), world = g_world->size();
std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;
const int levels0 = (std::min)(peanoLevels, 8), maxIter0 = (int)(maxIterPerBranch * 0.2f);
MortonND map0(dim, levels0, low.data(), high.data(), g_mc);
std::vector<Interval*> H_coarse; std::vector<float> bestQ_coarse; float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;
float M_prior = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << levels0)); if (variableLengths) M_prior += 1.41421356237f * (1.0f / (float)(1u << levels0));
agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed, H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, M_prior);
if (bestF_coarse < bestF) { bestF = bestF_coarse; bestQ = bestQ_coarse; bx = bx_coarse; by = by_coarse; }
if (levels0 < peanoLevels) {
MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
std::vector<Interval*> H_fine; std::vector<float> bestQ_fine = bestQ; float bestF_fine = bestF, bx_fine = bx, by_fine = by;
float M_prior_fine = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << peanoLevels)); if (variableLengths) M_prior_fine += 1.41421356237f * (1.0f / (float)(1u << peanoLevels));
if (!H_coarse.empty()) {
std::sort(H_coarse.begin(), H_coarse.end(), [](const Interval* a, const Interval* b) {return a->R < b->R; });
const size_t topCount = (size_t)(H_coarse.size() * 0.3f);
for (size_t i = 0; i < topCount && i < H_coarse.size(); ++i) {
const Interval* C = H_coarse[i];
alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
map1.map01ToPoint(C->x1, q1); float f1 = cost(q1, x1, y1);
map1.map01ToPoint(C->x2, q2); float f2 = cost(q2, x2, y2);
H_fine.push_back(new Interval(C->x1, C->x2, f1, f2, (float)dim));
if (f1 < bestF_fine) { bestF_fine = f1; bestQ_fine.assign(q1, q1 + dim); bx_fine = x1; by_fine = y1; }
if (f2 < bestF_fine) { bestF_fine = f2; bestQ_fine.assign(q2, q2 + dim); bx_fine = x2; by_fine = y2; }
}
std::make_heap(H_fine.begin(), H_fine.end(), ComparePtr);
}
agp_run_branch_mpi(map1, cost, maxIterPerBranch - maxIter0, r, adaptiveMode, epsilon, seed, H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, M_prior_fine);
if (bestF_fine < bestF) { bestF = bestF_fine; bestQ = bestQ_fine; bx = bx_fine; by = by_fine; }
}
BestPacket me{ bestF,dim,bx,by };
if (!rank) {
std::vector<float> winnerQ = bestQ; float winF = bestF, wx = bx, wy = by;
for (int i = 1; i < world; ++i) { BestPacket bp; g_world->recv(i, 2, bp); std::vector<float> qin; g_world->recv(i, 3, qin); if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; } }
*out_bestQLen = winnerQ.size(); out_bestQ = (float)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen)); *out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
}
else { g_world->send(0, 2, me); g_world->send(0, 3, bestQ); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
const int rank = g_world->rank();
const int world_size = g_world->size();
if (world_size == 4) {
new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3);
}
g_mc.baseSeed = fmaf(0x9E3779B9u, rank, 0x9E3779B9u);
return rank;
}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0; float st = seed, r1, r2, res = 0.0f;
while (i < 10) {
XOR_RAND(st, r1);
float xp = fmaf(-r1, 10.0f, x);
XOR_RAND(st, r1);
XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
res -= 1.0f / d;
++i;
}
return res;
}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;
while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
__declspec(align(32)) float sv[14u], cv[14u];
FABE13_SINCOS(ang, sv, cv, 14u);
float state = seed, r1, r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(state, r1);
XOR_RAND(state, r2);
res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
--j;
}
return res;
}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u];
FABE13_SINCOS(aj, sj, cj, 8u);
FABE13_SINCOS(ak, sk, ck, 8u);
--j;
float p1 = 0.0f, p2 = 0.0f;
float st = seed, r1, r2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
float s = sj[j] * sj[j];
float c = ck[k] * ck[k];
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p2 = fmaf(-r1, c, fmaf(r2, s, p2));
++k;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R;
Extr.reserve((size_t)global_iterations << 2u);
R.reserve((size_t)global_iterations << 1u);
R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
const float new_value = ShekelFunc(new_point, seed);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
std::pop_heap(R.begin(), R.end(), ComparePtr);
const Interval* pro = R.back();
const float new_x1 = pro->x1;
const float new_x2 = pro->x2;
const float len2 = new_x2 - new_point;
const float len1 = new_point - new_x1;
const float interval_len = len1 < len2 ? len1 : len2;
if (++schetchick == (int)global_iterations || interval_len < epsilon) {
Extr.emplace_back((float)schetchick);
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = (float)CoTaskMemAlloc(sizeof(float) * (out_len));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
return;
}
Interval
curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f);
Interval
curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f);
const float currM = curr->M > curr1->M ? curr->M : curr1->M;
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULT = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULT, r, -MULT);
const float GF = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
const Interval
top = R.front();
x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
int schetchick = 0;
int no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider;
const float y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition;
const float true_end = b - x_addition;
float x_Rmax_1 = true_start;
float x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
const Interval* __restrict top_ptr;
float Mmax = R.front()->M;
float m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && schetchick > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
Interval
const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
injected->R = in.xchg.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
}
const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
if (!(++schetchick % T) || stagnation || want_term) {
CtrlMsg out; out.kind = want_term;
if (!out.kind) {
float s_x1, s_x2, e_x1, e_x2;
HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
}
for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); }
if (out.kind) {
if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len))); memcpy(out_data, Extr.data(), sizeof(float) * (out_len)); }
return;
}
}
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val, new_x2_val;
HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; }
else { ++no_improve; }
std::pop_heap(R.begin(), R.end(), ComparePtr);
Interval
const __restrict promejutochny_otrezok = R.back();
const float segment_x1 = promejutochny_otrezok->x1;
const float segment_x2 = promejutochny_otrezok->x2;
const float len2 = segment_x2 - new_point;
const float len1 = new_point - segment_x1;
Interval
const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
Interval
const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M);
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
}
}

struct RunParams { int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY; p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
const int rank = g_world->rank(); const int world = g_world->size();
for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p; g_world->recv(0, 1, p);
float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
if (q) CoTaskMemFree(q);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() { int i = 1; const int world = g_world->size(); while (i < world) { g_world->isend(i, 1, 0); ++i; } }

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }

الفكرة

Refactoring user code

The user needs help simplifying and compacting their large C++ code while maintaining functionality. My task is to remove comments, extra spaces, and rearrange functions for better locality, but not alter functionality. I'll also review alignment to optimize for the environment, considering specific size specs like 16/32/64, without affecting ABI.

Optimizing struct alignment and layout

I’ll reorder the code by grouping related structs and macros together, moving small helper functions before usage, and adjusting alignments based on vectorization and cache line considerations.

For example, keeping g_step_tbl and g_inv_tbl aligned at 64-bit for cache efficiency makes sense. I’ll also ensure that Step and InvStep have proper 4-byte alignment, as indicated.

Finalizing alignment choices

The alignment for various components seems purposeful:

  • The Interval struct is aligned at 64 for better cache line performance and vectorization, so I’ll keep it that way.
  • The FABE13_SINCOS function and vectorized arrays use 32-byte alignment, appropriate for AVX, so 32 is fine here.
  • While the user prefers 16 in managed environments, here, the C++ code with AVX benefits from 32 alignment, so I should retain 32 in these cases. The struct alignment at 64 is also crucial for performance.

Adjusting code layout

I’ll keep most existing alignments as they are, while reducing CrossMsg and CtrlMsg alignments from 16 to 16 (no change). The focus will be on reordering for locality. We'll keep macros first, then enums, structs, environment pointers, and various functions in a more logical order, without changing any code functionality. I’ll also be mindful not to add comments, as the user prefers the code comment-free.

Reordering and formatting code

We’re keeping most current alignments and careful about reordering. I’ll remove the commented Russian lines and condense whitespace while ensuring macros maintain readability. We'll adjust logical grouping of definitions for efficiency, while avoiding any changes to the core functionality or renaming variables. I’ll also ensure the relocation of references (e.g., ComparePtr) doesn’t disrupt their usage. Code will be formatted more compactly, keeping definitions organized and readable.

Thought for 32s

الإجابة

cpp
#include "pch.h" #define XOR_RAND(state,result_var) do{int s=(state);s^=s<<13;s^=s>>17;s^=s<<5;state=s;result_var=state*0x1.0p-32f;}while(0) #define XOR_RAND_GRSH(state,result_var) do{int s=(state);s^=s<<13;s^=s>>17;s^=s<<5;state=s;result_var=fmaf(state,0x1.0p-31f,-1.0f);}while(0) #define FABE13_COS(x,result_var) do{const float _ax_=fabsf(x);float _r_=fmodf(_ax_,6.28318530718f);if(_r_>3.14159265359f)_r_=6.28318530718f-_r_;if(_r_<1.57079632679f){const float _t2_=_r_*_r_;const float _t4_=_t2_*_t2_;result_var=fmaf(_t4_,fmaf(_t2_,-0.0013888889f,0.0416666667f),fmaf(_t2_,-0.5f,1.0f));}else{_r_=3.14159265359f-_r_;const float _t2_=_r_*_r_;const float _t4_=_t2_*_t2_;result_var=-fmaf(_t4_,fmaf(_t2_,-0.0013888889f,0.0416666667f),fmaf(_t2_,-0.5f,1.0f));}}while(0) #define FABE13_SIN(x,result_var) do{const float _x_=(x);const float _ax_=fabsf(_x_);float _r_=fmodf(_ax_,6.28318530718f);bool _sfl_=_r_>3.14159265359f;if(_sfl_)_r_=6.28318530718f-_r_;bool _cfl_=_r_>1.57079632679f;if(_cfl_)_r_=3.14159265359f-_r_;const float _t2_=_r_*_r_;float _s=fmaf(_t2_,fmaf(_t2_,fmaf(_t2_,-0.0001984127f,0.0083333333f),-0.16666666f),1.0f)*_r_;result_var=((_x_<0.0f)^_sfl_)?-_s:_s;}while(0) #define FABE13_SINCOS(in,sin_out,cos_out,n) do{int i=0;const int limit=(n)&~7;if((n)>=8){static __declspec(align(32)) const __m256 VEC_TWOPI=_mm256_set1_ps(6.28318530718f);static __declspec(align(32)) const __m256 VEC_PI=_mm256_set1_ps(3.14159265359f);static __declspec(align(32)) const __m256 VEC_PI_2=_mm256_set1_ps(1.57079632679f);static __declspec(align(32)) const __m256 INV_TWOPI=_mm256_set1_ps(0.15915494309189535f);static __declspec(align(32)) const __m256 BIAS=_mm256_set1_ps(12582912.0f);static __declspec(align(32)) const __m256 VEC_COS_P5=_mm256_set1_ps(-0.0013888889f);static __declspec(align(32)) const __m256 VEC_COS_P3=_mm256_set1_ps(0.0416666667f);static __declspec(align(32)) const __m256 VEC_COS_P1=_mm256_set1_ps(-0.5f);static __declspec(align(32)) const __m256 VEC_COS_P0=_mm256_set1_ps(1.0f);static __declspec(align(32)) const __m256 VEC_SIN_P5=_mm256_set1_ps(-0.0001984127f);static __declspec(align(32)) const __m256 VEC_SIN_P3=_mm256_set1_ps(0.0083333333f);static __declspec(align(32)) const __m256 VEC_SIN_P1=_mm256_set1_ps(-0.16666666f);static __declspec(align(32)) const __m256 VEC_SIN_P0=_mm256_set1_ps(1.0f);static __declspec(align(32)) const __m256 VEC_ZERO=_mm256_setzero_ps();while(i<limit){const __m256 vx=_mm256_load_ps(&(in)[i]);const __m256 vax=_mm256_andnot_ps(_mm256_set1_ps(-0.0f),vx);__m256 q=_mm256_fmadd_ps(vax,INV_TWOPI,BIAS);q=_mm256_sub_ps(q,BIAS);const __m256 r=_mm256_fnmadd_ps(VEC_TWOPI,q,vax);const __m256 r1=_mm256_min_ps(r,_mm256_sub_ps(VEC_TWOPI,r));const __m256 r2=_mm256_min_ps(r1,_mm256_sub_ps(VEC_PI,r1));const __m256 t2=_mm256_mul_ps(r2,r2);const __m256 cosv=_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,VEC_COS_P5,VEC_COS_P3),VEC_COS_P1),VEC_COS_P0);const __m256 sinv=_mm256_mul_ps(_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,_mm256_fmadd_ps(t2,VEC_SIN_P5,VEC_SIN_P3),VEC_SIN_P1),VEC_SIN_P0),r2);const __m256 cflip=_mm256_cmp_ps(r1,VEC_PI_2,_CMP_GT_OQ);const __m256 sflip=_mm256_xor_ps(_mm256_cmp_ps(vx,VEC_ZERO,_CMP_LT_OQ),_mm256_cmp_ps(r,VEC_PI,_CMP_GT_OQ));_mm256_store_ps(&(cos_out)[i],_mm256_blendv_ps(cosv,_mm256_sub_ps(VEC_ZERO,cosv),cflip));_mm256_store_ps(&(sin_out)[i],_mm256_blendv_ps(sinv,_mm256_sub_ps(VEC_ZERO,sinv),sflip));i+=8;}}while(i<(n)){const float x=(in)[i];const float ax=fabsf(x);float q=fmaf(ax,0.15915494309189535f,12582912.0f);q-=12582912.0f;float r=fmaf(-6.28318530718f,q,ax);const bool sflip=r>3.14159265359f;if(sflip)r=6.28318530718f-r;const bool cflip=r>1.57079632679f;if(cflip)r=3.14159265359f-r;const float t2=r*r;const float c=fmaf(t2,fmaf(t2,fmaf(t2,-0.0013888889f,0.0416666667f),-0.5f),1.0f);const float s=fmaf(t2,fmaf(t2,fmaf(t2,-0.0001984127f,0.0083333333f),-0.16666666f),1.0f)*r;(cos_out)[i]=cflip?-c:c;(sin_out)[i]=((x<0.0f)^sflip)?-s:s;++i;}}while(0) enum List:uint8_t{Top=0b00u,Down=0b01u,Left=0b10u,Right=0b11u}; __declspec(align(4)) struct Step final{const uint8_t next;const uint8_t dx;const uint8_t dy;}; __declspec(align(4)) struct InvStep final{const uint8_t q;const uint8_t next;}; __declspec(align(64)) static const Step g_step_tbl[4][4]={ {{Right,0u,0u},{Top,0u,1u},{Top,1u,1u},{Left,1u,0u}}, {{Left,1u,1u},{Down,1u,0u},{Down,0u,0u},{Right,0u,1u}}, {{Down,1u,1u},{Left,0u,1u},{Left,0u,0u},{Top,1u,0u}}, {{Top,0u,0u},{Right,1u,0u},{Right,1u,1u},{Down,0u,1u}} }; __declspec(align(64)) static const InvStep g_inv_tbl[4][4]={ {{0u,Right},{1u,Top},{3u,Left},{2u,Top}}, {{2u,Down},{3u,Right},{1u,Down},{0u,Left}}, {{2u,Left},{1u,Left},{3u,Top},{0u,Down}}, {{0u,Top},{3u,Down},{1u,Right},{2u,Right}} }; static const boost::mpi::environment* g_env; static const boost::mpi::communicator* g_world; __declspec(align(16)) struct CrossMsg final{float s_x1,s_x2;float e_x1,e_x2;float Rtop;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&s_x1&s_x2&e_x1&e_x2&Rtop;}}; __declspec(align(16)) struct CtrlMsg final{bool kind;CrossMsg xchg;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&kind&xchg;}}; __declspec(align(16)) struct Slab final{char* const base;char* current;char* const end;__forceinline Slab(void* const memory,const size_t usable):base((char*)memory),current(base),end(base+(usable&~(size_t)63u)){}}; static tbb::enumerable_thread_specific<Slab*> tls([]()noexcept{void* memory=_aligned_malloc(16777216u,64u);Slab* slab=(Slab*)_aligned_malloc(32u,64u);new(slab) Slab(memory,16777216u);char* p=slab->base;while(p<slab->end){*p=0;p+=4096u;}return slab;}); __declspec(align(16)) struct Peano2DMap final{const int levels;const float a,b,c,d;const float lenx,leny;const float inv_lenx;const uint32_t scale;const uint8_t start;__forceinline Peano2DMap(int L,float _a,float _b,float _c,float _d,uint8_t st):levels(L),a(_a),b(_b),c(_c),d(_d),lenx(_b-_a),leny(_d-_c),inv_lenx(1.0f/(_b-_a)),scale((uint32_t)1u<<(L<<1)),start(st){}}; static Peano2DMap gActiveMap(0,0,0,0,0,0); __forceinline bool ComparePtr(const struct Interval* a,const struct Interval* b)noexcept; __declspec(align(64)) struct Interval final{ const float x1,x2,y1,y2,delta_y,ordinate_factor,N_factor,quadratic_term,M;float R; __forceinline void* operator new(size_t)noexcept{Slab* s=tls.local();char* r=s->current;s->current+=64u;return r;} __forceinline Interval(float _x1,float _x2,float _y1,float _y2,float _N)noexcept:x1(_x1),x2(_x2),y1(_y1),y2(_y2),delta_y(_y2-_y1),ordinate_factor(-(y1+y2)*2.0f),N_factor(_N==1.0f?_x2-_x1:_N==2.0f?sqrtf(_x2-_x1):powf(_x2-_x1,1.0f/_N)),quadratic_term((1.0f/N_factor)*delta_y*delta_y),M((1.0f/N_factor)*fabsf(delta_y)){} __forceinline void ChangeCharacteristic(float _m)noexcept{R=fmaf(1.0f/_m,quadratic_term,fmaf(_m,N_factor,ordinate_factor));} }; __forceinline bool ComparePtr(const Interval* a,const Interval* b)noexcept{return a->R<b->R;} __forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr,size_t n,float m){const __m256 vm=_mm256_set1_ps(m);__m256 vinvm=_mm256_rcp_ps(vm);vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));size_t i=0;const size_t limit=n&~7ull;alignas(32) float q[8],nf[8],od[8],out[8];for(;i<limit;i+=8){for(int k=0;k<8;++k){const Interval* p=arr[i+k];q[k]=p->quadratic_term;nf[k]=p->N_factor;od[k]=p->ordinate_factor;}const __m256 vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);const __m256 t=_mm256_fmadd_ps(vm,vnf,vod);const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);_mm256_store_ps(out,res);for(int k=0;k<8;++k)arr[i+k]->R=out[k];}for(;i<n;++i)arr[i]->ChangeCharacteristic(m);} __forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr,size_t n,float GF,float alpha){const __m256 vGF=_mm256_set1_ps(GF),va=_mm256_set1_ps(alpha);size_t i=0;const size_t limit=n&~7ull;alignas(32) float ln[8],Mv[8],q[8],nf[8],od[8],out[8];for(;i<limit;i+=8){for(int k=0;k<8;++k){const Interval* p=arr[i+k];ln[k]=p->x2-p->x1;Mv[k]=p->M;q[k]=p->quadratic_term;nf[k]=p->N_factor;od[k]=p->ordinate_factor;}const __m256 vln=_mm256_load_ps(ln),vM=_mm256_load_ps(Mv),vq=_mm256_load_ps(q),vnf=_mm256_load_ps(nf),vod=_mm256_load_ps(od);const __m256 vm=_mm256_fmadd_ps(vGF,vln,_mm256_mul_ps(va,vM));__m256 vinvm=_mm256_rcp_ps(vm);vinvm=_mm256_mul_ps(vinvm,_mm256_fnmadd_ps(vm,vinvm,_mm256_set1_ps(2.0f)));const __m256 t=_mm256_fmadd_ps(vm,vnf,vod);const __m256 res=_mm256_fmadd_ps(vq,vinvm,t);_mm256_store_ps(out,res);for(int k=0;k<8;++k)arr[i+k]->R=out[k];}for(;i<n;++i){const Interval* p=arr[i];const float mi=fmaf(GF,(p->x2-p->x1),p->M*alpha);arr[i]->R=fmaf(1.0f/mi,p->quadratic_term,fmaf(mi,p->N_factor,p->ordinate_factor));}} __forceinline float fast_pow_int(float v,int n){float r;switch(n){case 1:r=v;break;case 2:r=v*v;break;case 3:{float v2=v*v;r=v2*v;}break;case 4:{float v2=v*v;r=v2*v2;}break;case 5:{float v2=v*v;r=v2*v2*v;}break;case 6:{float v2=v*v;float v4=v2*v2;r=v4*v2;}break;case 7:{float v2=v*v;float v4=v2*v2;r=v4*v2*v;}break;case 8:{float v2=v*v;float v4=v2*v2;r=v4*v4;}break;case 9:{float v3=v*v*v;float v6=v3*v3;r=v6*v3;}break;case 10:{float v2=v*v;float v4=v2*v2;float v8=v4*v4;r=v8*v2;}break;case 11:{float v2=v*v;float v4=v2*v2;float v8=v4*v4;r=v8*v2*v;}break;case 12:{float v3=v*v*v;float v6=v3*v3;r=v6*v6;}break;case 13:{float v3=v*v*v;float v6=v3*v3;r=v6*v6*v;}break;case 14:{float v7=v*v*v*v*v*v*v;r=v7*v7;}break;default:{float v7=v*v*v*v*v*v*v;r=v7*v7*v;}break;}return r;} __forceinline float Shag(float _m,float x1,float x2,float y1,float y2,float _N,float _r){const float diff=y2-y1;const float sign_mult=_mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u|(((*((uint32_t*)&diff))&0x80000000u)^0x80000000u))));const int Ni=(int)(_N+0.5f);if(Ni==1)return fmaf(-(1.0f/_m),diff,x1+x2)*0.5f;if(Ni==2)return fmaf(sign_mult/(_m*_m),diff*diff*_r,x1+x2)*0.5f;const float invmN=1.0f/fast_pow_int(_m,Ni);const float dN=fast_pow_int(fabsf(diff),Ni);return fmaf(sign_mult*invmN,dN*_r,x1+x2)*0.5f;} struct MortonCachePerRank{std::vector<int> permCache;std::vector<uint64_t> invMaskCache;uint32_t baseSeed;}; static MortonCachePerRank g_mc; struct MortonND final{ int dim,levels;uint64_t scale;std::vector<float> low,high,step,invStep,baseOff;std::vector<int> perm;std::vector<uint64_t> invMask,pextMask;float invScaleLevel; __forceinline MortonND(int D,int L,const float* lows,const float* highs,const MortonCachePerRank& mc):dim(D),levels(L),scale(1ull<<((uint64_t)D*L)),low(lows,lows+D),high(highs,highs+D),step(D,0.0f),invStep(D,0.0f),baseOff(D,0.0f),perm(mc.permCache.begin(),mc.permCache.begin()+D),invMask(mc.invMaskCache.begin(),mc.invMaskCache.begin()+D),pextMask(D,0ull),invScaleLevel(1.0f/(float)((uint64_t)1<<L)){uint64_t level_mask=(1ull<<L)-1ull;for(int k=0;k<D;++k)invMask[k]&=level_mask;for(int d=0;d<dim;++d){float rng=high[d]-low[d];float st=rng*invScaleLevel;step[d]=st;invStep[d]=1.0f/st;baseOff[d]=fmaf(0.5f,st,low[d]);}for(int d=0;d<dim;++d){uint64_t m=0ull,bitpos=(uint64_t)d;for(int b=0;b<levels;++b){m|=1ull<<bitpos;bitpos+=(uint64_t)dim;}pextMask[d]=m;}} __forceinline void map01ToPoint(float t,float* __restrict out)const noexcept{if(t<=0.0f)t=0.0f;else if(t>=1.0f)t=0x1.fffffep-1f;uint64_t idx=(uint64_t)((double)t*(double)scale);if(idx>=scale)idx=scale-1ull;for(int d=0;d<dim;++d){int pd=perm[d];uint64_t bits=_pext_u64(idx,pextMask[d]);if(invMask[pd])bits^=invMask[pd];out[pd]=fmaf(step[pd],(float)bits,baseOff[pd]);}} __forceinline uint64_t clamp_bits(int64_t v,int bits)const noexcept{if(v<0)v=0;int64_t maxv=((int64_t)1<<bits)-1;if(v>maxv)v=maxv;return (uint64_t)v;} __forceinline float pointToT(const float* __restrict q)const noexcept{const int bits=levels;uint64_t idx=0ull;for(int d=0;d<dim;++d){int pd=perm[d];float v=(q[pd]-baseOff[pd])*invStep[pd];int64_t cell=(int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(),_mm_set_ss(v),_MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC));uint64_t b=clamp_bits(cell,bits);if(invMask[pd])b^=invMask[pd];idx|=_pdep_u64(b,pextMask[d]);}return((float)idx+0.5f)/(float)scale;} }; struct ManipCost final{ int n;bool variableLen;float targetX,targetY;float minTheta; __forceinline ManipCost(int _n,bool _variableLen,float _targetX,float _targetY,float _minTheta):n(_n),variableLen(_variableLen),targetX(_targetX),targetY(_targetY),minTheta(_minTheta){} __forceinline float operator()(const float* __restrict q,float& out_x,float& out_y)const noexcept{const float* th=q;const float* L=variableLen?(q+n):nullptr;__declspec(align(64)) float phi[16],s_arr[16],c_arr[16];float x=0.0f,y=0.0f,phi_acc=0.0f,penC=0.0f;for(int i=0;i<n;++i){phi_acc+=th[i];phi[i]=phi_acc;}FABE13_SINCOS(phi,s_arr,c_arr,n);const float Lc=1.0f;if(variableLen){for(int i=0;i<n;++i){float Li=L[i];x=fmaf(Li,c_arr[i],x);y=fmaf(Li,s_arr[i],y);}}else{for(int i=0;i<n;++i){x=fmaf(Lc,c_arr[i],x);y=fmaf(Lc,s_arr[i],y);}}for(int i=0;i<n;++i){float ai=fabsf(th[i]);float v=minTheta-ai;float u=v>0.0f?v:0.0f;penC=fmaf(u,u,penC);}float dx=x-targetX;float dy=y-targetY;float dist=sqrtf(fmaf(dx,dx,dy*dy));out_x=x;out_y=y;return dist+penC;} }; __forceinline void HitTest2D_analytic(float x_param,float& out_x1,float& out_x2){const float a=gActiveMap.a;const float inv_lenx=gActiveMap.inv_lenx;const uint32_t scale=gActiveMap.scale;const uint32_t scale_minus_1=scale-1u;const float lenx=gActiveMap.lenx;const float leny=gActiveMap.leny;const float c=gActiveMap.c;const uint8_t start=gActiveMap.start;const int levels=gActiveMap.levels;float norm=(x_param-a)*inv_lenx;norm=fminf(fmaxf(norm,0.0f),0x1.fffffep-1f);uint32_t idx=(uint32_t)(norm*(float)scale);idx=idx>scale_minus_1?scale_minus_1:idx;float sx=lenx,sy=leny;float x1=a,x2=c;uint8_t type=start;int l=levels-1;while(l>=0){const uint32_t q=(idx>>(l*2))&3u;const Step s=g_step_tbl[type][q];type=s.next;sx*=0.5f;sy*=0.5f;x1+=s.dx?sx:0.0f;x2+=s.dy?sy:0.0f;--l;}out_x1=x1+sx*0.5f;out_x2=x2+sy*0.5f;} __forceinline float FindX2D_analytic(float px,float py){const float a=gActiveMap.a;const float b=gActiveMap.b;const float c=gActiveMap.c;const float d=gActiveMap.d;const float lenx=gActiveMap.lenx;const float leny=gActiveMap.leny;const uint32_t scale=gActiveMap.scale;const uint8_t start=gActiveMap.start;const int levels=gActiveMap.levels;const float clamped_px=fminf(fmaxf(px,a),b);const float clamped_py=fminf(fmaxf(py,c),d);float sx=lenx,sy=leny;float x0=a,y0=c;uint32_t idx=0u;uint8_t type=start;int l=0;while(l<levels){sx*=0.5f;sy*=0.5f;const float mx=x0+sx;const float my=y0+sy;const uint32_t tr=(uint32_t)((clamped_px>mx)&(clamped_py>my));const uint32_t tl=(uint32_t)((clamped_px<mx)&(clamped_py>my));const uint32_t dl=(uint32_t)((clamped_px<mx)&(clamped_py<my));const uint32_t none=(uint32_t)(1u^(tr|tl|dl));const uint32_t dd=(tr<<1)|tr|tl|(none<<1);const InvStep inv=g_inv_tbl[type][dd];type=inv.next;idx=(idx<<2)|inv.q;const uint32_t dx=dd>>1;const uint32_t dy=dd&1u;x0+=dx?sx:0.0f;y0+=dy?sy:0.0f;++l;}const float scale_recip=1.0f/(float)scale;return fmaf((float)idx*scale_recip,lenx,a);} __declspec(align(16)) struct MultiCrossMsg final{float intervals[15];uint8_t count;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&intervals&count;}}; __declspec(align(16)) struct BestSolutionMsg final{float bestF;float bestX,bestY;float bestQ[32];uint8_t dim;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&bestF&bestX&bestY&bestQ&dim;}}; __declspec(align(16)) struct CtrlMsgND final{uint8_t kind;CrossMsg xchg;MultiCrossMsg multiXchg;BestSolutionMsg bestSol;template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar,const unsigned int)noexcept{ar&kind;if(kind==1)ar&xchg;else if(kind==2)ar&multiXchg;else if(kind==3)ar&bestSol;}}; static __forceinline int generate_heuristic_seeds(const ManipCost& cost,int dim,float* __restrict S,int stride){const int n=cost.n;const bool VL=cost.variableLen;const float tx=cost.targetX,ty=cost.targetY;const float phi=atan2f(ty,tx);const float rho=sqrtf(fmaf(tx,tx,ty*ty));float len=fminf(fmaxf(rho/(float)n,0.1f),2.0f);float* s0=S+0*stride;for(int i=0;i<n;++i)s0[i]=phi/(float)n;if(VL)for(int i=0;i<n;++i)s0[n+i]=len;float* s1=S+1*stride;for(int i=0;i<n;++i)s1[i]=0.5f*phi*((i&1)?-1.0f:1.0f);if(VL){for(int i=0;i<n;++i)s1[n+i]=len*(0.8f+0.4f*(float)i/(float)n);}float* s2=S+2*stride;const float inv=(n>1)?1.0f/(float)(n-1):0.0f;for(int i=0;i<n;++i){float pr=(float)i*inv;s2[i]=phi*(1.0f-0.3f*pr);}if(VL){for(int i=0;i<n;++i){float si=sinf(1.5f*(float)i);s2[n+i]=len*(1.0f+0.2f*si);}}return 3;} static __forceinline void agp_run_branch_mpi(const MortonND& map,const ManipCost& cost,int maxIter,float r,bool adaptive,float eps,unsigned seed,std::vector<Interval*>& H,std::vector<float>& bestQ,float& bestF,float& bestX,float& bestY,float M_prior=1e-3f){ const int n=cost.n,dim=n+(cost.variableLen?n:0);alignas(64) float q_local[32],phi[32],s_arr[32],c_arr[32],sum_s[32],sum_c[32],q_try[32];bestQ.reserve(dim);float x=0.0f,y=0.0f;int no_improve=0; auto evalAt=[&](float t)->float{map.map01ToPoint(t,q_local);float f=cost(q_local,x,y);if(f<bestF*1.05f){float acc=0.0f;for(int i=0;i<n;++i){acc+=q_local[i];phi[i]=acc;}FABE13_SINCOS(phi,s_arr,c_arr,n);float as=0.0f,ac=0.0f;for(int k=n-1;k>=0;--k){const float Lk=cost.variableLen?q_local[n+k]:1.0f;as+=Lk*s_arr[k];ac+=Lk*c_arr[k];sum_s[k]=as;sum_c[k]=ac;}const float dx=x-cost.targetX,dy=y-cost.targetY;float dist=sqrtf(fmaf(dx,dx,dy*dy))+1e-8f;float eta=0.1f;for(int step=0;step<2;++step){for(int i=0;i<n;++i){float gpen=0.0f;float ai=fabsf(q_local[i]);float v=cost.minTheta-ai;if(v>0.0f)gpen=-2.0f*copysignf(v,q_local[i]);float grad=(dx*(-sum_s[i])+dy*(sum_c[i]))/dist+gpen;q_try[i]=q_local[i]-eta*grad;if(q_try[i]<-3.14159265359f)q_try[i]=-3.14159265359f;else if(q_try[i]>3.14159265359f)q_try[i]=3.14159265359f;}if(cost.variableLen){for(int i=0;i<n;++i){float grad=(dx*c_arr[i]+dy*s_arr[i])/dist;float v=q_local[n+i]-eta*grad;if(v<0.1f)v=0.1f;else if(v>2.0f)v=2.0f;q_try[n+i]=v;}}float x2,y2;float f2=cost(q_try,x2,y2);if(f2<f){memcpy(q_local,q_try,dim*sizeof(float));f=f2;x=x2;y=y2;break;}eta*=0.5f;}}if(f<bestF){bestF=f;bestQ.assign(q_local,q_local+dim);bestX=x;bestY=y;no_improve=0;}else ++no_improve;return f;}; float a=0.0f,b=1.0f;float f_a=evalAt(a),f_b=evalAt(b);const int K=(std::min)((std::max)(2*dim,8),128);H.clear();H.reserve((size_t)maxIter+K+16);float Mmax=M_prior;const int rank=g_world->rank();const int world=g_world->size();alignas(64) float seeds[3*32];const int seedCnt=generate_heuristic_seeds(cost,dim,seeds,32); if(seedCnt>0){const float* s0=seeds;float t_seed=map.pointToT(s0);float t1=fmaxf(a,t_seed-0.002f),t2=fminf(b,t_seed+0.002f);if(t2>t1){alignas(64) float q1[32],q2[32];float x1,y1,x2,y2;map.map01ToPoint(t1,q1);float f1=cost(q1,x1,y1);map.map01ToPoint(t2,q2);float f2=cost(q2,x2,y2);Interval* I=new Interval(t1,t2,f1,f2,(float)dim);Mmax=(std::max)(Mmax,I->M);I->ChangeCharacteristic(r*Mmax);I->R*=0.85f;H.emplace_back(I);if(f1<bestF){bestF=f1;bestQ.assign(q1,q1+dim);bestX=x1;bestY=y1;}if(f2<bestF){bestF=f2;bestQ.assign(q2,q2+dim);bestX=x2;bestY=y2;}}} float prev_t=a,prev_f=f_a; for(int k=1;k<=K;++k){float t=a+(b-a)*((float)k/(K+1))+(float)rank/(float)(world*(K+1));float f=evalAt(t);Interval* I=new Interval(prev_t,t,prev_f,f,(float)dim);Mmax=(std::max)(Mmax,I->M);I->ChangeCharacteristic(r*Mmax);H.emplace_back(I);std::push_heap(H.begin(),H.end(),ComparePtr);prev_t=t;prev_f=f;} Interval* tail=new Interval(prev_t,b,prev_f,f_b,(float)dim);Mmax=(std::max)(Mmax,tail->M);tail->ChangeCharacteristic(r*Mmax);H.emplace_back(tail);std::push_heap(H.begin(),H.end(),ComparePtr); if(seedCnt>1){for(int i=1;i<seedCnt;++i){const float* s=seeds+i*32;float t_seed=map.pointToT(s);float t1=fmaxf(a,t_seed-0.001f),t2=fminf(b,t_seed+0.001f);if(t2>t1){float f1=evalAt(t1),f2=evalAt(t2);Interval* J=new Interval(t1,t2,f1,f2,(float)dim);Mmax=(std::max)(Mmax,J->M);J->ChangeCharacteristic(r*Mmax);J->R*=0.90f;H.emplace_back(J);std::push_heap(H.begin(),H.end(),ComparePtr);}}} float dmax=b-a,initial_len=dmax,thr03=0.3f*initial_len,inv_thr03=1.0f/thr03;int it=0; while(it<maxIter){ if((it%100)==0&&no_improve>50&&!bestQ.empty()){float t_best=map.pointToT(bestQ.data());for(int i=0;i<2;++i){float off=(i==0)?0.01f:-0.01f;float t_seed=fminf(b,fmaxf(a,t_best+off));float f_seed=evalAt(t_seed);Interval* J=new Interval(t_seed-0.005f,t_seed+0.005f,f_seed,f_seed,(float)dim);J->ChangeCharacteristic(r*Mmax);J->R*=0.9f;H.emplace_back(J);std::push_heap(H.begin(),H.end(),ComparePtr);}no_improve=0;} const float p=fmaf(-1.0f/initial_len,dmax,1.0f);bool stagnation=(no_improve>100)&&(it>270);float r_eff=fmaxf(1.0f,r*(0.7f+0.3f*(1.0f-p)));std::pop_heap(H.begin(),H.end(),ComparePtr);Interval* cur=H.back();H.pop_back();const float x1=cur->x1,x2=cur->x2,y1=cur->y1,y2=cur->y2;float m=r_eff*Mmax;float tNew=Shag(m,x1,x2,y1,y2,(float)dim,r);tNew=fminf(fmaxf(tNew,a),b);float fNew=evalAt(tNew); Interval* L=new Interval(x1,tNew,y1,fNew,(float)dim);Interval* Rv=new Interval(tNew,x2,fNew,y2,(float)dim);float Mloc=(std::max)(L->M,Rv->M); if(adaptive){float len1=tNew-x1,len2=x2-tNew;if(len1+len2==dmax){dmax=(std::max)(len1,len2);for(auto pI:H){float Ls=pI->x2-pI->x1;if(Ls>dmax)dmax=Ls;}}if((thr03>dmax&&!(it%3))||(10.0f*dmax<initial_len)){if(Mloc>Mmax){Mmax=Mloc;m=r_eff*Mmax;}const float progress=fmaf(-dmax,inv_thr03,1.0f),alpha=progress*progress,beta=fmaf(-alpha,1.0f,2.0f);const float MULT=(1.0f/dmax)*Mmax,global_coeff=fmaf(MULT,r_eff,-MULT),GF=fmaf(beta,global_coeff,0.0f);L->ChangeCharacteristic(fmaf(GF,len1,L->M*alpha));Rv->ChangeCharacteristic(fmaf(GF,len2,Rv->M*alpha));size_t sz=H.size();RecomputeR_AffineM_AVX2(H.data(),sz,GF,alpha);std::make_heap(H.begin(),H.end(),ComparePtr);}else{if(Mloc>Mmax){bool big=(Mloc>=1.15f*Mmax);Mmax=Mloc;m=r_eff*Mmax;L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);if(big){size_t sz=H.size();RecomputeR_ConstM_AVX2(H.data(),sz,m);std::make_heap(H.begin(),H.end(),ComparePtr);}}else{L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);}}} else{if(Mloc>Mmax){bool big=(Mloc>=1.15f*Mmax);Mmax=Mloc;float m2=r_eff*Mmax;L->ChangeCharacteristic(m2);Rv->ChangeCharacteristic(m2);if(big){size_t sz=H.size();RecomputeR_ConstM_AVX2(H.data(),sz,m2);std::make_heap(H.begin(),H.end(),ComparePtr);}}else{L->ChangeCharacteristic(m);Rv->ChangeCharacteristic(m);}} H.push_back(L);std::push_heap(H.begin(),H.end(),ComparePtr);H.push_back(Rv);std::push_heap(H.begin(),H.end(),ComparePtr);Interval* top=H.front();float interval_len=top->x2-top->x1;const int T=(int)fmaf(-expm1f(p),264.0f,277.0f);bool want_term=(powf(interval_len,1.0f/(float)dim)<eps)||(it==maxIter-1); if(!(it%T)||want_term){CtrlMsgND out;out.kind=want_term?0:2;if(!want_term){uint8_t cnt=(uint8_t)((H.size()>=3)?3:H.size());out.multiXchg.count=cnt;float* dest=out.multiXchg.intervals;Interval* t1=H[0];Interval* t2=(H.size()>1?H[1]:H[0]);Interval* t3=(H.size()>2?H[2]:H[H.size()-1]);Interval* tops[3]={t1,t2,t3};for(uint8_t i2=0;i2<cnt;++i2){Interval* Tt=tops[i2];dest[0]=Tt->x1;dest[1]=0.0f;dest[2]=Tt->x2;dest[3]=0.0f;dest[4]=Tt->R;dest+=5;}}for(int i2=0;i2<world;++i2)if(i2!=rank)g_world->isend(i2,0,out);if(want_term)break;} if(!(it%500)&&!bestQ.empty()){CtrlMsgND out;out.kind=3;out.bestSol.bestF=bestF;out.bestSol.bestX=bestX;out.bestSol.bestY=bestY;out.bestSol.dim=(uint8_t)bestQ.size();memcpy(out.bestSol.bestQ,bestQ.data(),bestQ.size()*sizeof(float));for(int i2=0;i2<world;++i2)if(i2!=rank)g_world->isend(i2,0,out);} while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsgND in;g_world->recv(boost::mpi::any_source,0,in); if(in.kind==0){if(!rank)break;else return;} else if(in.kind==1){float sx=in.xchg.s_x1,ex=in.xchg.e_x1;if(sx<0.0f)sx=0.0f;if(ex>1.0f)ex=1.0f;if(ex>sx){alignas(64) float tmp[32];float tx,ty;map.map01ToPoint(sx,tmp);float y1=cost(tmp,tx,ty);map.map01ToPoint(ex,tmp);float y2=cost(tmp,tx,ty);Interval* inj=new Interval(sx,ex,y1,y2,(float)dim);inj->ChangeCharacteristic(r*Mmax);Interval* topH=H.front();if(inj->R>1.15f*topH->R){float p=fmaf(-1.0f/initial_len,dmax,1.0f);float k=(no_improve>100&&it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);inj->R=in.xchg.Rtop*k;H.emplace_back(inj);std::push_heap(H.begin(),H.end(),ComparePtr);}}} else if(in.kind==2){const MultiCrossMsg& mX=in.multiXchg;for(uint8_t ii=0;ii<mX.count;++ii){const float* d=&mX.intervals[ii*5];float sx=d[0],ex=d[2];if(sx<0.0f)sx=0.0f;if(ex>1.0f)ex=1.0f;if(ex>sx){alignas(64) float tmp[32];float tx,ty;map.map01ToPoint(sx,tmp);float y1=cost(tmp,tx,ty);map.map01ToPoint(ex,tmp);float y2=cost(tmp,tx,ty);Interval* inj=new Interval(sx,ex,y1,y2,(float)dim);inj->ChangeCharacteristic(r*Mmax);Interval* topH=H.front();if(inj->R>1.15f*topH->R){float p=fmaf(-1.0f/initial_len,dmax,1.0f);float k=(no_improve>100&&it>270)?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);inj->R=d[4]*k;H.emplace_back(inj);std::push_heap(H.begin(),H.end(),ComparePtr);}}}} else if(in.kind==3){const BestSolutionMsg& bm=in.bestSol;if(bm.bestF<bestF*1.15f){alignas(64) float tmp_q[32];memcpy(tmp_q,bm.bestQ,bm.dim*sizeof(float));float t_best=map.pointToT(tmp_q);float t1=fmaxf(a,t_best-0.001f),t2=fminf(b,t_best+0.001f);if(t2>t1){alignas(64) float tq1[32],tq2[32];float xx1,yy1,xx2,yy2;map.map01ToPoint(t1,tq1);float f1=cost(tq1,xx1,yy1);map.map01ToPoint(t2,tq2);float f2=cost(tq2,xx2,yy2);Interval* I=new Interval(t1,t2,f1,f2,(float)dim);I->ChangeCharacteristic(r*Mmax);I->R*=0.90f;H.emplace_back(I);std::push_heap(H.begin(),H.end(),ComparePtr);}if(bm.bestF<bestF){bestF=bm.bestF;bestX=bm.bestX;bestY=bm.bestY;bestQ.assign(bm.bestQ,bm.bestQ+bm.dim);}}}} ++it;} } struct BestPacket{float bestF;int dim;float bestX;float bestY;template<typename Archive> void serialize(Archive& ar,const unsigned int){ar&bestF&dim&bestX&bestY;}}; extern "C" __declspec(dllexport) __declspec(noalias) void AGP_Manip2D(int nSegments,bool variableLengths,float minTheta,float targetX,float targetY,int peanoLevels,int maxIterPerBranch,float r,bool adaptiveMode,float epsilon,unsigned int seed,float** out_bestQ,size_t* out_bestQLen,float* out_bestX,float* out_bestY,float* out_bestF){ const int dim=nSegments+(variableLengths?nSegments:0);g_mc.permCache.resize(dim);for(int i=0;i<dim;++i)g_mc.permCache[i]=i;uint32_t s=g_mc.baseSeed;for(int i=dim-1;i>0;--i){s^=s<<13;s^=s>>17;s^=s<<5;uint32_t j=s%(uint32_t)(i+1);std::swap(g_mc.permCache[i],g_mc.permCache[j]);}g_mc.invMaskCache.resize(dim);for(int k=0;k<dim;++k){s^=s<<13;s^=s>>17;s^=s<<5;g_mc.invMaskCache[k]=(uint64_t)s;} const float thetaMin=-3.14159265359f,thetaMax=3.14159265359f,lenMin=0.1f,lenMax=2.0f;std::vector<float> low;low.reserve(dim);std::vector<float> high;high.reserve(dim);for(int i=0;i<nSegments;++i){low.push_back(thetaMin);high.push_back(thetaMax);}if(variableLengths){for(int i=0;i<nSegments;++i){low.push_back(lenMin);high.push_back(lenMax);}} ManipCost cost(nSegments,variableLengths,targetX,targetY,minTheta);const int rank=g_world->rank(),world=g_world->size();std::vector<float> bestQ;float bestF=FLT_MAX,bx=0.0f,by=0.0f; const int levels0=(std::min)(peanoLevels,8),maxIter0=(int)(maxIterPerBranch*0.2f);MortonND map0(dim,levels0,low.data(),high.data(),g_mc);std::vector<Interval*> H_coarse;std::vector<float> bestQ_coarse;float bestF_coarse=FLT_MAX,bx_coarse=0.0f,by_coarse=0.0f; float M_prior=(variableLengths?2.0f*nSegments:2.0f*nSegments)*(1.0f/(float)(1u<<levels0));if(variableLengths)M_prior+=1.41421356237f*(1.0f/(float)(1u<<levels0)); agp_run_branch_mpi(map0,cost,maxIter0,r,adaptiveMode,epsilon,seed,H_coarse,bestQ_coarse,bestF_coarse,bx_coarse,by_coarse,M_prior); if(bestF_coarse<bestF){bestF=bestF_coarse;bestQ=bestQ_coarse;bx=bx_coarse;by=by_coarse;} if(levels0<peanoLevels){MortonND map1(dim,peanoLevels,low.data(),high.data(),g_mc);std::vector<Interval*> H_fine;std::vector<float> bestQ_fine=bestQ;float bestF_fine=bestF,bx_fine=bx,by_fine=by;float M_prior_fine=(variableLengths?2.0f*nSegments:2.0f*nSegments)*(1.0f/(float)(1u<<peanoLevels));if(variableLengths)M_prior_fine+=1.41421356237f*(1.0f/(float)(1u<<peanoLevels)); if(!H_coarse.empty()){std::sort(H_coarse.begin(),H_coarse.end(),[](const Interval* a,const Interval* b){return a->R<b->R;});const size_t topCount=(size_t)(H_coarse.size()*0.3f);for(size_t i=0;i<topCount&&i<H_coarse.size();++i){const Interval* C=H_coarse[i];alignas(64) float q1[32],q2[32];float x1,y1,x2,y2;map1.map01ToPoint(C->x1,q1);float f1=cost(q1,x1,y1);map1.map01ToPoint(C->x2,q2);float f2=cost(q2,x2,y2);H_fine.push_back(new Interval(C->x1,C->x2,f1,f2,(float)dim));if(f1<bestF_fine){bestF_fine=f1;bestQ_fine.assign(q1,q1+dim);bx_fine=x1;by_fine=y1;}if(f2<bestF_fine){bestF_fine=f2;bestQ_fine.assign(q2,q2+dim);bx_fine=x2;by_fine=y2;}}std::make_heap(H_fine.begin(),H_fine.end(),ComparePtr);} agp_run_branch_mpi(map1,cost,maxIterPerBranch-maxIter0,r,adaptiveMode,epsilon,seed,H_fine,bestQ_fine,bestF_fine,bx_fine,by_fine,M_prior_fine); if(bestF_fine<bestF){bestF=bestF_fine;bestQ=bestQ_fine;bx=bx_fine;by=by_fine;}} BestPacket me{bestF,dim,bx,by}; if(!rank){std::vector<float> winnerQ=bestQ;float winF=bestF,wx=bx,wy=by;for(int i=1;i<world;++i){BestPacket bp;g_world->recv(i,2,bp);std::vector<float> qin;g_world->recv(i,3,qin);if(bp.bestF<winF){winF=bp.bestF;wx=bp.bestX;wy=bp.bestY;winnerQ=qin;}}*out_bestQLen=winnerQ.size();*out_bestQ=(float*)CoTaskMemAlloc(sizeof(float)*(*out_bestQLen));memcpy(*out_bestQ,winnerQ.data(),sizeof(float)*(*out_bestQLen));*out_bestX=wx;*out_bestY=wy;*out_bestF=winF;} else{g_world->send(0,2,me);g_world->send(0,3,bestQ);} } extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel,float a,float b,float c,float d){ g_env=new boost::mpi::environment();g_world=new boost::mpi::communicator();_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);const int rank=g_world->rank();const int world_size=g_world->size();if(world_size==4){new(&gActiveMap) Peano2DMap(peanoLevel,a,b,c,d,rank&3);}g_mc.baseSeed=fmaf(0x9E3779B9u,rank,0x9E3779B9u);return rank; } __forceinline float ShekelFunc(float x,float seed){int i=0;float st=seed,r1,r2,res=0.0f;while(i<10){XOR_RAND(st,r1);float xp=fmaf(-r1,10.0f,x);XOR_RAND(st,r1);XOR_RAND(st,r2);float d=fmaf(fmaf(r1,20.0f,5.0f),xp*xp,fmaf(r2,0.2f,1.0f));d=copysignf(fmaxf(fabsf(d),FLT_MIN),d);res-=1.0f/d;++i;}return res;} __forceinline float RastriginFunc(float x1,float x2){const float t=fmaf(x1,x1,x2*x2);float c1,c2;FABE13_COS(6.28318530717958647692f*x1,c1);FABE13_COS(6.28318530717958647692f*x2,c2);return (t-fmaf(c1+c2,10.0f,-14.6f))*fmaf(-t,0.25f,18.42f);} __forceinline float HillFunc(float x,float seed){int j=0;__declspec(align(32)) float ang[14u];float st=6.28318530717958647692f*x;while(j<14){ang[j]=st*(float)(j+1);++j;}__declspec(align(32)) float sv[14u],cv[14u];FABE13_SINCOS(ang,sv,cv,14u);float state=seed,r1,r2;XOR_RAND(state,r1);float res=fmaf(r1,2.0f,-1.1f);--j;while(j>=0){XOR_RAND(state,r1);XOR_RAND(state,r2);res+=fmaf(fmaf(r1,2.0f,-1.1f),sv[j],fmaf(r2,2.0f,-1.1f)*cv[j]);--j;}return res;} __forceinline float GrishaginFunc(float x1,float x2,float seed){int j=0;__declspec(align(32)) float aj[8u],ak[8u];while(j<8){float pj=3.14159265358979323846f*(float)(j+1);aj[j]=pj*x1;ak[j]=pj*x2;++j;}__declspec(align(32)) float sj[8u],cj[8u],sk[8u],ck[8u];FABE13_SINCOS(aj,sj,cj,8u);FABE13_SINCOS(ak,sk,ck,8u);--j;float p1=0.0f,p2=0.0f;float st=seed,r1,r2;while(j>=0){size_t k=0u;while(k<8u){float s=sj[j]*sj[j];float c=ck[k]*ck[k];XOR_RAND_GRSH(st,r1);XOR_RAND_GRSH(st,r2);p1=fmaf(r1,s,fmaf(r2,c,p1));XOR_RAND_GRSH(st,r1);XOR_RAND_GRSH(st,r2);p2=fmaf(-r1,c,fmaf(r2,s,p2));++k;}--j;}return -sqrtf(fmaf(p1,p1,p2*p2));} extern "C" __declspec(dllexport) __declspec(noalias) void AGP_1D(float global_iterations,float a,float b,float r,bool mode,float epsilon,float seed,float** out_data,size_t* out_len){ Slab* slab=tls.local();slab->current=slab->base;int schetchick=0;const float initial_length=b-a;float dmax=initial_length;const float threshold_03=0.3f*initial_length;const float inv_threshold_03=1.0f/threshold_03;const float start_val=ShekelFunc(a,seed);float best_f=ShekelFunc(b,seed);float x_Rmax_1=a;float x_Rmax_2=b;float y_Rmax_1=start_val;float y_Rmax_2=best_f;std::vector<float,boost::alignment::aligned_allocator<float,16u>> Extr;std::vector<Interval*,boost::alignment::aligned_allocator<Interval*,64u>> R;Extr.reserve((size_t)global_iterations<<2u);R.reserve((size_t)global_iterations<<1u);R.emplace_back(new Interval(a,b,start_val,best_f,1.0f));float Mmax=R.front()->M;float m=r*Mmax; while(true){ const float new_point=Shag(m,x_Rmax_1,x_Rmax_2,y_Rmax_1,y_Rmax_2,1.0f,r);const float new_value=ShekelFunc(new_point,seed);if(new_value<best_f){best_f=new_value;Extr.emplace_back(best_f);Extr.emplace_back(new_point);} std::pop_heap(R.begin(),R.end(),ComparePtr);const Interval* pro=R.back();const float new_x1=pro->x1;const float new_x2=pro->x2;const float len2=new_x2-new_point;const float len1=new_point-new_x1;const float interval_len=len1<len2?len1:len2; if(++schetchick==(int)global_iterations||interval_len<epsilon){Extr.emplace_back((float)schetchick);Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=(float*)CoTaskMemAlloc(sizeof(float)*(*out_len));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));return;} Interval* curr=new Interval(new_x1,new_point,pro->y1,new_value,1.0f);Interval* curr1=new Interval(new_point,new_x2,new_value,pro->y2,1.0f);const float currM=curr->M>curr1->M?curr->M:curr1->M;const size_t r_size=R.size(); if(mode){if(len2+len1==dmax){dmax=len2>len1?len2:len1;for(auto p:R){float L=p->x2-p->x1;if(L>dmax)dmax=L;}}if(threshold_03>dmax&&!(schetchick%3)||10.0f*dmax<initial_length){if(currM>Mmax){Mmax=currM;m=r*Mmax;}const float progress=fmaf(-inv_threshold_03,dmax,1.0f);const float alpha=progress*progress;const float betta=2.0f-alpha;const float MULT=(1.0f/dmax)*Mmax;const float global_coeff=fmaf(MULT,r,-MULT);const float GF=betta*global_coeff;curr->ChangeCharacteristic(fmaf(GF,len1,curr->M*alpha));curr1->ChangeCharacteristic(fmaf(GF,len2,curr1->M*alpha));RecomputeR_AffineM_AVX2(R.data(),r_size,GF,alpha);std::make_heap(R.begin(),R.end(),ComparePtr);}else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}} else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}} R.back()=curr;std::push_heap(R.begin(),R.end(),ComparePtr);R.emplace_back(curr1);std::push_heap(R.begin(),R.end(),ComparePtr);const Interval* top=R.front();x_Rmax_1=top->x1;x_Rmax_2=top->x2;y_Rmax_1=top->y1;y_Rmax_2=top->y2;} } extern "C" __declspec(dllexport) __declspec(noalias) void AGP_2D(const float N,const float global_iterations,const float a,const float b,const float c,const float d,const float r,const bool mode,const float epsilon,const float seed,float** const __restrict out_data,size_t* const __restrict out_len)noexcept{ Slab* const __restrict slab=tls.local();slab->current=slab->base;int schetchick=0;int no_improve=0;const int rank=g_world->rank();const int world_size=g_world->size();while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsg dummy;g_world->recv(boost::mpi::any_source,0,dummy);} const float inv_divider=ldexpf(1.0f,-((gActiveMap.levels<<1)+1));const float x_addition=(b-a)*inv_divider;const float y_addition=(d-c)*inv_divider;const float true_start=a+x_addition;const float true_end=b-x_addition;float x_Rmax_1=true_start;float x_Rmax_2=true_end;const float initial_length=x_Rmax_2-x_Rmax_1;float dmax=initial_length;const float threshold_03=0.3f*initial_length;const float inv_threshold_03=1.0f/threshold_03;const float start_val=rank%3?RastriginFunc(true_end,d-y_addition):RastriginFunc(true_start,c+y_addition);float best_f=rank%2?RastriginFunc(true_start,d-y_addition):RastriginFunc(true_end,c+y_addition);float y_Rmax_1=start_val;float y_Rmax_2=best_f; std::vector<float,boost::alignment::aligned_allocator<float,16u>> Extr;std::vector<Interval* __restrict,boost::alignment::aligned_allocator<Interval* __restrict,64u>> R;Extr.clear();Extr.reserve(static_cast<size_t>(global_iterations)<<2u);R.clear();R.reserve(static_cast<size_t>(global_iterations)<<1u);R.emplace_back(new Interval(true_start,true_end,start_val,best_f,2.0f));const Interval* __restrict top_ptr;float Mmax=R.front()->M;float m=r*Mmax; while(true){ const float interval_len=x_Rmax_2-x_Rmax_1;const bool stagnation=no_improve>100&&schetchick>270;const float p=fmaf(-1.0f/initial_length,dmax,1.0f); while(g_world->iprobe(boost::mpi::any_source,0)){CtrlMsg in;g_world->recv(boost::mpi::any_source,0,in);if(in.kind){if(!rank){Extr.emplace_back(static_cast<float>(schetchick));Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float)*(*out_len)));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));}return;}const float sx=FindX2D_analytic(in.xchg.s_x1,in.xchg.s_x2);const float ex=FindX2D_analytic(in.xchg.e_x1,in.xchg.e_x2);Interval* const __restrict injected=new Interval(sx,ex,RastriginFunc(in.xchg.s_x1,in.xchg.s_x2),RastriginFunc(in.xchg.e_x1,in.xchg.e_x2),2.0f);injected->ChangeCharacteristic(m);if(injected->R>1.15f*top_ptr->R){const float k=stagnation?fmaf(0.5819767068693265f,expm1f(p),0.3f):fmaf(0.3491860241215959f,expm1f(p),0.6f);injected->R=in.xchg.Rtop*k;R.emplace_back(injected);std::push_heap(R.begin(),R.end(),ComparePtr);}} const int T=static_cast<int>(fmaf(-expm1f(p),264.0f,277.0f));const bool want_term=interval_len<epsilon||schetchick==static_cast<int>(global_iterations); if(!(++schetchick%T)||stagnation||want_term){CtrlMsg out;out.kind=want_term;if(!out.kind){float s_x1,s_x2,e_x1,e_x2;HitTest2D_analytic(top_ptr->x1,s_x1,s_x2);HitTest2D_analytic(top_ptr->x2,e_x1,e_x2);out.xchg=CrossMsg{s_x1,s_x2,e_x1,e_x2,top_ptr->R};}for(int i=0;i<world_size;++i){if(i!=rank)g_world->isend(i,0,out);}if(out.kind){if(!rank){Extr.emplace_back(static_cast<float>(schetchick));Extr.emplace_back(interval_len);*out_len=Extr.size();*out_data=reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float)*(*out_len)));memcpy(*out_data,Extr.data(),sizeof(float)*(*out_len));}return;}} const float new_point=Shag(m,x_Rmax_1,x_Rmax_2,y_Rmax_1,y_Rmax_2,2.0f,r);float new_x1_val,new_x2_val;HitTest2D_analytic(new_point,new_x1_val,new_x2_val);const float new_value=RastriginFunc(new_x1_val,new_x2_val);if(new_value<best_f){best_f=new_value;Extr.emplace_back(best_f);Extr.emplace_back(new_x1_val);Extr.emplace_back(new_x2_val);no_improve=0;}else{++no_improve;} std::pop_heap(R.begin(),R.end(),ComparePtr);Interval* const __restrict promejutochny_otrezok=R.back();const float segment_x1=promejutochny_otrezok->x1;const float segment_x2=promejutochny_otrezok->x2;const float len2=segment_x2-new_point;const float len1=new_point-segment_x1;Interval* const __restrict curr=new Interval(segment_x1,new_point,promejutochny_otrezok->y1,new_value,2.0f);Interval* const __restrict curr1=new Interval(new_point,segment_x2,new_value,promejutochny_otrezok->y2,2.0f);const float currM=(std::max)(curr->M,curr1->M);const size_t r_size=R.size(); if(mode){if(len2+len1==dmax){dmax=(std::max)(len1,len2);for(auto pI:R){float L=pI->x2-pI->x1;if(L>dmax)dmax=L;}}if(threshold_03>dmax&&!(schetchick%3)||10.0f*dmax<initial_length){if(currM>Mmax){Mmax=currM;m=r*Mmax;}const float progress=fmaf(-inv_threshold_03,dmax,1.0f);const float alpha=progress*progress;const float betta=2.0f-alpha;const float MULTIPLIER=(1.0f/dmax)*Mmax;const float global_coeff=fmaf(MULTIPLIER,r,-MULTIPLIER);const float GLOBAL_FACTOR=betta*global_coeff;curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR,len1,curr->M*alpha));curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR,len2,curr1->M*alpha));RecomputeR_AffineM_AVX2(R.data(),r_size,GLOBAL_FACTOR,alpha);std::make_heap(R.begin(),R.end(),ComparePtr);}else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}}} else{if(currM>Mmax){if(currM<1.15f*Mmax){Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}else{Mmax=currM;m=r*Mmax;curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);RecomputeR_ConstM_AVX2(R.data(),r_size,m);std::make_heap(R.begin(),R.end(),ComparePtr);}}else{curr->ChangeCharacteristic(m);curr1->ChangeCharacteristic(m);}} R.back()=curr;std::push_heap(R.begin(),R.end(),ComparePtr);R.emplace_back(curr1);std::push_heap(R.begin(),R.end(),ComparePtr);top_ptr=R.front();x_Rmax_1=top_ptr->x1;x_Rmax_2=top_ptr->x2;y_Rmax_1=top_ptr->y1;y_Rmax_2=top_ptr->y2;} } struct RunParams{int nSegments;uint8_t varLen;float minTheta;float tx,ty;int levels,maxIter;float r;uint8_t adaptive;float eps;unsigned int seed;template<typename Archive> void serialize(Archive& ar,const unsigned int){ar&nSegments&varLen&minTheta&tx&ty&levels&maxIter&r&adaptive&eps&seed;}}; extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartManipND(int nSegments,bool variableLengths,float minTheta,float targetX,float targetY,int peanoLevels,int maxIterPerBranch,float r,bool adaptiveMode,float epsilon,unsigned int seed){ RunParams p;p.nSegments=nSegments;p.varLen=(uint8_t)variableLengths;p.minTheta=minTheta;p.tx=targetX;p.ty=targetY;p.levels=peanoLevels;p.maxIter=maxIterPerBranch;p.r=r;p.adaptive=(uint8_t)adaptiveMode;p.eps=epsilon;p.seed=seed;const int rank=g_world->rank();const int world=g_world->size();for(int i=1;i<world;++i){if(i!=rank)g_world->isend(i,1,p);} } extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun(){ while(true){if(g_world->iprobe(0,1)){RunParams p;g_world->recv(0,1,p);float* q=nullptr;size_t qlen=0;float bx=0,by=0,bf=0;AGP_Manip2D(p.nSegments,(bool)p.varLen,p.minTheta,p.tx,p.ty,p.levels,p.maxIter,p.r,(bool)p.adaptive,p.eps,p.seed,&q,&qlen,&bx,&by,&bf);if(q)CoTaskMemFree(q);}Sleep(0);} } extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartWorkers(){int i=1;const int world=g_world->size();while(i<world){g_world->isend(i,1,0);++i;}} extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p){CoTaskMemFree(p);}

مشاركة هذا السؤال والجواب