#include "pch.h" #define XOR_RAND(state, result_v...

Creato il: 30 ottobre 2025

Risposto usando GPT-5 Thinking da Chat01

Domanda

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) {
r = 6.28318530718f - r;
}
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t {
Top = 0b00u,
Down = 0b01u,
Left = 0b10u,
Right = 0b11u
};

__declspec(align(4)) struct Step {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};

__declspec(align(4)) struct InvStep {
const uint8_t q;
const uint8_t next;
};

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }

};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }

};

__declspec(align(16)) struct Slab {
char* const base;
char* current;
char* const end;

text
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) { }

};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 16u);
Slab* slab = (Slab*)_aligned_malloc(32u, 16u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) {
*p = 0;
p += 4096u;
}
return slab;
});

__declspec(align(16)) struct Peano2DMap {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;

text
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st) : levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) { }

};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(16)) struct Interval {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; } __forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f), N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)), quadratic_term((1.0f / N_factor)* delta_y* delta_y), M((1.0f / N_factor)* fabsf(delta_y)) { } __forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }

};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept {
return a->R < b->R;
}

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

text
size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float q[8], nf[8], ord[8]; for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; } const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) { arr[i + k]->R = out[k]; } i += 8u; } } while (i < n) { arr[i]->ChangeCharacteristic(m); ++i; }

}

__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);

text
size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; len[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; } const __m256 vlen = _mm256_load_ps(len); const __m256 vM = _mm256_load_ps(Mv); const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) { arr[i + k]->R = out[k]; } i += 8u; } } while (i < n) { const Interval* p = arr[i]; const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha); arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor)); ++i; }

}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));

text
return _N == 1.0f ? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f : _N == 2.0f ? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f : fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;

}

struct MortonCachePerRank {
std::vector<int> permCache;
std::vector<uint64_t> invMaskCache;
std::vector<uint64_t> levelMasks;
uint32_t baseSeed;

MortonCachePerRank() : baseSeed(0) {}

};

static MortonCachePerRank g_mc;

struct MortonND {
int dim;
int levels;
uint64_t scale;
std::vector<float> low, high;
std::vector<int> perm;
std::vector<uint64_t> invMask;
mutable std::vector<uint64_t> coord;

text
MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc) : dim(D), levels(L), scale(1ull), low(lows, lows + D), high(highs, highs + D), perm(mc.permCache.begin(), mc.permCache.begin() + D), invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), coord(D, 0ull) { uint64_t level_mask = (L <= 12) ? ((1ull << L) - 1ull) : ((1ull << 12) - 1ull); for (int k = 0; k < D; ++k) { invMask[k] &= level_mask; } int maxbits = 63; int need = levels * D; if (need > maxbits) { levels = maxbits / D; if (levels < 1) levels = 1; } scale = 1ull << ((uint64_t)levels * (uint64_t)D); } __forceinline void map01ToPoint(float t, float* out) const noexcept { if (t < 0.0f) t = 0.0f; else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f); uint64_t idx = (uint64_t)(t * (double)scale); if (idx >= scale) idx = scale - 1; for (int d = 0; d < dim; ++d) { coord[d] = 0ull; } for (int bit = 0; bit < levels; ++bit) { for (int d = 0; d < dim; ++d) { int pd = perm[d]; uint64_t b = (idx >> ((bit * dim) + d)) & 1ull; coord[pd] |= (b << bit); } } for (int d = 0; d < dim; ++d) { if (invMask[d]) coord[d] ^= invMask[d]; } const float invC = 1.0f / (float)((uint64_t)1 << levels); for (int d = 0; d < dim; ++d) { float c0 = low[d] + (high[d] - low[d]) * ((float)coord[d] * invC); float cs = (high[d] - low[d]) * invC; out[d] = fmaf(0.5f, cs, c0); } }

};

struct ManipCost {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;

text
ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta) : n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) { } __forceinline float operator()(const float* q, float& out_x, float& out_y) const noexcept { const float* th = q; const float* L = variableLen ? (q + n) : nullptr; float x = 0.0f, y = 0.0f, phi = 0.0f; float penC = 0.0f; const float Lc = 1.0f; std::vector<float> phi_arr(n); std::vector<float> s_arr(n), c_arr(n); for (int i = 0; i < n; ++i) { phi += th[i]; phi_arr[i] = phi; } FABE13_SINCOS(phi_arr.data(), s_arr.data(), c_arr.data(), n); for (int i = 0; i < n; ++i) { float Li = variableLen ? L[i] : Lc; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); } for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); if (ai < minTheta) { float violation = minTheta - ai; penC += violation * violation; } } float dx = x - targetX; float dy = y - targetY; float dist = sqrtf(fmaf(dx, dx, dy * dy)); out_x = x; out_y = y; return dist + penC; }

};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = (uint32_t)(norm * (float)scale); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1; while (l >= 0) { const uint32_t q = (idx >> (l * 2)) & 3u; const Step s = g_step_tbl[type][q]; type = s.next; sx *= 0.5f; sy *= 0.5f; x1 += s.dx ? sx : 0.0f; x2 += s.dy ? sy : 0.0f; --l; } out_x1 = x1 + sx * 0.5f; out_x2 = x2 + sy * 0.5f;

}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
const float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0; while (l < levels) { sx *= 0.5f; sy *= 0.5f; const float mx = x0 + sx; const float my = y0 + sy; const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my)); const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_recip = 1.0f / (float)scale; return fmaf((float)idx * scale_recip, lenx, a);

}

static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY) {
const int dim = cost.n + (cost.variableLen ? cost.n : 0);

text
std::vector<float> q(dim); bestQ.reserve(dim); float x, y; int no_improve = 0; auto evalAt = [&](float t) -> float { map.map01ToPoint(t, q.data()); float f = cost(q.data(), x, y); if (f < bestF) { bestF = f; bestQ = q; bestX = x; bestY = y; no_improve = 0; } else { no_improve++; } return f; }; float a = 0.0f, b = 1.0f; float f_a = evalAt(a); float f_b = evalAt(b); const int K = (std::min)((std::max)(2 * dim, 8), 128); H.clear(); H.reserve((size_t)maxIter + K + 8); float Mmax = 1e-3f; float prev_t = a; float prev_f = f_a; for (int k = 1; k <= K; ++k) { float t = a + (b - a) * ((float)k / (K + 1)); float f = evalAt(t); Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr); prev_t = t; prev_f = f; } Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim); Mmax = (std::max)(Mmax, tail->M); float m = r * Mmax; tail->ChangeCharacteristic(m); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtr); float dmax = b - a; const float initial_len = dmax; const float thr03 = 0.3f * initial_len; const float inv_thr03 = 1.0f / thr03; int it = 0; const int rank = g_world->rank(); const int world = g_world->size(); while (it < maxIter) { const float p = fmaf(-1.0f / initial_len, dmax, 1.0f); bool stagnation = (no_improve > 100) && (it > 270); float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { break; } else { return; } } else { float sx = in.xchg.s_x1, ex = in.xchg.e_x1; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex <= sx) continue; std::vector<float> tmp(dim); float tmp_x, tmp_y; map.map01ToPoint(sx, tmp.data()); float y1 = cost(tmp.data(), tmp_x, tmp_y); map.map01ToPoint(ex, tmp.data()); float y2 = cost(tmp.data(), tmp_x, tmp_y); Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(m); Interval* top = H.front(); if (inj->R > 1.15f * top->R) { inj->R = in.xchg.Rtop * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr); } } } std::pop_heap(H.begin(), H.end(), ComparePtr); Interval* cur = H.back(); H.pop_back(); const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2; float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r); tNew = fminf(fmaxf(tNew, a), b); float fNew = evalAt(tNew); Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim); Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim); float Mloc = (std::max)(left->M, right->M); if (adaptive) { float len1 = tNew - x1, len2 = x2 - tNew; if (len1 + len2 == dmax) { dmax = (std::max)(len1, len2); for (auto p : H) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } } if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) { if (Mloc > Mmax) { Mmax = Mloc; m = r * Mmax; } const float progress = fmaf(-dmax, inv_thr03, 1.0f); const float alpha = progress * progress; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r, -MULT); const float GF = fmaf(beta, global_coeff, 0.0f); left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha)); right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha)); size_t sz = H.size(); RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtr); } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } H.push_back(left); std::push_heap(H.begin(), H.end(), ComparePtr); H.push_back(right); std::push_heap(H.begin(), H.end(), ComparePtr); Interval* top = H.front(); float interval_len = top->x2 - top->x1; const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f); bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1); if (!(it % T) || want_term) { CtrlMsg out; out.kind = want_term; out.xchg.s_x1 = top->x1; out.xchg.e_x1 = top->x2; out.xchg.s_x2 = 0.0f; out.xchg.e_x2 = 0.0f; out.xchg.Rtop = top->R; for (int i = 0; i < world; ++i) { if (i != rank) g_world->isend(i, 0, out); } if (want_term) { break; } } ++it; }

}

struct BestPacket {
float bestF;
int dim;
float bestX;
float bestY;

text
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; }

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF) {

text
const int dim = nSegments + (variableLengths ? nSegments : 0); const float thetaMin = -3.14159265359f; const float thetaMax = 3.14159265359f; const float lenMin = 0.1f; const float lenMax = 2.0f; std::vector<float> low, high; low.reserve(dim); high.reserve(dim); for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); } if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } } ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta); const int rank = g_world->rank(); const int world = g_world->size(); MortonND map(dim, peanoLevels, low.data(), high.data(), g_mc); std::vector<Interval*> H; std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f; H.reserve((size_t)maxIterPerBranch + 256); bestQ.reserve(dim); agp_run_branch_mpi(map, cost, maxIterPerBranch, r, adaptiveMode, epsilon, seed, H, bestQ, bestF, bx, by); BestPacket me{ bestF, dim, bx, by }; if (!rank) { std::vector<float> winnerQ = bestQ; float winF = bestF, wx = bx, wy = by; for (int i = 1; i < world; ++i) { BestPacket bp; g_world->recv(i, 2, bp); std::vector<float> qin; g_world->recv(i, 3, qin); if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; } } *out_bestQLen = winnerQ.size(); *out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen)); std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen)); *out_bestX = wx; *out_bestY = wy; *out_bestF = winF; } else { g_world->send(0, 2, me); g_world->send(0, 3, bestQ); }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();

text
const int rank = g_world->rank(); const int world_size = g_world->size(); if (world_size == 4) { switch (rank) { case 0: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Top)); break; case 1: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Down)); break; case 2: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Left)); break; default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Right)); } } const int MaxDim = 16; g_mc.baseSeed = 0x9E3779B9u * (rank + 1); g_mc.permCache.resize(MaxDim); for (int i = 0; i < MaxDim; ++i) { g_mc.permCache[i] = i; } g_mc.invMaskCache.resize(MaxDim); for (int k = 0; k < MaxDim; ++k) { g_mc.invMaskCache[k] = 0ull; } g_mc.levelMasks.resize(13); for (int level = 0; level <= 12; ++level) { g_mc.levelMasks[level] = (level == 0) ? 0ull : (1ull << level) - 1ull; } return rank;

}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0;
float st = seed, r1, r2, res = 0.0f;

text
while (i < 10) { XOR_RAND(st, r1); float xp = fmaf(-r1, 10.0f, x); XOR_RAND(st, r1); XOR_RAND(st, r2); float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f)); d = copysignf(fmaxf(fabsf(d), FLT_MIN), d); res -= 1.0f / d; ++i; } return res;

}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;

text
while (j < 14) { ang[j] = st * (float)(j + 1); ++j; } __declspec(align(32)) float sv[14u], cv[14u]; FABE13_SINCOS(ang, sv, cv, 14u); float state = seed, r1, r2; XOR_RAND(state, r1); float res = fmaf(r1, 2.0f, -1.1f); --j; while (j >= 0) { XOR_RAND(state, r1); XOR_RAND(state, r2); res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]); --j; } return res;

}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];

text
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; } __declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u]; FABE13_SINCOS(aj, sj, cj, 8u); FABE13_SINCOS(ak, sk, ck, 8u); --j; float p1 = 0.0f, p2 = 0.0f; float st = seed, r1, r2; while (j >= 0) { size_t k = 0u; while (k < 8u) { float s = sj[j] * sj[j]; float c = ck[k] * ck[k]; XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p1 = fmaf(r1, s, fmaf(r2, c, p1)); XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p2 = fmaf(-r1, c, fmaf(r2, s, p2)); ++k; } --j; } return -sqrtf(fmaf(p1, p1, p2 * p2));

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; const float initial_length = b - a; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed); float x_Rmax_1 = a; float x_Rmax_2 = b; float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R; Extr.reserve((size_t)global_iterations << 2u); R.reserve((size_t)global_iterations << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); const float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); const Interval* pro = R.back(); const float new_x1 = pro->x1; const float new_x2 = pro->x2; const float len2 = new_x2 - new_point; const float len1 = new_point - new_x1; const float interval_len = len1 < len2 ? len1 : len2; if (++schetchick == (int)global_iterations || interval_len < epsilon) { Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len)); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f); Interval* curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } } if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = progress * progress; const float betta = 2.0f - alpha; const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r, -MULT); const float GF = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha)); RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha); std::make_heap(R.begin(), R.end(), ComparePtr); } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); const Interval* top = R.front(); x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2; }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; int no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size(); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); const Interval* __restrict top_ptr; float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float interval_len = x_Rmax_2 - x_Rmax_1; const bool stagnation = no_improve > 100 && schetchick > 270; const float p = fmaf(-1.0f / initial_length, dmax, 1.0f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2); const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2); Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f); injected->ChangeCharacteristic(m); if (injected->R > 1.15f * top_ptr->R) { const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f); injected->R = in.xchg.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } } const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f)); const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations); if (!(++schetchick % T) || stagnation || want_term) { CtrlMsg out; out.kind = want_term; if (!out.kind) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; } for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); } if (out.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } } const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); const float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; } else { ++no_improve; } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* const __restrict promejutochny_otrezok = R.back(); const float segment_x1 = promejutochny_otrezok->x1; const float segment_x2 = promejutochny_otrezok->x2; const float len2 = segment_x2 - new_point; const float len1 = new_point - segment_x1; Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); const float currM = (std::max)(curr->M, curr1->M); const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } } if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = progress * progress; const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha); std::make_heap(R.begin(), R.end(), ComparePtr); } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }

}

struct RunParams {
int nSegments;
uint8_t varLen;
float minTheta;
float tx, ty;
int levels, maxIter;
float r;
uint8_t adaptive;
float eps;
unsigned int seed;

text
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; }

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(
int nSegments,
bool variableLengths,
float minTheta,
float targetX,
float targetY,
int peanoLevels,
int maxIterPerBranch,
float r,
bool adaptiveMode,
float epsilon,
unsigned int seed)
{
RunParams p;
p.nSegments = nSegments;
p.varLen = (uint8_t)variableLengths;
p.minTheta = minTheta;
p.tx = targetX;
p.ty = targetY;
p.levels = peanoLevels;
p.maxIter = maxIterPerBranch;
p.r = r;
p.adaptive = (uint8_t)adaptiveMode;
p.eps = epsilon;
p.seed = seed;

text
const int rank = g_world->rank(); const int world = g_world->size(); for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p;
g_world->recv(0, 1, p);
float* q = nullptr;
size_t qlen = 0;
float bx = 0, by = 0, bf = 0;

text
AGP_Manip2D( p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf ); if (q) CoTaskMemFree(q); } Sleep(0); }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() {
int i = 1;
const int world = g_world->size();

text
while (i < world) { g_world->isend(i, 1, 0); ++i; }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) {
CoTaskMemFree(p);
} - обрати внимание на многомерную версию АГП, одномерная и двумерная версии уже достаточно оптимизированы, но многомерная сырая - нужно выжать из неё максимум - я готов пойти на всё ради абсолютной скорости и алгоритмической корректности жертвуя читаемостью/поддерживаемостью/безопасностью/переносимостью, ты должен выровнять все структуры учитывая что я использую dll в управляемом winforms коде, ты должен везде где это разумно использовать fmaf, AVX для особо крупных блоков не имеет смысла - размерность задачи никогда точно не перевалит за 15 даже с учётом переменных длин, а глубина развёртки никогда не будет больше 12-13 - ты должен учитывать эти факты для более агрессивной оптимизации, используй предвычисления, jump-table, по максимуму загружай регистры и быстрый кеш желательно L-1, если что-то можно уместить в одну длинную уродливую формулу - делай это, в коде встречаются константы - убери их - подставляй сразу готовые числа в формулы - анализируй операции на предмет их стоимости и используй более быстрые операции, применяй различные битовые трюки, даже если это выглядит очень неочевидным, можно также изменить немного функцию шага - она вызывается часто - а там есть дорогое powf для обощённого случая - можно создать готовую таблицу предвычесленных значений и прыгать по ней, часть кода писал deepseek - и тут может быть много ненужного бреда - если видишь в части кода очевидные бредовые идеи в части относящейся к многомерному алгоритму - смело выкидывай и заменяй на более оптимальные решения, каждая наносекунда на счету - я ставлю рекорд гинеса по скорости агп, я мог что-то упустить из виду, также имей ввиду что это будет запускаться только на моём 6ти ядерном интел ай5-9600к, запускаюсь всегда на 6ти процессах, ты должен максимально внимательно и долго насколько это возможно анализировать код и выдать мне целиком исправленный код без комментариев - также с оптимальным размещением блоков кода и переменных для лучшей локальности - не забывай про оптимальные типы данных и различные диррективы такие как final для структур или forceline для статических функций - вперёд - докажи что ты в разы умнее чем deepseek - воспринимай это как что-то личное - что он не смог а ты сможешь

Pensiero

Planning code modifications

The request is clear: modify the code without comments, use aggressive intrinsics, and avoid constants by inlining numbers directly. I need to adjust struct alignment with __declspec(align(64)) for arrays, and use "final" for structs (though it doesn't affect alignment). I should precompute tables for N (values 3 to 15) for the Shag function.

Clarifying Shag parameters

In the Shag function, N seems linked to the dimension (dim). For 2D manipulation, N = 2, but dimensions can reach 30, though the user limits N to 15. Depending on the context, I'll precompute power tables for faster calculation. But for diff, I’ll optimize by exponentiation by squaring or using precomputed values.

Clarifying the Shag function details

In the Shag function, N can be a float, but in the code, it’s set as the integer dim. I’ll implement a switch for N values between 3 and 15, computing powers efficiently. For comparison, I’ll adjust the heap to prioritize the largest R value using std::push_heap with a correct comparator.

Heap behavior and inlining thoughts

The comparator used in std::push_heap will work as expected under the default std::less for a max-heap. With ComparePtr, R is used to prioritize the element with the greatest R. As for AVX, it's not beneficial here due to the small dimension, so no need for heavy vectorization. I'll optimize by avoiding vectorization in heap operations and using restrict qualifiers, inlining, and precomputing K.

Optimizing data structures

We’ll improve performance by inlining numeric constants where possible and using __forceinline for small functions. Dynamic allocations should be minimized, so we’ll use stack arrays for phi, s_arr, and c_arr where applicable. For the MortonND map, we’ll precompute bit masks and map operations using optimized bit manipulations for faster execution with vector instructions, reducing overhead significantly. We'll align memory usage and pack fields together to optimize cache performance.

Optimizing calculations

We’re focusing on performance improvements by inlining constants and using __forceinline for small functions. Static array allocations replace vector-based dynamic allocations for variables like q_local[16]. We’ll precompute values like masks, scales, and step sizes to minimize runtime overhead. Bit manipulations with optimized functions like _pext_u64 will speed up processing in critical mapping functions like map01ToPoint. Other optimizations include reducing memory allocation complexity and leveraging fast math approximations for performance.

Thought for 1m 26s

Risposta

cpp
#include "pch.h" #define XOR_RAND(state, result_var) \ do { \ int s = (state); \ s ^= s << 13; \ s ^= s >> 17; \ s ^= s << 5; \ state = s; \ result_var = state * 0x1.0p-32f; \ } while (0) #define XOR_RAND_GRSH(state, result_var) \ do { \ int s = (state); \ s ^= s << 13; \ s ^= s >> 17; \ s ^= s << 5; \ state = s; \ result_var = fmaf(state, 0x1.0p-31f, -1.0f); \ } while (0) #define FABE13_COS(x, result_var) \ do { \ const float _ax_ = fabsf(x); \ float _r_ = fmodf(_ax_, 6.28318530718f); \ if (_r_ > 3.14159265359f) _r_ = 6.28318530718f - _r_; \ if (_r_ < 1.57079632679f) { \ const float _t2_ = _r_ * _r_; \ const float _t4_ = _t2_ * _t2_; \ result_var = fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \ } else { \ _r_ = 3.14159265359f - _r_; \ const float _t2_ = _r_ * _r_; \ const float _t4_ = _t2_ * _t2_; \ result_var = -fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \ } \ } while (0) #define FABE13_SIN(x, result_var) \ do { \ const float _x_ = (x); \ const float _ax_ = fabsf(_x_); \ float _r_ = fmodf(_ax_, 6.28318530718f); \ bool _sfl_ = _r_ > 3.14159265359f; \ if (_sfl_) _r_ = 6.28318530718f - _r_; \ bool _cfl_ = _r_ > 1.57079632679f; \ if (_cfl_) _r_ = 3.14159265359f - _r_; \ const float _t2_ = _r_ * _r_; \ float _s = fmaf(_t2_, fmaf(_t2_, fmaf(_t2_, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * _r_; \ result_var = ((_x_ < 0.0f) ^ _sfl_) ? -_s : _s; \ } while (0) #define FABE13_SINCOS(in, sin_out, cos_out, n) \ do { \ int i = 0; \ const int limit = (n) & ~7; \ if ((n) >= 8) { \ static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \ static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \ static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \ static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \ static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \ static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f); \ static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \ static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \ static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \ static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f); \ static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f); \ static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \ static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \ static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \ while (i < limit) { \ const __m256 vx = _mm256_load_ps(&(in)[i]); \ const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); \ __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \ q = _mm256_sub_ps(q, BIAS); \ const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); \ const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); \ const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); \ const __m256 t2 = _mm256_mul_ps(r2, r2); \ const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0); \ const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2); \ const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \ const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \ _mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip)); \ _mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip)); \ i += 8; \ } \ } \ while (i < (n)) { \ const float x = (in)[i]; \ const float ax = fabsf(x); \ float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \ q -= 12582912.0f; \ float r = fmaf(-6.28318530718f, q, ax); \ const bool sflip = r > 3.14159265359f; \ if (sflip) r = 6.28318530718f - r; \ const bool cflip = r > 1.57079632679f; \ if (cflip) r = 3.14159265359f - r; \ const float t2 = r * r; \ const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f); \ const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r; \ (cos_out)[i] = cflip ? -c : c; \ (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \ ++i; \ } \ } while (0) enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u }; __declspec(align(4)) struct Step final { const uint8_t next; const uint8_t dx; const uint8_t dy; }; __declspec(align(4)) struct InvStep final { const uint8_t q; const uint8_t next; }; __declspec(align(64)) static const Step g_step_tbl[4][4] = { { {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} }, { {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} }, { {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} }, { {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} } }; __declspec(align(64)) static const InvStep g_inv_tbl[4][4] = { { {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} }, { {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} }, { {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} }, { {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} } }; static const boost::mpi::environment* g_env; static const boost::mpi::communicator* g_world; __declspec(align(16)) struct CrossMsg final { float s_x1, s_x2; float e_x1, e_x2; float Rtop; template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar & s_x1 & s_x2 & e_x1 & e_x2 & Rtop; } }; __declspec(align(16)) struct CtrlMsg final { bool kind; CrossMsg xchg; template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar & kind & xchg; } }; __declspec(align(16)) struct Slab final { char* const base; char* current; char* const end; __forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {} }; static tbb::enumerable_thread_specific<Slab*> tls([]() noexcept { void* memory = _aligned_malloc(16777216u, 64u); Slab* slab = (Slab*)_aligned_malloc(32u, 64u); new (slab) Slab(memory, 16777216u); char* p = slab->base; while (p < slab->end) { *p = 0; p += 4096u; } return slab; }); __declspec(align(16)) struct Peano2DMap final { const int levels; const float a, b, c, d; const float lenx, leny; const float inv_lenx; const uint32_t scale; const uint8_t start; __forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st) : levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) {} }; static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0); __forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept; __declspec(align(64)) struct Interval final { const float x1; const float x2; const float y1; const float y2; const float delta_y; const float ordinate_factor; const float N_factor; const float quadratic_term; const float M; float R; __forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; } __forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f), N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)), quadratic_term((1.0f / N_factor) * delta_y * delta_y), M((1.0f / N_factor) * fabsf(delta_y)) {} __forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); } }; __forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept { return a->R < b->R; } __forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) { const __m256 vm = _mm256_set1_ps(m); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float q[8], nf[8], ord[8]; for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; } const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k]; i += 8u; } } while (i < n) { arr[i]->ChangeCharacteristic(m); ++i; } } __forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) { const __m256 vGF = _mm256_set1_ps(GF); const __m256 va = _mm256_set1_ps(alpha); size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; len[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; } const __m256 vlen = _mm256_load_ps(len); const __m256 vM = _mm256_load_ps(Mv); const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k]; i += 8u; } } while (i < n) { const Interval* p = arr[i]; const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha); arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor)); ++i; } } __forceinline float fast_pow_int(float v, int n) { float r; switch (n) { case 1: r = v; break; case 2: r = v * v; break; case 3: { float v2 = v * v; r = v2 * v; } break; case 4: { float v2 = v * v; r = v2 * v2; } break; case 5: { float v2 = v * v; r = v2 * v2 * v; } break; case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break; case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break; case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break; case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break; case 10:{ float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break; case 11:{ float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break; case 12:{ float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break; case 13:{ float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break; case 14:{ float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break; default:{ float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break; } return r; } __forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) { const float diff = y2 - y1; const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((*((uint32_t*)&diff)) & 0x80000000u) ^ 0x80000000u)))); const int Ni = (int)(_N + 0.5f); if (Ni == 1) return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f; if (Ni == 2) return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f; const float invmN = 1.0f / fast_pow_int(_m, Ni); const float dN = fast_pow_int(fabsf(diff), Ni); return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f; } struct MortonCachePerRank { std::vector<int> permCache; std::vector<uint64_t> invMaskCache; std::vector<uint64_t> levelMasks; uint32_t baseSeed; MortonCachePerRank() : baseSeed(0) {} }; static MortonCachePerRank g_mc; struct MortonND final { int dim; int levels; uint64_t scale; std::vector<float> low, high; std::vector<float> step, baseOff; std::vector<int> perm; std::vector<uint64_t> invMask; std::vector<uint64_t> pextMask; float invScaleLevel; __forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc) : dim(D), levels(L), scale(1ull), low(lows, lows + D), high(highs, highs + D), step(D, 0.0f), baseOff(D, 0.0f), perm(mc.permCache.begin(), mc.permCache.begin() + D), invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), pextMask(D, 0ull), invScaleLevel(0.0f) { uint64_t level_mask = (L <= 12) ? ((1ull << L) - 1ull) : ((1ull << 12) - 1ull); for (int k = 0; k < D; ++k) invMask[k] &= level_mask; int maxbits = 63; int need = L * D; if (need > maxbits) { levels = maxbits / D; if (levels < 1) levels = 1; } scale = 1ull << (uint64_t)(levels * D); invScaleLevel = 1.0f / (float)((uint64_t)1 << levels); for (int d = 0; d < dim; ++d) { float rng = high[d] - low[d]; float st = rng * invScaleLevel; step[d] = st; baseOff[d] = fmaf(0.5f, st, low[d]); } for (int d = 0; d < dim; ++d) { uint64_t m = 0ull; uint64_t bitpos = (uint64_t)d; for (int b = 0; b < levels; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; } pextMask[d] = m; } } __forceinline void map01ToPoint(float t, float* __restrict out) const noexcept { if (t < 0.0f) t = 0.0f; else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f); uint64_t idx = (uint64_t)((double)t * (double)scale); if (idx >= scale) idx = scale - 1; for (int d = 0; d < dim; ++d) { const int pd = perm[d]; uint64_t bits = _pext_u64(idx, pextMask[d]); if (invMask[pd]) bits ^= invMask[pd]; out[pd] = fmaf(step[pd], (float)bits, baseOff[pd]); } } }; struct ManipCost final { int n; bool variableLen; float targetX, targetY; float minTheta; __forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta) : n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {} __forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept { const float* th = q; const float* L = variableLen ? (q + n) : nullptr; __declspec(align(64)) float phi[16]; __declspec(align(64)) float s_arr[16]; __declspec(align(64)) float c_arr[16]; float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f; for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; } FABE13_SINCOS(phi, s_arr, c_arr, n); const float Lc = 1.0f; if (variableLen) { for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); } } else { for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); } } for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); float v = minTheta - ai; float u = v > 0.0f ? v : 0.0f; penC = fmaf(u, u, penC); } float dx = x - targetX; float dy = y - targetY; float dist = sqrtf(fmaf(dx, dx, dy * dy)); out_x = x; out_y = y; return dist + penC; } }; __forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) { const float a = gActiveMap.a; const float inv_lenx = gActiveMap.inv_lenx; const uint32_t scale = gActiveMap.scale; const uint32_t scale_minus_1 = scale - 1u; const float lenx = gActiveMap.lenx; const float leny = gActiveMap.leny; const float c = gActiveMap.c; const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels; float norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = (uint32_t)(norm * (float)scale); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1; while (l >= 0) { const uint32_t q = (idx >> (l * 2)) & 3u; const Step s = g_step_tbl[type][q]; type = s.next; sx *= 0.5f; sy *= 0.5f; x1 += s.dx ? sx : 0.0f; x2 += s.dy ? sy : 0.0f; --l; } out_x1 = x1 + sx * 0.5f; out_x2 = x2 + sy * 0.5f; } __forceinline float FindX2D_analytic(float px, float py) { const float a = gActiveMap.a; const float b = gActiveMap.b; const float c = gActiveMap.c; const float d = gActiveMap.d; const float lenx = gActiveMap.lenx; const float leny = gActiveMap.leny; const uint32_t scale = gActiveMap.scale; const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels; const float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0; while (l < levels) { sx *= 0.5f; sy *= 0.5f; const float mx = x0 + sx; const float my = y0 + sy; const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my)); const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_recip = 1.0f / (float)scale; return fmaf((float)idx * scale_recip, lenx, a); } static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY) { const int dim = cost.n + (cost.variableLen ? cost.n : 0); __declspec(align(64)) float q_local[16]; bestQ.reserve(dim); float x, y; int no_improve = 0; auto evalAt = [&](float t) __forceinline -> float { map.map01ToPoint(t, q_local); float f = cost(q_local, x, y); if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; } else { no_improve++; } return f; }; float a = 0.0f, b = 1.0f; float f_a = evalAt(a); float f_b = evalAt(b); const int K = (std::min)((std::max)(2 * dim, 8), 128); H.clear(); H.reserve((size_t)maxIter + K + 8); float Mmax = 1e-3f; float prev_t = a; float prev_f = f_a; for (int k = 1; k <= K; ++k) { float t = a + (b - a) * ((float)k / (K + 1)); float f = evalAt(t); Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim); Mmax = (std::max)(Mmax, I->M); I->ChangeCharacteristic(r * Mmax); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr); prev_t = t; prev_f = f; } Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim); Mmax = (std::max)(Mmax, tail->M); float m = r * Mmax; tail->ChangeCharacteristic(m); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtr); float dmax = b - a; const float initial_len = dmax; const float thr03 = 0.3f * initial_len; const float inv_thr03 = 1.0f / thr03; int it = 0; const int rank = g_world->rank(); const int world = g_world->size(); while (it < maxIter) { const float p = fmaf(-1.0f / initial_len, dmax, 1.0f); bool stagnation = (no_improve > 100) && (it > 270); float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) break; else return; } else { float sx = in.xchg.s_x1, ex = in.xchg.e_x1; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex <= sx) continue; __declspec(align(64)) float tmp[16]; float tmp_x, tmp_y; map.map01ToPoint(sx, tmp); float y1 = cost(tmp, tmp_x, tmp_y); map.map01ToPoint(ex, tmp); float y2 = cost(tmp, tmp_x, tmp_y); Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(m); Interval* top = H.front(); if (inj->R > 1.15f * top->R) { inj->R = in.xchg.Rtop * k; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr); } } } std::pop_heap(H.begin(), H.end(), ComparePtr); Interval* cur = H.back(); H.pop_back(); const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2; float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r); tNew = fminf(fmaxf(tNew, a), b); float fNew = evalAt(tNew); Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim); Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim); float Mloc = (std::max)(left->M, right->M); if (adaptive) { float len1 = tNew - x1, len2 = x2 - tNew; if (len1 + len2 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : H) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } } if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) { if (Mloc > Mmax) { Mmax = Mloc; m = r * Mmax; } const float progress = fmaf(-dmax, inv_thr03, 1.0f); const float alpha = progress * progress; const float beta = fmaf(-alpha, 1.0f, 2.0f); const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r, -MULT); const float GF = fmaf(beta, global_coeff, 0.0f); left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha)); right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha)); size_t sz = H.size(); RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha); std::make_heap(H.begin(), H.end(), ComparePtr); } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } H.push_back(left); std::push_heap(H.begin(), H.end(), ComparePtr); H.push_back(right); std::push_heap(H.begin(), H.end(), ComparePtr); Interval* top = H.front(); float interval_len = top->x2 - top->x1; const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f); bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1); if (!(it % T) || want_term) { CtrlMsg out; out.kind = want_term; out.xchg.s_x1 = top->x1; out.xchg.e_x1 = top->x2; out.xchg.s_x2 = 0.0f; out.xchg.e_x2 = 0.0f; out.xchg.Rtop = top->R; for (int i = 0; i < world; ++i) { if (i != rank) g_world->isend(i, 0, out); } if (want_term) break; } ++it; } } struct BestPacket { float bestF; int dim; float bestX; float bestY; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar & bestF & dim & bestX & bestY; } }; extern "C" __declspec(dllexport) __declspec(noalias) void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed, float** out_bestQ, size_t* out_bestQLen, float* out_bestX, float* out_bestY, float* out_bestF) { const int dim = nSegments + (variableLengths ? nSegments : 0); const float thetaMin = -3.14159265359f; const float thetaMax = 3.14159265359f; const float lenMin = 0.1f; const float lenMax = 2.0f; std::vector<float> low, high; low.reserve(dim); high.reserve(dim); for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); } if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } } ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta); const int rank = g_world->rank(); const int world = g_world->size(); MortonND map(dim, peanoLevels, low.data(), high.data(), g_mc); std::vector<Interval*> H; std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f; H.reserve((size_t)maxIterPerBranch + 256); bestQ.reserve(dim); agp_run_branch_mpi(map, cost, maxIterPerBranch, r, adaptiveMode, epsilon, seed, H, bestQ, bestF, bx, by); BestPacket me{ bestF, dim, bx, by }; if (!rank) { std::vector<float> winnerQ = bestQ; float winF = bestF, wx = bx, wy = by; for (int i = 1; i < world; ++i) { BestPacket bp; g_world->recv(i, 2, bp); std::vector<float> qin; g_world->recv(i, 3, qin); if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; } } *out_bestQLen = winnerQ.size(); *out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen)); std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen)); *out_bestX = wx; *out_bestY = wy; *out_bestF = winF; } else { g_world->send(0, 2, me); g_world->send(0, 3, bestQ); } } extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) { g_env = new boost::mpi::environment(); g_world = new boost::mpi::communicator(); const int rank = g_world->rank(); const int world_size = g_world->size(); if (world_size == 4) { switch (rank) { case 0: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Top); break; case 1: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Down); break; case 2: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Left); break; default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Right); } } const int MaxDim = 16; g_mc.baseSeed = 0x9E3779B9u * (rank + 1); g_mc.permCache.resize(MaxDim); for (int i = 0; i < MaxDim; ++i) g_mc.permCache[i] = i; g_mc.invMaskCache.resize(MaxDim); for (int k = 0; k < MaxDim; ++k) g_mc.invMaskCache[k] = 0ull; g_mc.levelMasks.resize(13); for (int level = 0; level <= 12; ++level) g_mc.levelMasks[level] = (level == 0) ? 0ull : (1ull << level) - 1ull; return rank; } __forceinline float ShekelFunc(float x, float seed) { int i = 0; float st = seed, r1, r2, res = 0.0f; while (i < 10) { XOR_RAND(st, r1); float xp = fmaf(-r1, 10.0f, x); XOR_RAND(st, r1); XOR_RAND(st, r2); float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f)); d = copysignf(fmaxf(fabsf(d), FLT_MIN), d); res -= 1.0f / d; ++i; } return res; } __forceinline float RastriginFunc(float x1, float x2) { const float t = fmaf(x1, x1, x2 * x2); float c1, c2; FABE13_COS(6.28318530717958647692f * x1, c1); FABE13_COS(6.28318530717958647692f * x2, c2); return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f); } __forceinline float HillFunc(float x, float seed) { int j = 0; __declspec(align(32)) float ang[14u]; float st = 6.28318530717958647692f * x; while (j < 14) { ang[j] = st * (float)(j + 1); ++j; } __declspec(align(32)) float sv[14u], cv[14u]; FABE13_SINCOS(ang, sv, cv, 14u); float state = seed, r1, r2; XOR_RAND(state, r1); float res = fmaf(r1, 2.0f, -1.1f); --j; while (j >= 0) { XOR_RAND(state, r1); XOR_RAND(state, r2); res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]); --j; } return res; } __forceinline float GrishaginFunc(float x1, float x2, float seed) { int j = 0; __declspec(align(32)) float aj[8u], ak[8u]; while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; } __declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u]; FABE13_SINCOS(aj, sj, cj, 8u); FABE13_SINCOS(ak, sk, ck, 8u); --j; float p1 = 0.0f, p2 = 0.0f; float st = seed, r1, r2; while (j >= 0) { size_t k = 0u; while (k < 8u) { float s = sj[j] * sj[j]; float c = ck[k] * ck[k]; XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p1 = fmaf(r1, s, fmaf(r2, c, p1)); XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p2 = fmaf(-r1, c, fmaf(r2, s, p2)); ++k; } --j; } return -sqrtf(fmaf(p1, p1, p2 * p2)); } extern "C" __declspec(dllexport) __declspec(noalias) void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) { Slab* slab = tls.local(); slab->current = slab->base; int schetchick = 0; const float initial_length = b - a; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed); float x_Rmax_1 = a; float x_Rmax_2 = b; float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R; Extr.reserve((size_t)global_iterations << 2u); R.reserve((size_t)global_iterations << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); const float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); const Interval* pro = R.back(); const float new_x1 = pro->x1; const float new_x2 = pro->x2; const float len2 = new_x2 - new_point; const float len1 = new_point - new_x1; const float interval_len = len1 < len2 ? len1 : len2; if (++schetchick == (int)global_iterations || interval_len < epsilon) { Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len)); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f); Interval* curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } } if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = progress * progress; const float betta = 2.0f - alpha; const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r, -MULT); const float GF = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha)); RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha); std::make_heap(R.begin(), R.end(), ComparePtr); } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); const Interval* top = R.front(); x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2; } } extern "C" __declspec(dllexport) __declspec(noalias) void AGP_2D( const float N, const float global_iterations, const float a, const float b, const float c, const float d, const float r, const bool mode, const float epsilon, const float seed, float** const __restrict out_data, size_t* const __restrict out_len) noexcept { Slab* const __restrict slab = tls.local(); slab->current = slab->base; int schetchick = 0; int no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size(); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); const Interval* __restrict top_ptr; float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float interval_len = x_Rmax_2 - x_Rmax_1; const bool stagnation = no_improve > 100 && schetchick > 270; const float p = fmaf(-1.0f / initial_length, dmax, 1.0f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2); const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2); Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f); injected->ChangeCharacteristic(m); if (injected->R > 1.15f * top_ptr->R) { const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f); injected->R = in.xchg.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } } const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f)); const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations); if (!(++schetchick % T) || stagnation || want_term) { CtrlMsg out; out.kind = want_term; if (!out.kind) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; } for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); } if (out.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } } const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); const float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; } else { ++no_improve; } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* const __restrict promejutochny_otrezok = R.back(); const float segment_x1 = promejutochny_otrezok->x1; const float segment_x2 = promejutochny_otrezok->x2; const float len2 = segment_x2 - new_point; const float len1 = new_point - segment_x1; Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); const float currM = (std::max)(curr->M, curr1->M); const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } } if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = progress * progress; const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha); std::make_heap(R.begin(), R.end(), ComparePtr); } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; } } struct RunParams { int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar & nSegments & varLen & minTheta & tx & ty & levels & maxIter & r & adaptive & eps & seed; } }; extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) { RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY; p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed; const int rank = g_world->rank(); const int world = g_world->size(); for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); } } extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() { while (true) { if (g_world->iprobe(0, 1)) { RunParams p; g_world->recv(0, 1, p); float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0; AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf); if (q) CoTaskMemFree(q); } Sleep(0); } } extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartWorkers() { int i = 1; const int world = g_world->size(); while (i < world) { g_world->isend(i, 1, 0); ++i; } } extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }

Condividi questa Q&A