#include "pch.h" #define XOR_RAND(state, result_v...

생성일: 2025년 10월 29일

답변에 사용된 모델 GPT-5 Thinking by Chat01

질문

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) {
r = 6.28318530718f - r;
}
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t {
Top = 0b00u,
Down = 0b01u,
Left = 0b10u,
Right = 0b11u
};

__declspec(align(4)) struct Step {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};

__declspec(align(4)) struct InvStep {
const uint8_t q;
const uint8_t next;
};

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }

};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;

text
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }

};

__declspec(align(16)) struct Slab {
char* const base;
char* current;
char* const end;

text
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) { }

};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 16u);
Slab* slab = (Slab*)_aligned_malloc(32u, 16u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) {
*p = 0;
p += 4096u;
}
return slab;
});

__declspec(align(16)) struct Peano2DMap {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;

text
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st) : levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) { }

};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(16)) struct Interval {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; } __forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f), N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)), quadratic_term((1.0f / N_factor)* delta_y* delta_y), M((1.0f / N_factor)* fabsf(delta_y)) { } __forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }

};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept {
return a->R < b->R;
}

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

text
size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float q[8], nf[8], ord[8]; int k = 0; while (k < 8) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; ++k; } const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0; while (k < 8) { arr[i + k]->R = out[k]; ++k; } i += 8u; } } while (i < n) { arr[i]->ChangeCharacteristic(m); ++i; }

}

__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);

text
size_t i = 0; const int limit = (int)(n & ~7u); if (n >= 8u) { while (i < (size_t)limit) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; int k = 0; while (k < 8) { const Interval* p = arr[i + k]; len[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; ++k; } const __m256 vlen = _mm256_load_ps(len); const __m256 vM = _mm256_load_ps(Mv); const __m256 vq = _mm256_load_ps(q); const __m256 vnf = _mm256_load_ps(nf); const __m256 vod = _mm256_load_ps(ord); const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0; while (k < 8) { arr[i + k]->R = out[k]; ++k; } i += 8u; } } while (i < n) { const Interval* p = arr[i]; const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha); arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor)); ++i; }

}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));

text
return _N == 1.0f ? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f : _N == 2.0f ? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f : fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;

}

struct MortonND {
int dim;
int levels;
uint64_t scale;
std::vector<float> low, high;
std::vector<int> perm;
std::vector<uint64_t> invMask;

text
MortonND(int D, int L, const float* lows, const float* highs, uint32_t seed) : dim(D), levels(L), scale(1ull), low(lows, lows + D), high(highs, highs + D), perm(D), invMask(D, 0) { int i = 0; while (i < D) { perm[i] = i; ++i; } uint32_t st = seed ? seed : 0x9E3779B9u; for (int k = D - 1; k > 0; --k) { float r; XOR_RAND(st, r); int j = (int)floorf(r * (k + 1)); int t = perm[k]; perm[k] = perm[j]; perm[j] = t; } for (int k = 0; k < D; ++k) { float r; XOR_RAND(st, r); invMask[k] = (r > 0.5f) ? (((uint64_t)1 << levels) - 1ull) : 0ull; } int maxbits = 63; int need = levels * D; if (need > maxbits) { levels = maxbits / D; if (levels < 1) levels = 1; } scale = 1ull << ((uint64_t)levels * (uint64_t)D); } __forceinline void map01ToPoint(float t, float* out) const noexcept { if (t < 0.0f) t = 0.0f; else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f); uint64_t idx = (uint64_t)(t * (double)scale); if (idx >= scale) idx = scale - 1; std::vector<uint64_t> coord(dim, 0ull); for (int bit = 0; bit < levels; ++bit) { for (int d = 0; d < dim; ++d) { int pd = perm[d]; uint64_t b = (idx >> ((bit * dim) + d)) & 1ull; coord[pd] |= (b << bit); } } for (int d = 0; d < dim; ++d) { if (invMask[d]) coord[d] ^= invMask[d]; } const float invC = 1.0f / (float)((uint64_t)1 << levels); for (int d = 0; d < dim; ++d) { float c0 = low[d] + (high[d] - low[d]) * ((float)coord[d] * invC); float cs = (high[d] - low[d]) * invC; out[d] = fmaf(0.5f, cs, c0); } } __forceinline float pointTo01(const float* in) const noexcept { std::vector<uint64_t> coord(dim, 0ull); const float invC = 1.0f / (float)((uint64_t)1 << levels); for (int d = 0; d < dim; ++d) { float t = (in[d] - low[d]) / (high[d] - low[d]); if (t < 0.0f) t = 0.0f; else if (t > 0.99999994f) t = 0.99999994f; uint64_t ci = (uint64_t)(t * (float)((uint64_t)1 << levels)); if (ci >= ((uint64_t)1 << levels)) ci = ((uint64_t)1 << levels) - 1; coord[d] = ci; } for (int d = 0; d < dim; ++d) { if (invMask[d]) coord[d] ^= invMask[d]; } uint64_t idx = 0ull; for (int bit = 0; bit < levels; ++bit) { for (int d = 0; d < dim; ++d) { int pd = perm[d]; uint64_t b = (coord[pd] >> bit) & 1ull; idx |= (b << ((bit * dim) + d)); } } double tt = (double)idx / (double)scale; if (tt >= 1.0) tt = nextafter(1.0, 0.0); return (float)tt; }

};

struct ManipCost {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;

text
__forceinline float operator()(const float* q) const noexcept { const float* th = q; const float* L = variableLen ? (q + n) : nullptr; float x = 0.0f, y = 0.0f, phi = 0.0f; float penC = 0.0f; float Lc = 1.0f; for (int i = 0; i < n; ++i) { const float t = th[i]; float Li = variableLen ? L[i] : Lc; phi += t; float c, s; FABE13_COS(phi, c); FABE13_SIN(phi, s); x = fmaf(Li, c, x); y = fmaf(Li, s, y); } for (int i = 0; i < n - 1; ++i) { float dth = th[i + 1] - th[i]; if (fabsf(dth) < minTheta) { penC += minTheta - fabsf(dth); } } float dx = x - targetX; float dy = y - targetY; float dist = sqrtf(fmaf(dx, dx, dy * dy)); return dist + penC; }

};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = (uint32_t)(norm * (float)scale); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1; while (l >= 0) { const uint32_t q = (idx >> (l * 2)) & 3u; const Step s = g_step_tbl[type][q]; type = s.next; sx *= 0.5f; sy *= 0.5f; x1 += s.dx ? sx : 0.0f; x2 += s.dy ? sy : 0.0f; --l; } out_x1 = x1 + sx * 0.5f; out_x2 = x2 + sy * 0.5f;

}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
const float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0; while (l < levels) { sx *= 0.5f; sy *= 0.5f; const float mx = x0 + sx; const float my = y0 + sy; const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my)); const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_recip = 1.0f / (float)scale; return fmaf((float)idx * scale_recip, lenx, a);

}

static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY) {
const int dim = cost.n + (cost.variableLen ? cost.n : 0);

text
auto evalAt = [&](float t) -> float { std::vector<float> q(dim); map.map01ToPoint(t, q.data()); float f = cost(q.data()); float x = 0, y = 0, phi = 0; const float* th = q.data(); const float* L = cost.variableLen ? (q.data() + cost.n) : nullptr; float Lc = 1.0f; for (int i = 0; i < cost.n; ++i) { phi += th[i]; float Li = cost.variableLen ? L[i] : Lc; float c, s; FABE13_COS(phi, c); FABE13_SIN(phi, s); x = fmaf(Li, c, x); y = fmaf(Li, s, y); } if (f < bestF) { bestF = f; bestQ = q; bestX = x; bestY = y; } return f; }; float a = 0.0f, b = 1.0f; float f_a = evalAt(a); float f_b = evalAt(b); Interval* root = new Interval(a, b, f_a, f_b, (float)dim); float Mmax = root->M; float m = r * Mmax; root->ChangeCharacteristic(m); H.clear(); H.reserve((size_t)maxIter); H.push_back(root); std::make_heap(H.begin(), H.end(), ComparePtr); float dmax = b - a; const float initial_len = dmax; const float thr03 = 0.3f * initial_len; const float inv_thr03 = 1.0f / thr03; int it = 0; const int rank = g_world->rank(); const int world = g_world->size(); while (it < maxIter) { while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { break; } else { return; } } else { float sx = in.xchg.s_x1, ex = in.xchg.e_x1; if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f; if (ex <= sx) continue; std::vector<float> tmp(dim); map.map01ToPoint(sx, tmp.data()); float y1 = cost(tmp.data()); map.map01ToPoint(ex, tmp.data()); float y2 = cost(tmp.data()); Interval* inj = new Interval(sx, ex, y1, y2, (float)dim); inj->ChangeCharacteristic(m); if (!H.empty()) { Interval* top = H.front(); if (inj->R > 1.15f * top->R) { inj->R = in.xchg.Rtop; H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr); } } else { H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtr); } } } std::pop_heap(H.begin(), H.end(), ComparePtr); Interval* cur = H.back(); H.pop_back(); const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2; float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r); if (tNew < a) tNew = a; else if (tNew > b) tNew = b; float fNew = evalAt(tNew); Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim); Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim); float Mloc = left->M > right->M ? left->M : right->M; if (adaptive) { float len1 = tNew - x1, len2 = x2 - tNew; if (len1 + len2 == dmax) { dmax = len1 > len2 ? len1 : len2; for (auto p : H) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } } if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) { if (Mloc > Mmax) { Mmax = Mloc; m = r * Mmax; } float progress = 1.0f - (dmax * inv_thr03); float alpha = progress * progress; float beta = 2.0f - alpha; float MULT = (Mmax / dmax); float global_coeff = fmaf(MULT, r, -MULT); float GF = beta * global_coeff; left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha)); right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha)); size_t sz = H.size(); if (sz < 64u) { size_t i = 0; while (i < sz) { Interval* p = H[i]; float len = p->x2 - p->x1; p->ChangeCharacteristic(fmaf(GF, len, p->M * alpha)); ++i; } } else { RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha); } std::make_heap(H.begin(), H.end(), ComparePtr); } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); if (sz < 64u) { size_t i = 0; while (i < sz) { H[i]->ChangeCharacteristic(m); ++i; } } else { RecomputeR_ConstM_AVX2(H.data(), sz, m); } std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } } else { if (Mloc > Mmax) { if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); if (sz < 64u) { size_t i = 0; while (i < sz) { H[i]->ChangeCharacteristic(m); ++i; } } else { RecomputeR_ConstM_AVX2(H.data(), sz, m); } std::make_heap(H.begin(), H.end(), ComparePtr); } } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); } } H.push_back(left); std::push_heap(H.begin(), H.end(), ComparePtr); H.push_back(right); std::push_heap(H.begin(), H.end(), ComparePtr); Interval* top = H.front(); float interval_len = top->x2 - top->x1; const float p = fmaf(-1.0f / initial_len, dmax, 1.0f); const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f); bool want_term = (interval_len < eps) || (it == maxIter - 1); if (!(it % T) || want_term) { CtrlMsg out; out.kind = want_term; out.xchg.s_x1 = top->x1; out.xchg.e_x1 = top->x2; out.xchg.s_x2 = 0.0f; out.xchg.e_x2 = 0.0f; out.xchg.Rtop = top->R; for (int i = 0; i < world; ++i) { if (i != rank) g_world->isend(i, 0, out); } if (want_term) { break; } } ++it; }

}

struct BestPacket {
float bestF;
int dim;
float bestX;
float bestY;

text
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; }

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF) {
const int dim = nSegments + (variableLengths ? nSegments : 0);

text
const float thetaMin = -3.14159265359f; const float thetaMax = 3.14159265359f; const float lenMin = 0.1f; const float lenMax = 2.0f; std::vector<float> low(dim), high(dim); for (int i = 0; i < nSegments; ++i) { low[i] = thetaMin; high[i] = thetaMax; } if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low[nSegments + i] = lenMin; high[nSegments + i] = lenMax; } } ManipCost cost{ nSegments, variableLengths, targetX, targetY, minTheta }; const int rank = g_world->rank(); const int world = g_world->size(); MortonND map(dim, peanoLevels, low.data(), high.data(), seed + 0x9E3779B9u * rank); std::vector<Interval*> H; std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f; agp_run_branch_mpi(map, cost, maxIterPerBranch, r, adaptiveMode, epsilon, seed, H, bestQ, bestF, bx, by); BestPacket me{ bestF, dim, bx, by }; if (!rank) { std::vector<float> winnerQ = bestQ; float winF = bestF, wx = bx, wy = by; for (int i = 1; i < world; ++i) { BestPacket bp; g_world->recv(i, 2, bp); std::vector<float> qin; g_world->recv(i, 3, qin); if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; } } *out_bestQLen = (size_t)winnerQ.size(); *out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen)); std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen)); *out_bestX = wx; *out_bestY = wy; *out_bestF = winF; } else { g_world->send(0, 2, me); g_world->send(0, 3, bestQ); }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();

text
const int rank = g_world->rank(); const int world_size = g_world->size(); if (world_size == 4) { switch (rank) { case 0: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Top)); break; case 1: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Down)); break; case 2: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Left)); break; default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Right)); } } else { } return rank;

}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0;
float st = seed, r1, r2, res = 0.0f;

text
while (i < 10) { XOR_RAND(st, r1); float xp = fmaf(-r1, 10.0f, x); XOR_RAND(st, r1); XOR_RAND(st, r2); float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f)); d = copysignf(fmaxf(fabsf(d), FLT_MIN), d); res -= 1.0f / d; ++i; } return res;

}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;

text
while (j < 14) { ang[j] = st * (float)(j + 1); ++j; } __declspec(align(32)) float sv[14u], cv[14u]; FABE13_SINCOS(ang, sv, cv, 14u); float state = seed, r1, r2; XOR_RAND(state, r1); float res = fmaf(r1, 2.0f, -1.1f); --j; while (j >= 0) { XOR_RAND(state, r1); XOR_RAND(state, r2); res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]); --j; } return res;

}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];

text
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; } __declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u]; FABE13_SINCOS(aj, sj, cj, 8u); FABE13_SINCOS(ak, sk, ck, 8u); --j; float p1 = 0.0f, p2 = 0.0f; float st = seed, r1, r2; while (j >= 0) { size_t k = 0u; while (k < 8u) { float s = sj[j] * sj[j]; float c = ck[k] * ck[k]; XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p1 = fmaf(r1, s, fmaf(r2, c, p1)); XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p2 = fmaf(-r1, c, fmaf(r2, s, p2)); ++k; } --j; } return -sqrtf(fmaf(p1, p1, p2 * p2));

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; const float initial_length = b - a; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed); float x_Rmax_1 = a; float x_Rmax_2 = b; float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R; Extr.reserve((size_t)global_iterations << 2u); R.reserve((size_t)global_iterations << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); const float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); const Interval* pro = R.back(); const float new_x1 = pro->x1; const float new_x2 = pro->x2; const float len2 = new_x2 - new_point; const float len1 = new_point - new_x1; const float interval_len = len1 < len2 ? len1 : len2; if (++schetchick == (int)global_iterations || interval_len < epsilon) { Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len)); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f); Interval* curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u; while (i < r_size) { const float li = R[i]->x2 - R[i]->x1; if (li > dmax) dmax = li; ++i; } } if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = fmaf(progress, progress, 1.0f); const float betta = 2.0f - alpha; const float MULT = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULT, r, -MULT); const float GF = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u; while (i < r_size) { R[i]->ChangeCharacteristic(fmaf(GF, R[i]->x2 - R[i]->x1, R[i]->M * alpha)); ++i; } } else { RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha); } std::make_heap(R.begin(), R.end(), ComparePtr); } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); if (r_size < 64u) { size_t i = 0u; while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; } } else { RecomputeR_ConstM_AVX2(R.data(), r_size, m); } std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } } else { if (currM > Mmax) { if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); if (r_size < 64u) { size_t i = 0u; while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; } } else { RecomputeR_ConstM_AVX2(R.data(), r_size, m); } std::make_heap(R.begin(), R.end(), ComparePtr); } } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); } } R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); const Interval* top = R.front(); x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2; }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; int no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size(); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); const Interval* __restrict top_ptr; float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float interval_len = x_Rmax_2 - x_Rmax_1; const bool stagnation = no_improve > 100 && schetchick > 270; const float p = fmaf(-1.0f / initial_length, dmax, 1.0f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2); const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2); Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f); injected->ChangeCharacteristic(m); if (injected->R > 1.15f * top_ptr->R) { const float k = stagnation ? fmaf((1.0f / expm1f(1.0f)) * 1.0f, expm1f(p), 0.3f) : fmaf((1.0f / expm1f(1.0f)) * 0.6f, expm1f(p), 0.6f); injected->R = in.xchg.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } } const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f)); const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations); if (!(++schetchick % T) || stagnation || want_term) { CtrlMsg out; out.kind = want_term; if (!out.kind) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; } int i = 0; while (i < world_size) { if (i != rank) g_world->isend(i, 0, out); ++i; } if (out.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } } const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); const float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; } else { ++no_improve; } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* const __restrict promejutochny_otrezok = R.back(); const float segment_x1 = promejutochny_otrezok->x1; const float segment_x2 = promejutochny_otrezok->x2; const float len2 = segment_x2 - new_point; const float len1 = new_point - segment_x1; Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
const float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = fmaf(progress, progress, 1.0f); const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);

text
if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);

text
if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }

}

struct RunParams {
int nSegments;
uint8_t varLen;
float minTheta;
float tx, ty;
int levels, maxIter;
float r;
uint8_t adaptive;
float eps;
unsigned int seed;

text
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; }

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(
int nSegments,
bool variableLengths,
float minTheta,
float targetX,
float targetY,
int peanoLevels,
int maxIterPerBranch,
float r,
bool adaptiveMode,
float epsilon,
unsigned int seed)
{
RunParams p;
p.nSegments = nSegments;
p.varLen = (uint8_t)variableLengths;
p.minTheta = minTheta;
p.tx = targetX;
p.ty = targetY;
p.levels = peanoLevels;
p.maxIter = maxIterPerBranch;
p.r = r;
p.adaptive = (uint8_t)adaptiveMode;
p.eps = epsilon;
p.seed = seed;

text
const int rank = g_world->rank(); const int world = g_world->size(); for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p;
g_world->recv(0, 1, p);
float* q = nullptr;
size_t qlen = 0;
float bx = 0, by = 0, bf = 0;

text
AGP_Manip2D( p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf ); if (q) CoTaskMemFree(q); } Sleep(0); }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() {
int i = 1;
const int world = g_world->size();

text
while (i < world) { g_world->isend(i, 1, 0); ++i; }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkersLegacy() {
AgpStartWorkers();
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) {
CoTaskMemFree(p);
} это мой код в длл библиотеке сейчас, раньше у меня был одномерный и двумерный алгоритм, сейчас добавлен многомерный - он не использует стандартные двумерные кривые пеано гильберта а использует пространственные развёртки гиперкубы - алгоритм работает правильно но мне нужен анализ и понимание как он работает - работать базово он должен также как и одномерные и двумерные версии - сама идея алгоритма не меняется - но что тут изменилось так это целевая функция - так как теперь целевая функция это функция минимизации положения захвата робота манипулятора - также заметь там есть метод agpinit - он вызывается один раз при загрузке основного приложения и в предыдущих случаях он был нужен так как экономил время - развёртки пеано строились один раз при инициализации а не на каждое нажатие кнопки, может в новом случае и не требуется явной инициализации гиперкубов во время старта приложения - но с точки зрения скорости возможно стоит что-то добавить в этот метод для нового случая (новый случай требует много ориентаций - поэтому я запускаю его на большем числе процессов чем 4 для лучшего параллелизма) - для экономии памяти и времени - возможно туда стоит вынести какую-то инициализации или резервирование памяти - дай более чёткий ответ что стоит с этим сделать, также обрати внимание на код в форме #include "MyForm.h"
#include <float.h>
using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);
if (!rank) {
Application::EnableVisualStyles();
Application::SetCompatibleTextRenderingDefault(false);
Application::Run(gcnew TESTAGP::MyForm(h));
}
else {
AgpWaitStartAndRun();
}
return 0;
} и #pragma once
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <stdint.h>

using namespace System;
using namespace System::Drawing;
using namespace System::Windows::Forms;
using namespace System::Collections::Generic;
using namespace System::Drawing::Drawing2D;

typedef void(__cdecl* P_MANIP)(
int nSegments,
bool variableLengths,
float deltaThetaMax,
float targetX, float targetY,
int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF
);
typedef void(__cdecl* P_FREE)(float*);
typedef int(__cdecl* P_INIT)(int, float, float, float, float);
typedef void(__cdecl* P_WAIT)();
typedef void(__cdecl* P_START)(
int nSegments,
bool variableLengths,
float deltaThetaMax,
float targetX, float targetY,
int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed
);

namespace TESTAGP {

text
public ref class MyForm : public Form { public: MyForm(HMODULE hLib) : hLib(hLib) { this->SetStyle(ControlStyles::AllPaintingInWmPaint | ControlStyles::UserPaint | ControlStyles::OptimizedDoubleBuffer, true); this->Text = L"AGP Manipulator 2D - Оптимизация положения захвата"; this->ClientSize = System::Drawing::Size(1000, 700); this->Resize += gcnew EventHandler(this, &MyForm::OnResize); fManip = (P_MANIP)GetProcAddress(hLib, "AGP_Manip2D"); pFree = (P_FREE)GetProcAddress(hLib, "AGP_Free"); pStart = (P_START)GetProcAddress(hLib, "AgpStartManipND"); angles = gcnew List<float>(0.0f); lengths = gcnew List<float>(0.0f); InitUI(); ResetRandomConfig(); } protected: ~MyForm() {} private: HMODULE hLib; P_MANIP fManip; P_FREE pFree; P_START pStart; int nSegments; bool variableLengths; List<float>^ angles; List<float>^ lengths; CheckBox^ cbVarLen; NumericUpDown^ nudMinTheta; NumericUpDown^ nudBaseLength; NumericUpDown^ nudStretchFactor; NumericUpDown^ nudTargetX; NumericUpDown^ nudTargetY; NumericUpDown^ nudLevels; NumericUpDown^ nudMaxIter; CheckBox^ cbAdaptive; NumericUpDown^ nudR; NumericUpDown^ nudEps; Button^ btnAdd; Button^ btnRem; Button^ btnOptimize; Label^ lblInfo; System::UInt32 rngState = 0xA5C39E0Du; void InitUI() { int y = 10, w = 180, h = 24, pad = 8; int currentX = 10; Label^ L; L = gcnew Label(); L->Text = L"Мин. угол между звеньями (рад)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudMinTheta = gcnew NumericUpDown(); nudMinTheta->Left = currentX; nudMinTheta->Top = y + h + 2; nudMinTheta->Width = w; nudMinTheta->DecimalPlaces = 3; nudMinTheta->Minimum = (Decimal)0.01; nudMinTheta->Maximum = (Decimal)3.14159; nudMinTheta->Value = (Decimal)0.5; this->Controls->Add(nudMinTheta); currentX += w + 20; L = gcnew Label(); L->Text = L"Базовая длина звена"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudBaseLength = gcnew NumericUpDown(); nudBaseLength->Left = currentX; nudBaseLength->Top = y + h + 2; nudBaseLength->Width = w; nudBaseLength->DecimalPlaces = 2; nudBaseLength->Minimum = (Decimal)0.1; nudBaseLength->Maximum = (Decimal)3.0; nudBaseLength->Value = (Decimal)1.0; this->Controls->Add(nudBaseLength); currentX += w + 20; L = gcnew Label(); L->Text = L"Макс. коэф. растяжения/сжатия"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudStretchFactor = gcnew NumericUpDown(); nudStretchFactor->Left = currentX; nudStretchFactor->Top = y + h + 2; nudStretchFactor->Width = w; nudStretchFactor->DecimalPlaces = 2; nudStretchFactor->Minimum = (Decimal)1.0; nudStretchFactor->Maximum = (Decimal)3.0; nudStretchFactor->Value = (Decimal)1.5; this->Controls->Add(nudStretchFactor); currentX += w + 20; cbVarLen = gcnew CheckBox(); cbVarLen->Text = L"Переменные длины звеньев"; cbVarLen->Left = currentX; cbVarLen->Top = y + h + 2; cbVarLen->Width = w; cbVarLen->Checked = false; this->Controls->Add(cbVarLen); currentX += w + 20; y += h * 2 + pad + 10; currentX = 10; L = gcnew Label(); L->Text = L"Целевая позиция X"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudTargetX = gcnew NumericUpDown(); nudTargetX->Left = currentX; nudTargetX->Top = y + h + 2; nudTargetX->Width = w; nudTargetX->DecimalPlaces = 2; nudTargetX->Minimum = (Decimal)-10.0; nudTargetX->Maximum = (Decimal)10.0; nudTargetX->Value = (Decimal)3.5; this->Controls->Add(nudTargetX); currentX += w + 20; L = gcnew Label(); L->Text = L"Целевая позиция Y"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudTargetY = gcnew NumericUpDown(); nudTargetY->Left = currentX; nudTargetY->Top = y + h + 2; nudTargetY->Width = w; nudTargetY->DecimalPlaces = 2; nudTargetY->Minimum = (Decimal)-10.0; nudTargetY->Maximum = (Decimal)10.0; nudTargetY->Value = (Decimal)1.0; this->Controls->Add(nudTargetY); currentX += w + 20; L = gcnew Label(); L->Text = L"Глубина развертки"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudLevels = gcnew NumericUpDown(); nudLevels->Left = currentX; nudLevels->Top = y + h + 2; nudLevels->Width = w; nudLevels->Minimum = 4; nudLevels->Maximum = 20; nudLevels->Value = 12; this->Controls->Add(nudLevels); currentX += w + 20; L = gcnew Label(); L->Text = L"Параметр надежности (r)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudR = gcnew NumericUpDown(); nudR->Left = currentX; nudR->Top = y + h + 2; nudR->Width = w; nudR->DecimalPlaces = 2; nudR->Minimum = (Decimal)1.0; nudR->Maximum = (Decimal)10.0; nudR->Value = (Decimal)2.5; this->Controls->Add(nudR); currentX += w + 20; cbAdaptive = gcnew CheckBox(); cbAdaptive->Text = L"Адаптивная схема"; cbAdaptive->Left = currentX; cbAdaptive->Top = y + h + 2; cbAdaptive->Width = w; cbAdaptive->Checked = true; this->Controls->Add(cbAdaptive); y += h * 2 + pad + 10; currentX = 10; L = gcnew Label(); L->Text = L"Точность (epsilon)"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudEps = gcnew NumericUpDown(); nudEps->Left = currentX; nudEps->Top = y + h + 2; nudEps->Width = w; nudEps->DecimalPlaces = 6; nudEps->Minimum = (Decimal)0.000001; nudEps->Maximum = (Decimal)0.1; nudEps->Value = (Decimal)0.0001; this->Controls->Add(nudEps); currentX += w + 20; L = gcnew Label(); L->Text = L"Макс. число итераций"; L->Left = currentX; L->Top = y; L->Width = w; this->Controls->Add(L); nudMaxIter = gcnew NumericUpDown(); nudMaxIter->Left = currentX; nudMaxIter->Top = y + h + 2; nudMaxIter->Width = w; nudMaxIter->Minimum = 10; nudMaxIter->Maximum = 50000; nudMaxIter->Value = 1000; this->Controls->Add(nudMaxIter); currentX += 200; btnAdd = gcnew Button(); btnAdd->Text = L"+ Звено"; btnAdd->Left = currentX; btnAdd->Top = y + h + 2; btnAdd->Width = 80; btnAdd->Click += gcnew EventHandler(this, &MyForm::OnAddClick); this->Controls->Add(btnAdd); currentX += 85; btnRem = gcnew Button(); btnRem->Text = L"- Звено"; btnRem->Left = currentX; btnRem->Top = y + h + 2; btnRem->Width = 80; btnRem->Click += gcnew EventHandler(this, &MyForm::OnRemClick); this->Controls->Add(btnRem); currentX += 125; btnOptimize = gcnew Button(); btnOptimize->Text = L"Оптимизировать"; btnOptimize->Left = currentX; btnOptimize->Top = y + h + 2; btnOptimize->Width = 120; btnOptimize->Click += gcnew EventHandler(this, &MyForm::OnOptimizeClick); this->Controls->Add(btnOptimize); currentX += 125; lblInfo = gcnew Label(); lblInfo->Left = currentX; lblInfo->Top = y; lblInfo->Width = 250; lblInfo->Height = 60; lblInfo->BorderStyle = BorderStyle::FixedSingle; lblInfo->Text = L"Готов к работе"; this->Controls->Add(lblInfo); } void ResetRandomConfig() { nSegments = 1; angles->Clear(); lengths->Clear(); angles->Add(RandAngle()); lengths->Add((float)nudBaseLength->Value); variableLengths = false; this->Invalidate(); } float Rand01() { rngState ^= rngState << 13; rngState ^= rngState >> 17; rngState ^= rngState << 5; return (float)((double)(unsigned)rngState / 4294967296.0); } float RandAngle() { return (Rand01() * 6.28318530717958647692f) - 3.14159265358979323846f; } System::Void OnResize(System::Object^ sender, System::EventArgs^ e) { this->Invalidate(); } System::Void OnAddClick(System::Object^ sender, System::EventArgs^ e) { ++nSegments; angles->Add(RandAngle()); lengths->Add((float)nudBaseLength->Value); this->Invalidate(); } System::Void OnRemClick(System::Object^ sender, System::EventArgs^ e) { if (nSegments > 1) { --nSegments; angles->RemoveAt(angles->Count - 1); lengths->RemoveAt(lengths->Count - 1); this->Invalidate(); } } System::Void OnOptimizeClick(System::Object^ sender, System::EventArgs^ e) { if (!fManip) return; variableLengths = cbVarLen->Checked; float minTheta = (float)nudMinTheta->Value; float tx = (float)nudTargetX->Value; float ty = (float)nudTargetY->Value; int levels = (int)nudLevels->Value; int maxIter = (int)nudMaxIter->Value; bool adaptive = cbAdaptive->Checked; float r_param = (float)nudR->Value; float eps = (float)nudEps->Value; unsigned int seed = (unsigned)GetTickCount(); if (pStart) { pStart( nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed ); } LARGE_INTEGER t0, t1, fq; QueryPerformanceCounter(&t0); float* bestQ = nullptr; size_t bestQLen = 0; float bestX = 0, bestY = 0, bestF = 0; fManip( nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed, &bestQ, &bestQLen, &bestX, &bestY, &bestF ); QueryPerformanceCounter(&t1); QueryPerformanceFrequency(&fq); double micros = 1e6 * (double)(t1.QuadPart - t0.QuadPart) / (double)fq.QuadPart; if (bestQ && bestQLen >= (size_t)nSegments) { angles->Clear(); for (int i = 0; i < nSegments; ++i) angles->Add(bestQ[i]); if (variableLengths) { lengths->Clear(); for (int i = 0; i < nSegments; ++i) lengths->Add(bestQ[nSegments + i]); } else { lengths->Clear(); for (int i = 0; i < nSegments; ++i) lengths->Add((float)nudBaseLength->Value); } if (pFree) pFree(bestQ); } lblInfo->Text = String::Format(L"Результат оптимизации:\nЦелевая функция: {0:F5}\nКонечная точка: ({1:F3}, {2:F3})\nВремя: {3:F0} мкс", bestF, bestX, bestY, micros); this->Invalidate(); } protected: virtual void OnPaint(PaintEventArgs^ e) override { Form::OnPaint(e); Graphics^ g = e->Graphics; g->SmoothingMode = System::Drawing::Drawing2D::SmoothingMode::HighQuality; g->Clear(this->BackColor); int topOffset = 150; System::Drawing::Rectangle drawArea = System::Drawing::Rectangle(0, topOffset, this->ClientSize.Width, this->ClientSize.Height - topOffset); g->FillRectangle(Brushes::White, drawArea); int leftWallX = drawArea.Left + this->ClientSize.Width * 25 / 100; Pen^ wallPen = gcnew Pen(Color::Black, 2); g->DrawLine(wallPen, leftWallX, drawArea.Top, leftWallX, drawArea.Bottom); HatchBrush^ hatchBrush = gcnew HatchBrush(HatchStyle::BackwardDiagonal, Color::LightGray, Color::White); int leftHatchWidth = 100; g->FillRectangle(hatchBrush, leftWallX - leftHatchWidth, drawArea.Top, leftHatchWidth, drawArea.Height); float targetX = (float)nudTargetX->Value; float targetY = (float)nudTargetY->Value; float scale = 160.0f; int baseX = leftWallX; int baseY = drawArea.Top + drawArea.Height / 2; float pixelTargetX = baseX + targetX * scale; float pixelTargetY = baseY - targetY * scale; int rightWallX = (int)(pixelTargetX + 8); rightWallX = Math::Min(rightWallX, drawArea.Right - 10); Pen^ dashedPen = gcnew Pen(Color::Black, 2); dashedPen->DashStyle = DashStyle::Dash; g->DrawLine(dashedPen, rightWallX, drawArea.Top, rightWallX, drawArea.Bottom); int rightHatchWidth = leftHatchWidth; g->FillRectangle(hatchBrush, rightWallX, drawArea.Top, rightHatchWidth, drawArea.Height); Pen^ targetPen = gcnew Pen(Color::Green, 1.5f); targetPen->DashStyle = DashStyle::Dot; g->DrawEllipse(targetPen, pixelTargetX - 8.0f, pixelTargetY - 8.0f, 16.0f, 16.0f); cli::array<PointF>^ pts = gcnew cli::array<PointF>(nSegments + 1); pts[0] = PointF((float)baseX, (float)baseY); float x = 0.0f, y = 0.0f, phi = 0.0f; for (int i = 0; i < nSegments; ++i) { array<float>^ anglesArray = angles->ToArray(); array<float>^ lengthsArray = lengths->ToArray(); float theta = anglesArray[i]; float L = lengthsArray[i]; phi += theta; x += L * (float)Math::Cos((double)phi); y += L * (float)Math::Sin((double)phi); pts[i + 1] = PointF(baseX + x * scale, baseY - y * scale); } Pen^ penRod = gcnew Pen(Color::Red, 6.0f); for (int i = 0; i < nSegments; ++i) g->DrawLine(penRod, pts[i], pts[i + 1]); SolidBrush^ brJoint = gcnew SolidBrush(Color::Blue); const float R = 6.0f; for (int i = 0; i <= nSegments; ++i) g->FillEllipse(brJoint, pts[i].X - R, pts[i].Y - R, 2 * R, 2 * R); delete wallPen; delete dashedPen; delete targetPen; delete penRod; delete brJoint; delete hatchBrush; } };

} - здесь происходит взаимодействие с интерфейсом - с ним взаимодействует только нулевой ранк как и должно быть - но обрати внимание каким был код когда мне нужно было тестировать чисто двумерный алгоритм: #pragma once

#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

#include <algorithm>

#include <vector>

#include <utility>

namespace TESTAGP {
using namespace System;
using namespace System::IO;

text
ref class MyForm : public System::Windows::Forms::Form { public: MyForm(HMODULE hLib) : hLib(hLib) { // Передаем дескриптор из main InitializeComponent(); // Загружаем функции из DLL f = (agp_c)GetProcAddress(hLib, "AGP_2D"); pStart = (start_workers)GetProcAddress(hLib, "AgpStartWorkers"); pFree = (free_agp)GetProcAddress(hLib, "AGP_Free"); pInit = (PInit)GetProcAddress(hLib, "AgpInit"); } protected: ~MyForm() { delete components; } private: HINSTANCE hLib = nullptr; typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*); typedef void(__cdecl* start_workers)(); typedef void(__cdecl* free_agp)(float*); typedef int(__cdecl* PInit)(int, float, float, float, float); agp_c f = nullptr; start_workers pStart = nullptr; free_agp pFree = nullptr; PInit pInit = nullptr; System::Windows::Forms::Button^ button1; System::Windows::Forms::TextBox^ textBox2; System::Windows::Forms::DataVisualization::Charting::Chart^ chart2; System::Windows::Forms::Label^ label2; System::Windows::Forms::Label^ label3; System::Windows::Forms::TextBox^ textBox1; System::Windows::Forms::TextBox^ textBox3; System::Windows::Forms::TextBox^ textBox4; System::Windows::Forms::TextBox^ textBox5; System::Windows::Forms::TextBox^ textBox6; System::Windows::Forms::Label^ label6; System::Windows::Forms::Label^ label7; System::Windows::Forms::Label^ label8; System::Windows::Forms::Label^ label9; System::Windows::Forms::Label^ label10; System::Windows::Forms::Label^ label1; System::Windows::Forms::TextBox^ textBox7; System::Windows::Forms::TextBox^ textBox8; System::ComponentModel::Container^ components; System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) { chart2->Series[0]->Points->Clear(); chart2->Series[1]->Points->Clear(); chart2->Series[2]->Points->Clear(); chart2->Series[3]->Points->Clear(); static LARGE_INTEGER start, end; QueryPerformanceCounter(&start); float* buf = nullptr; size_t len = 0u; pStart(); // разбудили rank != 0 f(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &buf, &len); if (buf != nullptr) { std::vector<float> Extr_2D(buf, buf + len); pFree(buf); // Для 2D ветки: извлекаем schetchick и interval_len textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick Extr_2D.pop_back(); textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len Extr_2D.pop_back(); // Извлекаем последнюю тройку (x2, x1, f) float x2 = Extr_2D.back(); Extr_2D.pop_back(); float x1 = Extr_2D.back(); Extr_2D.pop_back(); float f_val = Extr_2D.back(); Extr_2D.pop_back(); // Выводим координаты и значение функции textBox4->Text = Convert::ToString(x1); textBox3->Text = Convert::ToString(x2); textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции // Отображаем точку на графике chart2->Series[2]->Points->AddXY(x1, x2); // Обрабатываем остальные точки (в порядке: f, x1, x2) while (!Extr_2D.empty()) { float x2_point = Extr_2D.back(); Extr_2D.pop_back(); float x1_point = Extr_2D.back(); Extr_2D.pop_back(); float f_point = Extr_2D.back(); Extr_2D.pop_back(); // Добавляем точку на график (только координаты) chart2->Series[0]->Points->AddXY(x1_point, x2_point); } } QueryPerformanceCounter(&end); LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); const int multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL; textBox5->Text = Convert::ToString( ((end.QuadPart - start.QuadPart) * multiplier) >> 32 ) + " microseconds"; //textBox2->Text = Convert::ToString(Extr_2D.back()); //textBox1->Text = Convert::ToString(Extr_2D_LNA.back()); }

#pragma region Windows Form Designer generated code
void InitializeComponent(void) {
System::Windows::Forms::DataVisualization::Charting::ChartArea^ chartArea1 = (gcnew System::Windows::Forms::DataVisualization::Charting::ChartArea());
System::Windows::Forms::DataVisualization::Charting::Legend^ legend1 = (gcnew System::Windows::Forms::DataVisualization::Charting::Legend());
System::Windows::Forms::DataVisualization::Charting::Series^ series1 = (gcnew System::Windows::Forms::DataVisualization::Charting::Series());
System::Windows::Forms::DataVisualization::Charting::Series^ series2 = (gcnew System::Windows::Forms::DataVisualization::Charting::Series());
System::Windows::Forms::DataVisualization::Charting::Series^ series3 = (gcnew System::Windows::Forms::DataVisualization::Charting::Series());
System::Windows::Forms::DataVisualization::Charting::Series^ series4 = (gcnew System::Windows::Forms::DataVisualization::Charting::Series());
this->button1 = (gcnew System::Windows::Forms::Button());
this->textBox2 = (gcnew System::Windows::Forms::TextBox());
this->chart2 = (gcnew System::Windows::Forms::DataVisualization::Charting::Chart());
this->label2 = (gcnew System::Windows::Forms::Label());
this->textBox1 = (gcnew System::Windows::Forms::TextBox());
this->textBox3 = (gcnew System::Windows::Forms::TextBox());
this->textBox4 = (gcnew System::Windows::Forms::TextBox());
this->textBox5 = (gcnew System::Windows::Forms::TextBox());
this->textBox6 = (gcnew System::Windows::Forms::TextBox());
this->label6 = (gcnew System::Windows::Forms::Label());
this->label7 = (gcnew System::Windows::Forms::Label());
this->label8 = (gcnew System::Windows::Forms::Label());
this->label9 = (gcnew System::Windows::Forms::Label());
this->label10 = (gcnew System::Windows::Forms::Label());
this->label1 = (gcnew System::Windows::Forms::Label());
this->textBox7 = (gcnew System::Windows::Forms::TextBox());
this->label3 = (gcnew System::Windows::Forms::Label());
this->textBox8 = (gcnew System::Windows::Forms::TextBox());
(cli::safe_castSystem::ComponentModel::ISupportInitialize^(this->chart2))->BeginInit();
this->SuspendLayout();
//
// button1
//
this->button1->BackColor = System::Drawing::SystemColors::Info;
this->button1->Cursor = System::Windows::Forms::Cursors::Hand;
this->button1->FlatAppearance->BorderColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)),
static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(64)));
this->button1->FlatAppearance->BorderSize = 3;
this->button1->FlatAppearance->MouseDownBackColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(128)),
static_castSystem::Int32(static_castSystem::Byte(128)), static_castSystem::Int32(static_castSystem::Byte(255)));
this->button1->FlatAppearance->MouseOverBackColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(192)),
static_castSystem::Int32(static_castSystem::Byte(192)), static_castSystem::Int32(static_castSystem::Byte(255)));
this->button1->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->button1->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 14.25F, System::Drawing::FontStyle::Bold));
this->button1->ForeColor = System::Drawing::SystemColors::ControlDarkDark;
this->button1->Location = System::Drawing::Point(897, 724);
this->button1->Name = L"button1";
this->button1->Size = System::Drawing::Size(275, 75);
this->button1->TabIndex = 2;
this->button1->Text = L"SOL";
this->button1->UseVisualStyleBackColor = false;
this->button1->Click += gcnew System::EventHandler(this, &MyForm::button1_Click);
//
// textBox2
//
this->textBox2->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox2->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox2->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox2->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox2->Location = System::Drawing::Point(992, 619);
this->textBox2->Name = L"textBox2";
this->textBox2->Size = System::Drawing::Size(180, 29);
this->textBox2->TabIndex = 4;
//
// chart2
//
this->chart2->BackColor = System::Drawing::SystemColors::ControlLight;
chartArea1->AxisX->Interval = 0.1;
chartArea1->AxisX->IsLabelAutoFit = false;
chartArea1->AxisX->LabelStyle->Font = (gcnew System::Drawing::Font(L"Yu Gothic", 6.75F));
chartArea1->AxisX->Maximum = 1.8;
chartArea1->AxisX->Minimum = -2.2;
chartArea1->AxisX->Title = L"x1";
chartArea1->AxisX->TitleFont = (gcnew System::Drawing::Font(L"Yu Gothic UI", 11.25F, System::Drawing::FontStyle::Bold));
chartArea1->AxisY->Interval = 0.1;
chartArea1->AxisY->IsLabelAutoFit = false;
chartArea1->AxisY->LabelStyle->Font = (gcnew System::Drawing::Font(L"Yu Gothic", 7.92F));
chartArea1->AxisY->Maximum = 1.8;
chartArea1->AxisY->Minimum = -2.2;
chartArea1->AxisY->Title = L"x2";
chartArea1->AxisY->TitleFont = (gcnew System::Drawing::Font(L"Yu Gothic UI", 11.25F, System::Drawing::FontStyle::Bold));
chartArea1->BackColor = System::Drawing::Color::FloralWhite;
chartArea1->BackGradientStyle = System::Windows::Forms::DataVisualization::Charting::GradientStyle::Center;
chartArea1->BackSecondaryColor = System::Drawing::Color::AliceBlue;
chartArea1->Name = L"ChartArea1";
this->chart2->ChartAreas->Add(chartArea1);
legend1->BackColor = System::Drawing::Color::Transparent;
legend1->BackGradientStyle = System::Windows::Forms::DataVisualization::Charting::GradientStyle::Center;
legend1->BackSecondaryColor = System::Drawing::SystemColors::ActiveCaption;
legend1->BorderColor = System::Drawing::Color::Transparent;
legend1->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 11.25F, System::Drawing::FontStyle::Bold));
legend1->ForeColor = System::Drawing::SystemColors::ActiveCaptionText;
legend1->HeaderSeparator = System::Windows::Forms::DataVisualization::Charting::LegendSeparatorStyle::ThickGradientLine;
legend1->HeaderSeparatorColor = System::Drawing::Color::IndianRed;
legend1->IsTextAutoFit = false;
legend1->ItemColumnSeparator = System::Windows::Forms::DataVisualization::Charting::LegendSeparatorStyle::ThickGradientLine;
legend1->ItemColumnSeparatorColor = System::Drawing::Color::IndianRed;
legend1->Name = L"Legend1";
this->chart2->Legends->Add(legend1);
this->chart2->Location = System::Drawing::Point(12, 14);
this->chart2->Name = L"chart2";
series1->ChartArea = L"ChartArea1";
series1->ChartType = System::Windows::Forms::DataVisualization::Charting::SeriesChartType::FastPoint;
series1->Legend = L"Legend1";
series1->Name = L"Точки данных";
series2->ChartArea = L"ChartArea1";
series2->ChartType = System::Windows::Forms::DataVisualization::Charting::SeriesChartType::FastPoint;
series2->Legend = L"Legend1";
series2->Name = L"Точки данных LNA";
series3->ChartArea = L"ChartArea1";
series3->ChartType = System::Windows::Forms::DataVisualization::Charting::SeriesChartType::FastPoint;
series3->Legend = L"Legend1";
series3->Name = L"Точки данных(финал)";
series4->ChartArea = L"ChartArea1";
series4->ChartType = System::Windows::Forms::DataVisualization::Charting::SeriesChartType::FastPoint;
series4->Legend = L"Legend1";
series4->Name = L"Точки данных LNA(финал)";
this->chart2->Series->Add(series1);
this->chart2->Series->Add(series2);
this->chart2->Series->Add(series3);
this->chart2->Series->Add(series4);
this->chart2->Size = System::Drawing::Size(1160, 785);
this->chart2->TabIndex = 5;
//
// label2
//
this->label2->AutoSize = true;
this->label2->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label2->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label2->Cursor = System::Windows::Forms::Cursors::Hand;
this->label2->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label2->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label2->ForeColor = System::Drawing::Color::DarkBlue;
this->label2->Location = System::Drawing::Point(946, 622);
this->label2->Name = L"label2";
this->label2->Size = System::Drawing::Size(40, 23);
this->label2->TabIndex = 8;
this->label2->Text = L"Extr";
//
// textBox1
//
this->textBox1->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox1->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox1->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox1->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox1->Location = System::Drawing::Point(992, 654);
this->textBox1->Name = L"textBox1";
this->textBox1->Size = System::Drawing::Size(180, 29);
this->textBox1->TabIndex = 13;
//
// textBox3
//
this->textBox3->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox3->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox3->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox3->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox3->Location = System::Drawing::Point(992, 584);
this->textBox3->Name = L"textBox3";
this->textBox3->Size = System::Drawing::Size(180, 29);
this->textBox3->TabIndex = 14;
//
// textBox4
//
this->textBox4->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox4->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox4->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox4->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox4->Location = System::Drawing::Point(992, 549);
this->textBox4->Name = L"textBox4";
this->textBox4->Size = System::Drawing::Size(180, 29);
this->textBox4->TabIndex = 15;
//
// textBox5
//
this->textBox5->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox5->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox5->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox5->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox5->Location = System::Drawing::Point(992, 514);
this->textBox5->Name = L"textBox5";
this->textBox5->Size = System::Drawing::Size(180, 29);
this->textBox5->TabIndex = 16;
//
// textBox6
//
this->textBox6->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox6->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox6->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox6->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox6->Location = System::Drawing::Point(992, 443);
this->textBox6->Name = L"textBox6";
this->textBox6->Size = System::Drawing::Size(180, 29);
this->textBox6->TabIndex = 17;
//
// label6
//
this->label6->AutoSize = true;
this->label6->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label6->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label6->Cursor = System::Windows::Forms::Cursors::Hand;
this->label6->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label6->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label6->ForeColor = System::Drawing::Color::DarkBlue;
this->label6->Location = System::Drawing::Point(911, 657);
this->label6->Name = L"label6";
this->label6->Size = System::Drawing::Size(75, 23);
this->label6->TabIndex = 18;
this->label6->Text = L"Extr LNA";
//
// label7
//
this->label7->AutoSize = true;
this->label7->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label7->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label7->Cursor = System::Windows::Forms::Cursors::Hand;
this->label7->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label7->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label7->ForeColor = System::Drawing::Color::DarkBlue;
this->label7->Location = System::Drawing::Point(957, 587);
this->label7->Name = L"label7";
this->label7->Size = System::Drawing::Size(29, 23);
this->label7->TabIndex = 19;
this->label7->Text = L"x2";
//
// label8
//
this->label8->AutoSize = true;
this->label8->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label8->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label8->Cursor = System::Windows::Forms::Cursors::Hand;
this->label8->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label8->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label8->ForeColor = System::Drawing::Color::DarkBlue;
this->label8->Location = System::Drawing::Point(960, 552);
this->label8->Name = L"label8";
this->label8->Size = System::Drawing::Size(26, 23);
this->label8->TabIndex = 20;
this->label8->Text = L"x1";
//
// label9
//
this->label9->AutoSize = true;
this->label9->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label9->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label9->Cursor = System::Windows::Forms::Cursors::Hand;
this->label9->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label9->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label9->ForeColor = System::Drawing::Color::DarkBlue;
this->label9->Location = System::Drawing::Point(903, 517);
this->label9->Name = L"label9";
this->label9->Size = System::Drawing::Size(83, 23);
this->label9->TabIndex = 21;
this->label9->Text = L"total time";
//
// label10
//
this->label10->AutoSize = true;
this->label10->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label10->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label10->Cursor = System::Windows::Forms::Cursors::Hand;
this->label10->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label10->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label10->ForeColor = System::Drawing::Color::DarkBlue;
this->label10->Location = System::Drawing::Point(903, 446);
this->label10->Name = L"label10";
this->label10->Size = System::Drawing::Size(83, 23);
this->label10->TabIndex = 22;
this->label10->Text = L"iter count";
//
// label1
//
this->label1->AutoSize = true;
this->label1->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label1->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label1->Cursor = System::Windows::Forms::Cursors::Hand;
this->label1->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label1->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label1->ForeColor = System::Drawing::Color::DarkBlue;
this->label1->Location = System::Drawing::Point(911, 692);
this->label1->Name = L"label1";
this->label1->Size = System::Drawing::Size(75, 23);
this->label1->TabIndex = 24;
this->label1->Text = L"accuracy";
//
// textBox7
//
this->textBox7->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox7->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox7->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox7->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox7->Location = System::Drawing::Point(992, 689);
this->textBox7->Name = L"textBox7";
this->textBox7->Size = System::Drawing::Size(180, 29);
this->textBox7->TabIndex = 23;
//
// label3
//
this->label3->AutoSize = true;
this->label3->BackColor = System::Drawing::SystemColors::InactiveCaption;
this->label3->BorderStyle = System::Windows::Forms::BorderStyle::Fixed3D;
this->label3->Cursor = System::Windows::Forms::Cursors::Hand;
this->label3->FlatStyle = System::Windows::Forms::FlatStyle::Flat;
this->label3->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, static_castSystem::Drawing::FontStyle(((System::Drawing::FontStyle::Bold | System::Drawing::FontStyle::Italic)
| System::Drawing::FontStyle::Underline))));
this->label3->ForeColor = System::Drawing::Color::DarkBlue;
this->label3->Location = System::Drawing::Point(868, 481);
this->label3->Name = L"label3";
this->label3->Size = System::Drawing::Size(118, 23);
this->label3->TabIndex = 26;
this->label3->Text = L"iter count LNA";
//
// textBox8
//
this->textBox8->BackColor = System::Drawing::SystemColors::ControlLightLight;
this->textBox8->Cursor = System::Windows::Forms::Cursors::Hand;
this->textBox8->Font = (gcnew System::Drawing::Font(L"Yu Gothic UI", 12, System::Drawing::FontStyle::Bold));
this->textBox8->ForeColor = System::Drawing::Color::FromArgb(static_castSystem::Int32(static_castSystem::Byte(64)), static_castSystem::Int32(static_castSystem::Byte(0)),
static_castSystem::Int32(static_castSystem::Byte(64)));
this->textBox8->Location = System::Drawing::Point(992, 478);
this->textBox8->Name = L"textBox8";
this->textBox8->Size = System::Drawing::Size(180, 29);
this->textBox8->TabIndex = 25;
//
// MyForm
//
this->AutoScaleDimensions = System::Drawing::SizeF(6, 13);
this->AutoScaleMode = System::Windows::Forms::AutoScaleMode::Font;
this->ClientSize = System::Drawing::Size(1184, 811);
this->Controls->Add(this->label3);
this->Controls->Add(this->textBox8);
this->Controls->Add(this->label1);
this->Controls->Add(this->textBox7);
this->Controls->Add(this->label10);
this->Controls->Add(this->label9);
this->Controls->Add(this->label8);
this->Controls->Add(this->label7);
this->Controls->Add(this->label6);
this->Controls->Add(this->textBox6);
this->Controls->Add(this->textBox5);
this->Controls->Add(this->textBox4);
this->Controls->Add(this->textBox3);
this->Controls->Add(this->textBox1);
this->Controls->Add(this->label2);
this->Controls->Add(this->textBox2);
this->Controls->Add(this->button1);
this->Controls->Add(this->chart2);
this->Name = L"MyForm";
this->Text = L"MyForm";
(cli::safe_castSystem::ComponentModel::ISupportInitialize^(this->chart2))->EndInit();
this->ResumeLayout(false);
this->PerformLayout();

	}

#pragma endregion
};
} // namespace TESTAGP

#include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");

text
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f); if (!rank) { Application::EnableVisualStyles(); Application::SetCompatibleTextRenderingDefault(false); Application::Run(gcnew TESTAGP::MyForm(h)); } else { AgpWaitStartAndRun(); } return 0;

}

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while(0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while(0)

#define FABE13_COS(x, result_var)
do {
const float abs_val = fabsf(x);
float reduced = fmodf(abs_val, 6.28318530718f);
if(reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if(reduced < 1.57079632679f) {
const float val2 = reduced * reduced;
const float val4 = val2 * val2;
result_var = fmaf(val4, fmaf(val2, -0.0013888889f, 0.0416666667f), fmaf(val2, -0.5f, 1.0f));
} else {
reduced = 3.14159265359f - reduced;
const float val2 = reduced * reduced;
const float val4 = val2 * val2;
result_var = -fmaf(val4, fmaf(val2, -0.0013888889f, 0.0416666667f), fmaf(val2, -0.5f, 1.0f));
}
} while(0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = n & ~7;
if(n >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while(i < limit) {
const __m256 vx = _mm256_load_ps(&in[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&cos_out[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&sin_out[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while(i < n) {
const float x = in[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if(sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if(cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while(0)

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};

__declspec(align(4)) struct InvStep final {
const uint8_t q;
const uint8_t next;
};

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right,0u,0u}, {Top,0u,1u}, {Top,1u,1u}, {Left,1u,0u} },
{ {Left,1u,1u}, {Down,1u,0u}, {Down,0u,0u}, {Right,0u,1u} },
{ {Down,1u,1u}, {Left,0u,1u}, {Left,0u,0u}, {Top,1u,0u} },
{ {Top,0u,0u}, {Right,1u,0u}, {Right,1u,1u}, {Down,0u,1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u,Right}, {1u,Top}, {3u,Left}, {2u,Top} },
{ {2u,Down}, {3u,Right}, {1u,Down}, {0u,Left} },
{ {2u,Left}, {1u,Left}, {3u,Top}, {0u,Down} },
{ {0u,Top}, {3u,Down}, {1u,Right}, {2u,Right} }
};

static const boost::mpi::environment* __restrict g_env;
static const boost::mpi::communicator* __restrict g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
ar& kind& xchg;
}
};

__declspec(align(16)) struct Slab final {
char* const __restrict base;
char* __restrict current;
char* const __restrict end;

text
__declspec(noalias) __forceinline Slab(void* const __restrict memory, const size_t usable_size) noexcept : base(static_cast<char* __restrict>(memory)) , current(base) , end(base + (usable_size & ~static_cast<size_t>(63u))) { }

};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* const __restrict memory = _aligned_malloc(16777216u, 16u);
Slab* const __restrict slab = static_cast<Slab*>(_aligned_malloc(32u, 16u));
new (slab) Slab(memory, 16777216u);

char* __restrict p = slab->base;

#pragma loop(ivdep)
while (p < slab->end) {
*p = 0u;
p += 4096u;
}
return slab;
}());

__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
__declspec(noalias) __forceinline void* operator new(const size_t) noexcept { Slab* const __restrict s = tls.local(); char* const __restrict result = s->current; s->current += 64u; return result; } __declspec(noalias) __forceinline Interval(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2) , delta_y(_y2 - _y1) , ordinate_factor(-(y1 + y2) * 2.0f) , N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)) , quadratic_term((1.0f / N_factor)* delta_y* delta_y) , M((1.0f / N_factor)* fabsf(delta_y)) { } __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }

};

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;

text
__declspec(noalias) __forceinline Peano2DMap( const int L, const float _a, const float _b, const float _c, const float _d, const uint8_t startType ) noexcept : levels(L) , a(_a), b(_b), c(_c), d(_d) , lenx(_b - _a) , leny(_d - _c) , inv_lenx(1.0f / (_b - _a)) , scale(static_cast<uint32_t>(1u) << (L << 1)) , start(startType) { }

};

static Peano2DMap gActiveMap(0, 0.0f, 0.0f, 0.0f, 0.0f, 0b00u);

static __declspec(noalias) __forceinline bool ComparePtr(const Interval* const __restrict a, const Interval* const __restrict b) noexcept
{
return a->R < b->R;
}

static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
int i = 0;
float current_state = seed, current_res, current_res2, res = 0.0f;
while (i < 10) {
XOR_RAND(current_state, current_res);
const float x_part = fmaf(-current_res, 10.0f, x);
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f), x_part * x_part, fmaf(current_res2, 0.2f, 1.0f));
delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
res -= 1.0f / delimiter;
++i;
}
return res;
}

static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * static_cast<float>(j + 1);
++j;
}
__declspec(align(32)) float sin_vals[14u];
__declspec(align(32)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);
float current_state = seed, current_res, current_res2;
XOR_RAND(current_state, current_res);
float res = fmaf(current_res, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
res += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j], fmaf(current_res2, 2.0f, -1.1f) * cos_vals[j]);
--j;
}
return res;
}

static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles_j[8u];
__declspec(align(32)) float angles_k[8u];
#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * static_cast<float>(j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}
__declspec(align(32)) float sin_j[8u], cos_j[8u];
__declspec(align(32)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);
--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed, current_res, current_res2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
const float sin_term = sin_j[j] * sin_j[j];
const float cos_term = cos_k[k] * cos_k[k];
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part1 = fmaf(current_res, sin_term, fmaf(current_res2, cos_term, part1));
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part2 = fmaf(-current_res, cos_term, fmaf(current_res2, sin_term, part2));
++k;
}
--j;
}
return -sqrtf(fmaf(part1, part1, part2 * part2));
}

static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const float _N, const float _r) noexcept
{
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(
0x3F800000u | ((reinterpret_cast<const uint32_t>(&diff) & 0x80000000u) ^ 0x80000000u))));
return _N == 1.0f
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2.0f
? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f
: fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2(Interval* const* const __restrict arr, const size_t n, const float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);

text
__m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); size_t i = 0; const int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float q[8], nf[8], ord[8]; int k = 0;

#pragma loop(ivdep)
while (k < 8) {
const Interval* const __restrict p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(ord);

text
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;

#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2(Interval* const* const __restrict arr, const size_t n, const float GLOBAL_FACTOR, const float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GLOBAL_FACTOR);
const __m256 va = _mm256_set1_ps(alpha);

text
size_t i = 0; const int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; int k = 0;

#pragma loop(ivdep)
while (k < 8) {
const Interval* const p = arr[i + k];
len[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
const __m256 vlen = _mm256_load_ps(len);
const __m256 vM = _mm256_load_ps(Mv);
const __m256 vq = _mm256_load_ps(q);
const __m256 vnf = _mm256_load_ps(nf);
const __m256 vod = _mm256_load_ps(ord);

text
const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); const __m256 t = _mm256_fmadd_ps(vm, vnf, vod); const __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;

#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
const Interval* const __restrict p = arr[i];
const float mi = fmaf(GLOBAL_FACTOR, p->x2 - p->x1, p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

__declspec(noalias) __forceinline void HitTest2D_analytic(const float x_param, float& out_x1, float& out_x2) noexcept
{
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = static_cast<uint32_t>(norm * static_cast<float>(scale)); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1;

#pragma loop(ivdep)
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

__declspec(noalias) __forceinline float FindX2D_analytic(const float px, const float py) noexcept
{
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
const float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0;

#pragma loop(ivdep)
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;

text
const uint32_t tr = static_cast<uint32_t>((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py < my)); const uint32_t none = static_cast<uint32_t>(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_reciprocal = 1.0f / static_cast<float>(scale); return fmaf(static_cast<float>(idx) * scale_reciprocal, lenx, a);

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
const int rank = g_world->rank();

text
switch (rank) { case 0: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Top)); break; case 1: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Down)); break; case 2: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Left)); break; default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Right)); } return rank;

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations,
const float a, const float b, const float r,
const bool mode, const float epsilon, const float seed,
float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; const float initial_length = b - a; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed); float x_Rmax_1 = a; float x_Rmax_2 = b; float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); const float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); const Interval* const __restrict promejutochny_otrezok = R.back(); const float new_x1 = promejutochny_otrezok->x1; const float new_x2 = promejutochny_otrezok->x2; const float len2 = new_x2 - new_point; const float len1 = new_point - new_x1; const float interval_len = len1 < len2 ? len1 : len2; if (++schetchick == static_cast<int>(global_iterations) || interval_len < epsilon) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* const __restrict curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, 1.0f); Interval* const __restrict curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, 1.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
const float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = fmaf(progress, progress, 1.0f); const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); const Interval* const __restrict top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0; int no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size(); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); } const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_1 = start_val; float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); const Interval* __restrict top_ptr; float Mmax = R.front()->M; float m = r * Mmax; while (true) { const float interval_len = x_Rmax_2 - x_Rmax_1; const bool stagnation = no_improve > 100 && schetchick > 270; const float p = fmaf(-1.0f / initial_length, dmax, 1.0f); while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in); if (in.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2); const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2); Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f); injected->ChangeCharacteristic(m); if (injected->R > 1.15f * top_ptr->R) { const float k = stagnation ? fmaf((1.0f / expm1f(1.0f)) * 1.0f, expm1f(p), 0.3f) : fmaf((1.0f / expm1f(1.0f)) * 0.6f, expm1f(p), 0.6f); injected->R = in.xchg.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } } const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f)); const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations); if (!(++schetchick % T) || stagnation || want_term) { CtrlMsg out; out.kind = want_term; if (!out.kind) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; } int i = 0; while (i < world_size) { if (i != rank) g_world->isend(i, 0, out); ++i; } if (out.kind) { if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } } const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); const float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; } else { ++no_improve; } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* const __restrict promejutochny_otrezok = R.back(); const float segment_x1 = promejutochny_otrezok->x1; const float segment_x2 = promejutochny_otrezok->x2; const float len2 = segment_x2 - new_point; const float len1 = new_point - segment_x1; Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); const float currM = curr->M > curr1->M ? curr->M : curr1->M; const size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
const float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = fmaf(progress, progress, 1.0f); const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int dummy;
float* __restrict buf;
size_t len;
while (true) {
if (g_world->iprobe(0, 1)) {
g_world->recv(0, 1, dummy);
AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &buf, &len);
}
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
int i = 1;
const int world_size = g_world->size();
while (i < world_size) {
g_world->isend(i, 1, 0);
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* const __restrict p) noexcept
{
CoTaskMemFree(p);
} - то что я сейчас прошу от тебя - это глубокое исследование моих нововведений в коде, я сейчас решаю задачу нахождения оптимального положения захвата робота манипулятора через алгоритм глобального поиска и хочу знать насколько мой подход оправдан, я точно знаю что то что делалось в одномерной и двумерной версиях алгоритма работало правильно и быстро - если будет уместно перенести оттуда часть фишек - возможно стоит это сделать - алгоритм работает в целом похоже что корректно и быстро, из штрафных действий - мне нужно только назначение штрафа на очень острые углы между звеньями робота - так как в них заклинивание, проведи глубокий анализ на предмет возможных улучшений, может быть есть явные грубые алгоритмические ошибки или может быть систему штрафов за острые углы можно усовершенстровать - в первую очередь для скорости работы алгоритма, для более быстрого нахождения оптимального положения захвата, во вторую очередь для точности работы алгоритма - что отчасти скорость так как при более точном поиске алгоритм быстрее сойдётся, читаемость и безопасность и надёжность кода не важны - важна только сходимость и скорость сходимости, давай конкретные примеры улучшений/исправлений в моём коде, также некоторые константы для алгоритма были взяты из двумерной версии - так что некоторые константы возможно стоит увеличить для большей надёжности поиска на ранних этапах, деликатный вопрос также с числом ориентаций гиперкубов - их должно быть столько - сколько есть сейчас доступных процессов

사고

Considering algorithm optimization

The user's question is asking for algorithmic improvements focused on convergence speed and reliability, not readability. They want concrete suggestions without affecting the readability of their code. The problem involves applying MortonND mappings and optimizations in parallel with MPI. I'll focus on potential changes, like modifying the cost function manipulations for better performance with multidimensional mappings.

Optimizing MortonND and memory use

The user’s code now uses MortonND instead of Peano2DMap, and they’re asking whether AgpInit and pre-initialization are necessary. They want to improve performance by adjusting early-phase constants, and evaluate the sharp joint angle penalty, memory allocations, and potential algorithmic errors. I'll suggest modifications to improve memory usage and ensure thread safety without affecting performance.

Optimizing vector allocations and message handling

In agp_run_branch_mpi, vectors like Interval* H and bestQ are frequently reallocated and processed with heaps. I’m considering options like using a custom d-ary heap or an intrusive pairing heap for efficiency, though the current make_heap might still suffice. I'll also check improvements for AVX vectorization, message schedule, and search direction adaptation.

Exploring Hölder exponent and Morton mapping

The user is exploring the use of N to compute new locations in different dimensions. The approach seems consistent with the Hölder exponent and space-filling curve properties. While Morton mapping isn't as smooth as Peano/Hilbert, they tried to mitigate this with random bit flips. Still, the performance may degrade in terms of convergence speed due to weaker locality.

Assessing Morton-like and optimization factors

The user is considering improvements to the Morton-like mapping, using Gray code, and exploring Hilbert index generation for better continuity and mapping handling. They're also evaluating the penalty structure in the manipulator algorithm and proposing updates for smoother performance, especially when it comes to joint angle differences and array management efficiency.

Optimizing processes for efficiency

The user is considering multiple improvements to algorithmic processes, like better management of intervals for ND mapping through randomized permutations, optimizing reliability parameters, and reducing computational load by avoiding redundant memory allocations. They also suggest precomputing trigonometric values, using vectorized sin/cos for efficiency, and preallocating buffers to reduce per-evaluation allocations.

Improving algorithmic efficiency

The user is considering multiple improvements, such as precomputing bit-deinterleaving masks and minimizing vector allocations in loops. They propose using Gray-code traversal to reduce mapping overhead and adjusting parameters for adaptability. Additional suggestions include optimizing scaling, early-termination conditions, and dynamic adjustments to improve performance.

Algorithm refinement suggestions

I’m analyzing a penalty issue in a kinematics algorithm that incorrectly penalizes differences between consecutive joint angles. The penalty should focus on the individual joint angle magnitudes, not the difference, to avoid misrepresenting the angles between links. Additionally, I'll suggest updating code for consistency in base length handling.

Suggestions and improvements

In the manipulator algorithm, unifying the base length uses across the UI and code is essential. Some improvements include enhancing penalty definitions, precomputing mapping data, avoiding memory allocations during evaluation, and ensuring efficient scheduling. The overall focus is optimizing performance through smooth penalties, pre-sampling, and structural refinements.

Thought for 2m 42s

답변

Ниже — разбор по сути того, что вы поменяли, где это влияет на сходимость/скорость, какие «узкие места» есть сейчас, и что именно править. Я даю только те правки, которые улучшают быстроту нахождения хорошего решения (и/или устойчивость сходимости). Про читаемость/«красоту» не заботился.


Что принципиально изменилось

  1. Редукция к 1D осталась прежней (AGP/Strongin-подобный метод на [0,1]), но:

    • в 1D/2D вы использовали континуальные развёртки (Пеано/Гильберт) → гарантированная 1/D-Хёльдеровость с хорошей локальностью,
    • в ND вы перешли на Morton (Z-order) с пермутациями/инверсиями бит → локальность хуже, непрерывность хуже (скачки на границах ячеек), Хёльдеров константа эффективно больше.

    Следствие: на ранних этапах ND поиск более «рваный», требует более сильной «надежности» (больше r, агрессивнее первичное засевание), и умнее share/injection между ранками.

  2. Целевая функция изменилась: теперь это геометрия манипулятора + штраф за «острые углы». Это отличная постановка, но у вас в ней есть логическая ошибка (ниже).


Два ключевых момента, дающих максимальный прирост

(A) Штраф за острые углы — сейчас считается неверно

В ManipCost::operator() вы аккумулируете угол phi как сумму th[i], т.е. th[i] — это как раз относительный суставный угол звена i. Тогда угол между звеньями i и i+1 — это |th[i+1]|, а не |th[i+1]-th[i]|.

Ваш текущий штраф:

cpp
for (int i = 0; i < n - 1; ++i) { float dth = th[i + 1] - th[i]; if (fabsf(dth) < minTheta) penC += minTheta - fabsf(dth); }

штрафует похожесть соседних суставных углов, а не «слишком острый» угол между звеньями. Это искажает ландшафт и может замедлять/уносить поиск.

Правильно штрафовать каждый суставный угол за малость его модуля:

cpp
// Быстрый «квадратичный хиндж», гладкий у границы: float smooth_hinge(float a) { return a * a; } // Вместо цикла по (i=0..n-2) по разности, делаем по всем суставам: for (int i = 1; i < n; ++i) { // i=1..n-1 — углы между звеньями float ai = fabsf(th[i]); if (ai < minTheta) { // Квадратичный штраф дает более «мягкий» рельеф → меньше ложных плато penC += smooth_hinge(minTheta - ai); } }

Это изменение само по себе часто ускоряет сходимость, потому что минимум становится «естественным»: алгоритм не бьётся с ложным регуляризатором гладкости (который вы невольно ввели через th[i+1]-th[i]), а решает действительно нужное ограничение.

Дополнительно (при необходимости): если для механики нежелательна и близость к π (почти разогнутый локоть), добавьте мягкий барьер возле π:

cpp
float straight = (float)M_PI - ai; if (straight < minStraight) penC += beta * smooth_hinge(minStraight - straight);

(где beta — небольшой вес; по умолчанию 0 → выключено).

(B) Убрать лишние аллокации/повторы и сделать массовое «прогревание» (низкая стоимость, ощутимый эффект)

Сейчас в agp_run_branch_mpi() вы:

  • каждый evalAt(t) создаёте std::vector<float> q(dim),
  • внутри MortonND::map01ToPoint каждый раз создаёте std::vector<uint64_t> coord(dim,0),

и второй раз считаете кинематику для (x,y) после cost(q) — т.е. делаете ту же работу 2 раза.

Сделайте три простых шага:

  1. Переиспользуйте буферы (без аллокации на каждом вызове):
cpp
std::vector<float> q(dim); // снаружи evalAt float xy[2]; auto evalAt = [&](float t) -> float { map.map01ToPoint_noalloc(t, q.data()); // версия без аллокаций внутри float f = cost(q.data(), xy); // cost возвращает f и пишет x,y if (f < bestF) { bestF = f; bestQ = q; bestX = xy[0]; bestY = xy[1]; } return f; };
  1. Дайте MortonND «рабочую память» один раз (никаких временных векторов внутри):
cpp
struct MortonND { // ... mutable std::vector<uint64_t> coord; // выделить один раз // доп. конструктор, где coord.resize(dim) void map01ToPoint_noalloc(float t, float* out) const noexcept { // использовать this->coord вместо локального вектора } };
  1. Сделайте начальное «прогревание» heap -а K точками. В ND-Z-order рельеф по t более «неровный», 2 точки (a,b) — слишком мало для адекватной оценки Mmax. Простейший и быстрый трюк:
cpp
const int K = std::min(std::max(2*dim, 8), 128); // эмпирика: 2*dim..128 H.clear(); H.reserve(maxIter + K + 8); float prev_t = a; float prev_f = f_a; for (int k = 1; k <= K; ++k) { float t = (float)k / (K + 1); float f = evalAt(t); Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim); Mmax = std::max(Mmax, I->M); I->ChangeCharacteristic(r * Mmax); H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtr); prev_t = t; prev_f = f; } { Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim); Mmax = std::max(Mmax, tail->M); tail->ChangeCharacteristic(r * Mmax); H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtr); }

Эти три пункта сокращают время на вызов в разы (без изменения логики), а «прогревание» ощутимо стабилизирует раннюю фазу ND-поиска.


Остальные улучшения (коротко, но по делу)

1) Согласовать эпсилон с ND-редукцией

Сейчас остановка по длине 1D-интервала: interval_len < eps. Для ND редукции корректнее ориентироваться на эквивалентный шаг в исходном пространстве:

cpp
bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter-1);

Это делает критерий приведения сетки сопоставимым с размером «ячейки» в ND.

2) Масштаб r для ND

Morton-редукция «жёстче», чем Гильберт: нужно больше надежности в ранней фазе. Простая рабочая эвристика:

cpp
float r_eff = r * sqrtf((float)dim); // или r * (1 + 0.15f*(dim-2)) m = r_eff * Mmax;

Это уменьшает риск «проскока» важных интервалов до того, как Mmax стабилизируется.

3) Динамический порог при инъекции интервалов (как в 2D)

В ND-ветке у вас:

cpp
if (!H.empty()) { Interval* top = H.front(); if (inj->R > 1.15f * top->R) { inj->R = in.xchg.Rtop; ... } }

В 2D вы делали лучше: коэффициент k зависел от p и стагнации. Скопируйте это поведение в ND-ветку (т.е. замените 1.15 на динамический k):

cpp
float p = fmaf(-1.0f / initial_len, dmax, 1.0f); bool stagnation = /* ваш же критерий, например no_improve > ... */; float k = stagnation ? fmaf((1.0f / expm1f(1.0f)) * 1.0f, expm1f(p), 0.3f) : fmaf((1.0f / expm1f(1.0f)) * 0.6f, expm1f(p), 0.6f); if (inj->R > k * top->R) { inj->R = in.xchg.Rtop * k; push... }

Это резко повышает полезность транс-ранговых «подсказок» именно тогда, когда они нужны.

4) Ускорить прямую кинематику (особенно при большом n)

Сейчас вы для каждого i считаете cos(phi)/sin(phi), где phi — префиксная сумма. Быстрее так:

  1. Посчитать массив префиксных сумм phi[i].
  2. ОДНОКРАТНО вызвать ваш векторизованный FABE13_SINCOS(phi, sv, cv, n).
  3. Просуммировать x += Li*cv[i], y += Li*sv[i].

Это снимает N раз вызовы скалярного полинома и использует вашу же AVX-версию. На n >= 8 прирост заметный.

Альтернатива: рекуррентно обновлять (c,s) по формуле поворота через ct=cos(th[i]), st=sin(th[i]) (одна sincos на элемент вместо «большого» phi). Эффект похож; выберите что быстрее у вас на целевой CPU.

5) AgpInit: что теперь туда вынести

Инициализация Пеано/Гильберта для 2D вам была нужна один раз. В ND-случае сам MortonND вы каждый раз создаёте заново (генерируете perm и invMask). Дешевле:

  • на этапе AgpInit для каждого rank предсгенерировать единый «длинный» пермутатор permCache[MaxDim] и маски invMaskCache[MaxDim] (seed зависит от rank, как сейчас).
  • При запуске задачи с произвольным dim ≤ MaxDim просто использовать префикс этих массивов.
  • Маску для заданных levels — это просто (1ull<<levels)-1, её так же можно держать в готовом массиве для levels = 1..20.

Скелет:

cpp
struct MortonCachePerRank { std::vector<int> permCache; // size = MaxDim (например 64) std::vector<uint64_t> invMaskCache;// size = MaxDim uint32_t baseSeed; }; static MortonCachePerRank g_mc; extern "C" __declspec(dllexport) int AgpInit(int peanoLevel, float a, float b, float c, float d) { g_env = new boost::mpi::environment(); g_world = new boost::mpi::communicator(); int rank = g_world->rank(); // Старая 2D инициализация — оставить как есть для AGP_2D. // ... // Новое: подготовим кэш для ND: const int MaxDim = 64; // выберите разумный максимум g_mc.baseSeed = 0x9E3779B9u * (rank ? rank : 1); g_mc.permCache.resize(MaxDim); for (int i = 0; i < MaxDim; ++i) g_mc.permCache[i] = i; // Fisher–Yates на всём MaxDim: uint32_t st = g_mc.baseSeed; for (int k = MaxDim - 1; k > 0; --k) { float r; XOR_RAND(st, r); int j = (int)floorf(r * (k + 1)); std::swap(g_mc.permCache[k], g_mc.permCache[j]); } g_mc.invMaskCache.resize(MaxDim); for (int k = 0; k < MaxDim; ++k) { float r; XOR_RAND(st, r); g_mc.invMaskCache[k] = (r > 0.5f) ? ~0ull : 0ull; // на уровне конструктора Morton отрежете по levels } return rank; }

И конструктор MortonND, принимающий готовые кэши без копий/аллокаций:

cpp
MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc) : dim(D), levels(L), scale(1ull), low(lows,lows+D), high(highs,highs+D), perm(D), invMask(D), coord(D, 0ull) { // просто взять первые D элементов из mc.permCache и mc.invMaskCache, // а invMask[d] &= ((1ull<<levels)-1ull) // ... }

Плюс: в agp_run_branch_mpi() зарезервируйте H на maxIter + K и bestQ на dim, чтобы не было реаллокаций.

В сумме это убирает массу мелких аллокаций и ускоряет каждый evalAt.

6) Частота обмена T и пороги адаптации — слегка «растянуть» для ND

Константы 277/264/0.3 отлично заходили в 2D. В ND уменьшите окно между обменами в начале и ослабьте порог «0.3*initial_len»:

  • thr03 = (0.2f + 0.02f*dim) * initial_len; // было 0.3
  • T0 = 96 на первых ~2*dim*K итераций, затем возвращаемся к вашей формуле T(p).
  • в «адаптивном» блоке не стесняйтесь чаще дергать RecomputeR_AffineM_AVX2 при больших dmax — ND от этого выигрывает.

7) Баланс «доли процессов vs ориентаций»

Ваше пожелание «количество ориентаций = количество процессов» — разумно. С кэшем из AgpInit выше это буквально «по ранку»: каждый rank получает свою фиксированную permCache/invMaskCache. Внутри одной задачи этого достаточно; если хотите ещё больше разнообразия — меняйте baseSeed от вызова к вызову (например seed ^ (tickCount<<rank)).

8) Мелкие, но полезные доработки

  • Инициализируйте bestQ.reserve(dim) перед поиском.
  • В Shag() добавьте минимальный «порог» для Mmax в начале (когда delta_y почти равны) — это гасит «волчок»:
    cpp
    float Mmax = std::max(root->M, 1e-6f);
  • В agp_run_branch_mpi используйте тот же «адаптивный» пересчёт m и массива R, что и в 2D (у вас уже есть этот код — просто обеспечить, чтобы он чаще срабатывал в ND при больших dmax).
  • Если сегментов много (n >= 16), вариант с префикс-суммами phi[] + FABE13_SINCOS(phi,...) почти всегда быстрее, чем скаляр «на лету».

Итоговая минимальная «пачка» изменений (что внедрить в первую очередь)

  1. Исправить штраф (см. блок (A)).
  2. Убрать аллокации в evalAt и в MortonND::map01ToPoint + возвращать x,y из cost.
  3. Прогревать heap K точками (K≈min(max(2*dim,8),128)), и от K обновить Mmax перед основным циклом.
  4. Сделать инъекцию динамической (как в 2D), а не 1.15f.
  5. Согласовать eps с ND: powf(interval_len, 1.0f/dim) < eps.
  6. Масштабировать r: r_eff = r * sqrtf(dim) (или мягко линейно).
  7. AgpInit: предсгенерировать perm/mask для ранков и использовать их в MortonND без выделений.

Эти шаги дадут заметную прибавку по скорости сходимости на ND-задаче и уменьшат «вредные» эффекты Morton-развёртки в ранней фазе. Всё остальное — тонкая настройка, которую имеет смысл крутить уже «по результатам» (T-график, thr03, веса штрафа).

이 Q&A 공유하기