#include "pch.h" #define XOR_RAND(state, result_v...

Question

#include "pch.h"

// XOR_RAND(state, result_var)
// Advances `state` with one xorshift step (shifts 13, 17, 5) performed on an
// `int` copy of it, stores the new value back into `state`, and writes
// `state * 2^-32` into `result_var`.
// Both arguments must be assignable lvalues. The line continuations are
// required: without them the body is not part of the macro.
#define XOR_RAND(state, result_var) \
do { \
    int s = (state); \
    s ^= s << 13; \
    s ^= s >> 17; \
    s ^= s << 5; \
    (state) = s; \
    (result_var) = (state) * 0x1.0p-32f; \
} while(0)

// XOR_RAND_GRSH(state, result_var)
// Same xorshift step (13, 17, 5) as XOR_RAND, but the value written to
// `result_var` is fmaf(state, 2^-31, -1.0f).
// Both arguments must be assignable lvalues. The line continuations are
// required: without them the body is not part of the macro.
#define XOR_RAND_GRSH(state, result_var) \
do { \
    int s = (state); \
    s ^= s << 13; \
    s ^= s >> 17; \
    s ^= s << 5; \
    (state) = s; \
    (result_var) = fmaf((state), 0x1.0p-31f, -1.0f); \
} while(0)

// FABE13_COS(x, result_var)
// Polynomial cosine approximation. |x| is reduced modulo 2*pi with fmodf,
// folded into [0, pi], and then either evaluated directly (below pi/2) or
// mirrored around pi/2 with the result negated. The polynomial is
// 1 - t^2/2 + t^4/24 - t^6/720.
// `x` is evaluated exactly once; `result_var` must be an assignable float.
// The line continuations are required: without them the body is not part of
// the macro.
#define FABE13_COS(x, result_var) \
do { \
    const float abs_val = fabsf((x)); \
    float reduced = fmodf(abs_val, 6.28318530718f); \
    if(reduced > 3.14159265359f) { \
        reduced = 6.28318530718f - reduced; \
    } \
    if(reduced < 1.57079632679f) { \
        const float val2 = reduced * reduced; \
        const float val4 = val2 * val2; \
        (result_var) = fmaf(val4, fmaf(val2, -0.0013888889f, 0.0416666667f), fmaf(val2, -0.5f, 1.0f)); \
    } else { \
        reduced = 3.14159265359f - reduced; \
        const float val2 = reduced * reduced; \
        const float val4 = val2 * val2; \
        (result_var) = -fmaf(val4, fmaf(val2, -0.0013888889f, 0.0416666667f), fmaf(val2, -0.5f, 1.0f)); \
    } \
} while(0)

// FABE13_SINCOS(in, sin_out, cos_out, n)
// Writes polynomial approximations of sin(in[k]) and cos(in[k]) to
// sin_out[k] / cos_out[k] for k in [0, n).
//  * The first (n & ~7) elements are processed eight at a time with AVX/FMA
//    intrinsics. That path uses aligned loads and stores, so all three arrays
//    must be 32-byte aligned.
//  * The remaining elements go through an equivalent scalar loop.
// Range reduction rounds |x| / (2*pi) to the nearest integer by adding and
// subtracting the constant 12582912.0f (1.5 * 2^23).
// The line continuations are required: without them the body is not part of
// the macro.
#define FABE13_SINCOS(in, sin_out, cos_out, n) \
do { \
    int i = 0; \
    const int limit = (n) & ~7; \
    if((n) >= 8) { \
        static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \
        static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \
        static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \
        static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \
        static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \
        static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f); \
        static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \
        static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \
        static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \
        static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f); \
        static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f); \
        static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \
        static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \
        static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \
        while(i < limit) { \
            const __m256 vx = _mm256_load_ps(&(in)[i]); \
            const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); \
            __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \
            q = _mm256_sub_ps(q, BIAS); \
            const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); \
            const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); \
            const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); \
            const __m256 t2 = _mm256_mul_ps(r2, r2); \
            const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0); \
            const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2); \
            const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \
            const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \
            _mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip)); \
            _mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip)); \
            i += 8; \
        } \
    } \
    while(i < (n)) { \
        const float x = (in)[i]; \
        const float ax = fabsf(x); \
        float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
        q -= 12582912.0f; \
        float r = fmaf(-6.28318530718f, q, ax); \
        const bool sflip = r > 3.14159265359f; \
        if(sflip) r = 6.28318530718f - r; \
        const bool cflip = r > 1.57079632679f; \
        if(cflip) r = 3.14159265359f - r; \
        const float t2 = r * r; \
        const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f); \
        const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r; \
        (cos_out)[i] = cflip ? -c : c; \
        (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
        ++i; \
    } \
} while(0)

// Two-bit cell-type codes. They are used as the first index into g_step_tbl
// and g_inv_tbl and stored in the `next` field of the table entries.
enum List : uint8_t {
    Top   = 0u,
    Down  = 1u,
    Left  = 2u,
    Right = 3u
};

// One entry of the forward table g_step_tbl: for a (cell type, quadrant digit)
// pair it gives the cell type at the next level and whether the cell origin
// moves by half a cell along x (dx) and/or y (dy).
__declspec(align(4)) struct Step final {
// Cell type (a `List` value) to use at the next refinement level.
const uint8_t next;
// Non-zero: add the halved x extent to the cell origin.
const uint8_t dx;
// Non-zero: add the halved y extent to the cell origin.
const uint8_t dy;
};

// One entry of the inverse table g_inv_tbl: for a (cell type, quadrant
// position) pair it gives the two-bit curve digit and the next cell type.
__declspec(align(4)) struct InvStep final {
// Two-bit digit appended to the curve index.
const uint8_t q;
// Cell type (a `List` value) to use at the next refinement level.
const uint8_t next;
};

// Forward lookup used by HitTest2D_analytic: g_step_tbl[type][q] where `type`
// is the current cell type and `q` is the two-bit digit of the curve index at
// the current level. Each entry is {next type, dx, dy}.
__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right,0u,0u}, {Top,0u,1u}, {Top,1u,1u}, {Left,1u,0u} },
{ {Left,1u,1u}, {Down,1u,0u}, {Down,0u,0u}, {Right,0u,1u} },
{ {Down,1u,1u}, {Left,0u,1u}, {Left,0u,0u}, {Top,1u,0u} },
{ {Top,0u,0u}, {Right,1u,0u}, {Right,1u,1u}, {Down,0u,1u} }
};

// Inverse lookup used by FindX2D_analytic: g_inv_tbl[type][dd] where `type` is
// the current cell type and `dd` encodes the quadrant as (dx << 1) | dy.
// Each entry is {curve digit q, next type}.
__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u,Right}, {1u,Top}, {3u,Left}, {2u,Top} },
{ {2u,Down}, {3u,Right}, {1u,Down}, {0u,Left} },
{ {2u,Left}, {1u,Left}, {3u,Top}, {0u,Down} },
{ {0u,Top}, {3u,Down}, {1u,Right}, {2u,Right} }
};

// Process-wide Boost.MPI handles. Both are allocated with `new` in AgpInit and
// are not released anywhere in the code visible here.
static const boost::mpi::environment* __restrict g_env;
static const boost::mpi::communicator* __restrict g_world;

// Payload exchanged between the two ranks in AGP_2D: the 2D images of both
// endpoints of the sender's best interval plus that interval's characteristic.
__declspec(align(16)) struct CrossMsg final {
    float s_x1, s_x2;   // 2D point for the interval start
    float e_x1, e_x2;   // 2D point for the interval end
    float Rtop;         // characteristic R of the sender's top interval

    // Serialization hook: feeds the five fields to the archive in
    // declaration order.
    template<typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept
    {
        ar & s_x1;
        ar & s_x2;
        ar & e_x1;
        ar & e_x2;
        ar & Rtop;
    }
};

// Control envelope sent with sendrecv in AGP_2D.
// kind == true  : exchange message, `xchg` carries data.
// kind == false : the sender wants to terminate.
__declspec(align(16)) struct CtrlMsg final {
    bool kind;
    CrossMsg xchg;

    // Serialization hook: archives `kind` first, then the payload.
    template<typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept
    {
        ar & kind;
        ar & xchg;
    }
};

// Bump-allocator descriptor over one contiguous memory region.
// `current` is advanced by Interval::operator new and reset to `base` at the
// start of AGP_1D / AGP_2D; nothing is freed individually.
__declspec(align(16)) struct Slab final {
    char* const __restrict base;     // first byte of the region
    char* __restrict current;        // next free byte
    char* const __restrict end;      // one past the last usable byte

    // memory      : start of the region.
    // usable_size : region size in bytes; rounded down to a multiple of 64
    //               when computing `end`.
    __declspec(noalias) __forceinline Slab(void* const __restrict memory, const size_t usable_size) noexcept
        : base(static_cast<char*>(memory))
        , current(base)
        , end(base + (usable_size & ~static_cast<size_t>(63u)))
    {
    }
};

// Arena used by Interval::operator new.
// The lambda is invoked immediately, so the resulting Slab* is passed to
// enumerable_thread_specific as its initial value.
// NOTE(review): because of that, every thread's local copy appears to point at
// the same Slab; confirm this is intended before calling from several threads.
//
// The 16 MiB region is requested with 64-byte alignment because Interval is
// declared align(64) and is carved out of this region in 64-byte steps.
static tbb::enumerable_thread_specific<Slab*> tls([]() noexcept {
    void* const __restrict memory = _aligned_malloc(16777216u, 64u);
    Slab* const __restrict slab = static_cast<Slab*>(_aligned_malloc(32u, 16u));
    new (slab) Slab(memory, 16777216u);

    // Write one byte every 4096 bytes across the region.
    char* __restrict p = slab->base;
#pragma loop(ivdep)
    while (p < slab->end) {
        *p = 0u;
        p += 4096u;
    }
    return slab;
}());

// One search interval [x1, x2] with function values y1, y2 at its ends and
// the cached terms needed to evaluate its characteristic R for a given m:
//   R(m) = quadratic_term / m + m * N_factor + ordinate_factor
// Instances are allocated from the slab returned by tls.local(), 64 bytes per
// object, and are never freed individually.
__declspec(align(64)) struct Interval final {
    const float x1;
    const float x2;
    const float y1;
    const float y2;
    const float delta_y;          // y2 - y1
    const float ordinate_factor;  // -2 * (y1 + y2)
    const float N_factor;         // (x2 - x1)^(1/N)
    const float quadratic_term;   // delta_y^2 / N_factor
    const float M;                // |delta_y| / N_factor
    float R;                      // not set by the constructor; see ChangeCharacteristic

    // Takes the next 64 bytes from the slab; no bounds check is performed.
    __declspec(noalias) __forceinline void* operator new(const size_t) noexcept {
        Slab* const __restrict s = tls.local();
        char* const __restrict result = s->current;
        s->current += 64u;
        return result;
    }

    // _N selects the root applied to the interval length: 1 -> length,
    // 2 -> sqrtf, otherwise powf(length, 1 / _N).
    __declspec(noalias) __forceinline Interval(const float _x1, const float _x2,
        const float _y1, const float _y2, const float _N) noexcept
        : x1(_x1), x2(_x2), y1(_y1), y2(_y2)
        , delta_y(_y2 - _y1)
        , ordinate_factor(-(y1 + y2) * 2.0f)
        , N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N))
        , quadratic_term((1.0f / N_factor) * delta_y * delta_y)
        , M((1.0f / N_factor) * fabsf(delta_y))
    {
    }

    // Recomputes R for the slope estimate _m.
    __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
    {
        R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
    }
};

// Immutable description of the mapping between the 1D parameter range [a, b]
// and the rectangle [a, b] x [c, d], refined `levels` times.
__declspec(align(16)) struct Peano2DMap final {
    const int levels;        // number of refinement levels
    const float a, b, c, d;  // rectangle bounds: x in [a, b], y in [c, d]
    const float lenx, leny;  // b - a and d - c
    const float inv_lenx;    // 1 / (b - a)
    const uint32_t scale;    // 4^levels, i.e. 1 << (2 * levels)
    const uint8_t start;     // initial cell type (a `List` value)

    // NOTE(review): `scale` is computed with a 32-bit shift, so L is presumably
    // expected to stay below 16 — confirm against callers.
    __declspec(noalias) __forceinline Peano2DMap(
        const int L,
        const float _a,
        const float _b,
        const float _c,
        const float _d,
        const uint8_t startType
    ) noexcept
        : levels(L)
        , a(_a), b(_b), c(_c), d(_d)
        , lenx(_b - _a)
        , leny(_d - _c)
        , inv_lenx(1.0f / (_b - _a))
        , scale(static_cast<uint32_t>(1u) << (L << 1))
        , start(startType)
    {
    }
};

// Map read by HitTest2D_analytic, FindX2D_analytic and AGP_2D. It starts as an
// all-zero placeholder and is re-constructed in place by AgpInit.
static Peano2DMap gActiveMap(0, 0.0f, 0.0f, 0.0f, 0.0f, 0b00u);

// Heap ordering predicate: true when `a` has a smaller characteristic R than
// `b`, so the std heap functions keep the largest R at the front.
static __declspec(noalias) __forceinline bool ComparePtr(const Interval* const __restrict a, const Interval* const __restrict b) noexcept
{
    const float lhs_R = a->R;
    const float rhs_R = b->R;
    return lhs_R < rhs_R;
}

// Test objective: a sum of ten negative reciprocal terms whose parameters are
// drawn from the XOR_RAND sequence started at `seed`.
// Each term uses three draws, in this order: centre offset, width factor,
// additive constant. The denominator is clamped away from zero with FLT_MIN.
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
    float rng_state = seed;
    float draw_a;
    float draw_b;
    float total = 0.0f;

    for (int term = 0; term < 10; ++term) {
        XOR_RAND(rng_state, draw_a);
        const float shifted = fmaf(-draw_a, 10.0f, x);

        XOR_RAND(rng_state, draw_a);
        XOR_RAND(rng_state, draw_b);
        const float raw = fmaf(fmaf(draw_a, 20.0f, 5.0f), shifted * shifted, fmaf(draw_b, 0.2f, 1.0f));

        // Keep the sign but force |denominator| >= FLT_MIN.
        const float denom = copysignf(fmaxf(fabsf(raw), FLT_MIN), raw);
        total -= 1.0f / denom;
    }
    return total;
}

// Test objective of two variables built from x1^2 + x2^2 and the FABE13_COS
// approximations of cos(2*pi*x1) and cos(2*pi*x2):
//   (q - (10*(c1 + c2) - 14.6)) * (18.42 - 0.25*q),  q = x1^2 + x2^2
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
    const float sq_sum = fmaf(x1, x1, x2 * x2);

    float c1;
    FABE13_COS(6.28318530717958647692f * x1, c1);
    float c2;
    FABE13_COS(6.28318530717958647692f * x2, c2);

    const float cos_part = fmaf(c1 + c2, 10.0f, -14.6f);
    const float envelope = fmaf(-sq_sum, 0.25f, 18.42f);
    return (sq_sum - cos_part) * envelope;
}

// Test objective: a 14-harmonic trigonometric sum with coefficients drawn from
// the XOR_RAND sequence started at `seed`.
// The constant term uses the first draw; harmonics are then accumulated from
// the 14th down to the 1st, each using two draws (sine coefficient first,
// cosine coefficient second).
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
    __declspec(align(32)) float angles[14u];
    __declspec(align(32)) float sin_vals[14u];
    __declspec(align(32)) float cos_vals[14u];

    // angles[h] = 2*pi*x*(h + 1)
    const float base_angle = 6.28318530717958647692f * x;
    int h = 0;
#pragma loop(ivdep)
    while (h < 14) {
        angles[h] = base_angle * static_cast<float>(h + 1);
        ++h;
    }
    FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

    float rng_state = seed;
    float coef_sin;
    float coef_cos;
    XOR_RAND(rng_state, coef_sin);
    float total = fmaf(coef_sin, 2.0f, -1.1f);

    for (int harmonic = 13; harmonic >= 0; --harmonic) {
        XOR_RAND(rng_state, coef_sin);
        XOR_RAND(rng_state, coef_cos);
        const float cos_part = fmaf(coef_cos, 2.0f, -1.1f) * cos_vals[harmonic];
        total += fmaf(fmaf(coef_sin, 2.0f, -1.1f), sin_vals[harmonic], cos_part);
    }
    return total;
}

// Test objective of two variables: -sqrt(part1^2 + part2^2), where both parts
// are 8x8 sums of sin^2(pi*(j+1)*x1) and cos^2(pi*(k+1)*x2) weighted by
// XOR_RAND_GRSH draws. The outer index runs from 7 down to 0, the inner from
// 0 up to 7, and every (j, k) pair consumes four draws.
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
    __declspec(align(32)) float angles_j[8u];
    __declspec(align(32)) float angles_k[8u];
    __declspec(align(32)) float sin_j[8u];
    __declspec(align(32)) float cos_j[8u];
    __declspec(align(32)) float sin_k[8u];
    __declspec(align(32)) float cos_k[8u];

    int h = 0;
#pragma loop(ivdep)
    while (h < 8) {
        const float harmonic = 3.14159265358979323846f * static_cast<float>(h + 1);
        angles_j[h] = harmonic * x1;
        angles_k[h] = harmonic * x2;
        ++h;
    }
    FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
    FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

    float rng_state = seed;
    float w_a;
    float w_b;
    float part1 = 0.0f;
    float part2 = 0.0f;

    for (int j = 7; j >= 0; --j) {
        // Depends only on j, so it is computed once per outer iteration.
        const float sin_sq = sin_j[j] * sin_j[j];
        for (size_t k = 0u; k < 8u; ++k) {
            const float cos_sq = cos_k[k] * cos_k[k];

            XOR_RAND_GRSH(rng_state, w_a);
            XOR_RAND_GRSH(rng_state, w_b);
            part1 = fmaf(w_a, sin_sq, fmaf(w_b, cos_sq, part1));

            XOR_RAND_GRSH(rng_state, w_a);
            XOR_RAND_GRSH(rng_state, w_b);
            part2 = fmaf(-w_a, cos_sq, fmaf(w_b, sin_sq, part2));
        }
    }
    return -sqrtf(fmaf(part1, part1, part2 * part2));
}

// Computes the next trial point inside the interval [x1, x2].
//   _N == 1 : ((x1 + x2) - (y2 - y1) / m) / 2
//   _N == 2 : ((x1 + x2) - sign(y2 - y1) * (y2 - y1)^2 * r / m^2) / 2
//   else    : ((x1 + x2) - sign(y2 - y1) * |y2 - y1|^N * r / m^N) / 2
// sign_mult is +1 when (y2 - y1) has its sign bit set and -1 otherwise
// (including +0), obtained with copysignf instead of reinterpreting the
// float's bits.
// The general branch raises |diff| to the power N so that it matches the
// N == 2 branch and does not produce NaN for negative differences.
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
    const float y2, const float _N, const float _r) noexcept
{
    const float diff = y2 - y1;
    const float ends_sum = x1 + x2;

    if (_N == 1.0f) {
        return fmaf(-(1.0f / _m), diff, ends_sum) * 0.5f;
    }

    const float sign_mult = copysignf(1.0f, -diff);
    if (_N == 2.0f) {
        return fmaf(sign_mult / (_m * _m), diff * diff * _r, ends_sum) * 0.5f;
    }
    return fmaf(sign_mult / powf(_m, _N), powf(fabsf(diff), _N) * _r, ends_sum) * 0.5f;
}

// Recomputes R for arr[0 .. n) using one slope estimate `m` for all intervals:
//   R = quadratic_term / m + m * N_factor + ordinate_factor
// Full groups of eight are gathered into aligned scratch arrays and evaluated
// with AVX/FMA; 1/m there comes from _mm256_rcp_ps refined by one
// Newton-Raphson step. The remaining (n % 8) intervals use
// Interval::ChangeCharacteristic.
static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2(Interval* const* const __restrict arr, const size_t n, const float m) noexcept {
    const __m256 vm = _mm256_set1_ps(m);

    __m256 vinvm = _mm256_rcp_ps(vm);
    vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

    // Largest multiple of eight not exceeding n, kept in size_t so the count is
    // never narrowed.
    const size_t limit = n & ~static_cast<size_t>(7u);
    size_t i = 0u;

    while (i < limit) {
        __declspec(align(32)) float q[8], nf[8], ord[8];
        int k = 0;
#pragma loop(ivdep)
        while (k < 8) {
            const Interval* const __restrict p = arr[i + k];
            q[k] = p->quadratic_term;
            nf[k] = p->N_factor;
            ord[k] = p->ordinate_factor;
            ++k;
        }
        const __m256 vq = _mm256_load_ps(q);
        const __m256 vnf = _mm256_load_ps(nf);
        const __m256 vod = _mm256_load_ps(ord);

        const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
        const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);

        __declspec(align(32)) float out[8];
        _mm256_store_ps(out, res);
        k = 0;
#pragma loop(ivdep)
        while (k < 8) {
            arr[i + k]->R = out[k];
            ++k;
        }
        i += 8u;
    }

    while (i < n) {
        arr[i]->ChangeCharacteristic(m);
        ++i;
    }
}

// Recomputes R for arr[0 .. n) with a per-interval slope estimate
//   m_i = GLOBAL_FACTOR * (x2 - x1) + alpha * M
//   R   = quadratic_term / m_i + m_i * N_factor + ordinate_factor
// Full groups of eight are gathered into aligned scratch arrays and evaluated
// with AVX/FMA; 1/m_i there comes from _mm256_rcp_ps refined by one
// Newton-Raphson step. The remaining (n % 8) intervals are handled by the
// scalar loop at the end.
static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2(Interval* const* const __restrict arr, const size_t n, const float GLOBAL_FACTOR, const float alpha) noexcept {
    const __m256 vGF = _mm256_set1_ps(GLOBAL_FACTOR);
    const __m256 va = _mm256_set1_ps(alpha);

    // Largest multiple of eight not exceeding n, kept in size_t so the count is
    // never narrowed.
    const size_t limit = n & ~static_cast<size_t>(7u);
    size_t i = 0u;

    while (i < limit) {
        __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8];
        int k = 0;
#pragma loop(ivdep)
        while (k < 8) {
            const Interval* const p = arr[i + k];
            len[k] = p->x2 - p->x1;
            Mv[k] = p->M;
            q[k] = p->quadratic_term;
            nf[k] = p->N_factor;
            ord[k] = p->ordinate_factor;
            ++k;
        }
        const __m256 vlen = _mm256_load_ps(len);
        const __m256 vM = _mm256_load_ps(Mv);
        const __m256 vq = _mm256_load_ps(q);
        const __m256 vnf = _mm256_load_ps(nf);
        const __m256 vod = _mm256_load_ps(ord);

        const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM));

        __m256 vinvm = _mm256_rcp_ps(vm);
        vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

        const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
        const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);

        __declspec(align(32)) float out[8];
        _mm256_store_ps(out, res);
        k = 0;
#pragma loop(ivdep)
        while (k < 8) {
            arr[i + k]->R = out[k];
            ++k;
        }
        i += 8u;
    }

    while (i < n) {
        const Interval* const __restrict p = arr[i];
        const float mi = fmaf(GLOBAL_FACTOR, p->x2 - p->x1, p->M * alpha);
        arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
        ++i;
    }
}

// Maps a 1D parameter to a 2D point using gActiveMap.
// x_param is normalised to [0, 1) over [a, a + lenx], converted to a cell
// index in [0, scale - 1], and the index is consumed two bits per level (most
// significant pair first) through g_step_tbl to locate the cell.
// out_x1 / out_x2 receive the centre of the final cell.
__declspec(noalias) __forceinline void HitTest2D_analytic(const float x_param, float& out_x1, float& out_x2) noexcept
{
    const float a = gActiveMap.a;
    const float inv_lenx = gActiveMap.inv_lenx;
    const uint32_t scale = gActiveMap.scale;
    const uint32_t scale_minus_1 = scale - 1u;
    const float lenx = gActiveMap.lenx;
    const float leny = gActiveMap.leny;
    const float c = gActiveMap.c;
    const uint8_t start = gActiveMap.start;
    const int levels = gActiveMap.levels;

    // Clamp to [0, largest float below 1].
    float norm = (x_param - a) * inv_lenx;
    norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);

    // The float product can still round up to `scale`, hence the second clamp.
    uint32_t idx = static_cast<uint32_t>(norm * static_cast<float>(scale));
    idx = idx > scale_minus_1 ? scale_minus_1 : idx;

    float sx = lenx, sy = leny;
    float x1 = a, x2 = c;
    uint8_t type = start;

    int l = levels - 1;
#pragma loop(ivdep)
    while (l >= 0) {
        const uint32_t q = (idx >> (l * 2)) & 3u;
        const Step s = g_step_tbl[type][q];
        type = s.next;
        sx *= 0.5f; sy *= 0.5f;
        x1 += s.dx ? sx : 0.0f;
        x2 += s.dy ? sy : 0.0f;
        --l;
    }
    out_x1 = x1 + sx * 0.5f;
    out_x2 = x2 + sy * 0.5f;
}

// Inverse of HitTest2D_analytic: maps a 2D point to a 1D parameter using
// gActiveMap. The point is clamped to the rectangle, then at every level the
// quadrant containing it is determined relative to the cell midpoint and
// translated through g_inv_tbl into a two-bit digit of the curve index.
// Returns a + lenx * idx / scale.
__declspec(noalias) __forceinline float FindX2D_analytic(const float px, const float py) noexcept
{
    const float a = gActiveMap.a;
    const float b = gActiveMap.b;
    const float c = gActiveMap.c;
    const float d = gActiveMap.d;
    const float lenx = gActiveMap.lenx;
    const float leny = gActiveMap.leny;
    const uint32_t scale = gActiveMap.scale;
    const uint8_t start = gActiveMap.start;
    const int levels = gActiveMap.levels;

    const float clamped_px = fminf(fmaxf(px, a), b);
    const float clamped_py = fminf(fmaxf(py, c), d);

    float sx = lenx, sy = leny;
    float x0 = a, y0 = c;
    uint32_t idx = 0u;
    uint8_t type = start;

    int l = 0;
#pragma loop(ivdep)
    while (l < levels) {
        sx *= 0.5f; sy *= 0.5f;
        const float mx = x0 + sx;
        const float my = y0 + sy;

        // Exactly one of tr / tl / dl / none is 1. `none` covers the remaining
        // quadrant and every point lying on a midline.
        const uint32_t tr = static_cast<uint32_t>((clamped_px > mx) & (clamped_py > my));
        const uint32_t tl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py > my));
        const uint32_t dl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py < my));
        const uint32_t none = static_cast<uint32_t>(1u ^ (tr | tl | dl));

        // dd = (dx << 1) | dy : tr -> 3, tl -> 1, dl -> 0, none -> 2.
        const uint32_t dd = (tr << 1) | tr | tl | (none << 1);

        const InvStep inv = g_inv_tbl[type][dd];
        type = inv.next;
        idx = (idx << 2) | inv.q;

        const uint32_t dx = dd >> 1;
        const uint32_t dy = dd & 1u;
        x0 += dx ? sx : 0.0f;
        y0 += dy ? sy : 0.0f;
        ++l;
    }

    const float scale_reciprocal = 1.0f / static_cast<float>(scale);
    return fmaf(static_cast<float>(idx) * scale_reciprocal, lenx, a);
}

// Creates the Boost.MPI environment and communicator (once) and rebuilds
// gActiveMap for the given refinement level and rectangle.
// Rank 0 starts with cell type Top, every other rank with Down.
// Returns this process's rank.
// The MPI objects are created only when the globals are still null, so a
// repeated call re-configures the map without allocating new ones.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
    if (g_env == nullptr) {
        g_env = new boost::mpi::environment();
    }
    if (g_world == nullptr) {
        g_world = new boost::mpi::communicator();
    }
    const int rank = g_world->rank();

    // Peano2DMap has only const members, so it is re-constructed in place.
    new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank ? static_cast<uint8_t>(Down) : static_cast<uint8_t>(Top));
    return rank;
}

// One-dimensional global search over [a, b] minimising ShekelFunc(x, seed).
//
// global_iterations : iteration budget (converted to int for the stop test).
// r                 : reliability multiplier applied to the slope estimate.
// mode              : enables the adaptive (length-dependent) slope estimate.
// epsilon           : stop when the shorter half of the split interval is
//                     below this length.
// out_data/out_len  : receive a CoTaskMemAlloc'ed float array. It holds a
//                     (value, point) pair for every improvement of the best
//                     value, followed by the iteration count and the final
//                     interval length. Release it with AGP_Free. If the
//                     allocation fails, *out_data is null and *out_len is 0.
//
// Intervals are kept in a max-heap ordered by their characteristic R. Inside
// the loop, after pop_heap, the interval being split still occupies the last
// slot of the vector until it is overwritten by its left half; it is therefore
// excluded from the dmax rescan and from every make_heap call.
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations,
    const float a, const float b, const float r,
    const bool mode, const float epsilon, const float seed,
    float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
    // Reset the arena: intervals from a previous run are discarded.
    Slab* const __restrict slab = tls.local();
    slab->current = slab->base;

    int iteration = 0;
    const float initial_length = b - a;
    float dmax = initial_length;
    const float threshold_03 = 0.3f * initial_length;
    const float inv_threshold_03 = 1.0f / threshold_03;
    const float start_val = ShekelFunc(a, seed);
    float best_f = ShekelFunc(b, seed);
    float x_Rmax_1 = a;
    float x_Rmax_2 = b;
    float y_Rmax_1 = start_val;
    float y_Rmax_2 = best_f;

    std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
    std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
    Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
    R.reserve(static_cast<size_t>(global_iterations) << 1u);
    R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));

    float Mmax = R.front()->M;
    float m = r * Mmax;

    // Constant-m update shared by the non-adaptive paths.
    // If the pair's M exceeds Mmax, Mmax and m are raised. When the increase is
    // at least 15% of the old Mmax, the first `live_count` intervals are
    // recomputed with the new m and the heap over them is rebuilt.
    const auto refresh_with_constant_m =
        [&](Interval* const left, Interval* const right, const float pair_M, const size_t live_count) noexcept {
            bool rebuild_heap = false;
            if (pair_M > Mmax) {
                rebuild_heap = !(pair_M - Mmax < Mmax * 0.15f);
                Mmax = pair_M;
                m = r * Mmax;
            }
            left->ChangeCharacteristic(m);
            right->ChangeCharacteristic(m);
            if (!rebuild_heap) {
                return;
            }
            if (live_count < 64u) {
                size_t i = 0u;
#pragma loop(ivdep)
                while (i < live_count) {
                    R[i]->ChangeCharacteristic(m);
                    ++i;
                }
            }
            else {
                RecomputeR_ConstM_AVX2(R.data(), live_count, m);
            }
            std::make_heap(R.begin(), R.end() - 1, ComparePtr);
        };

    while (true) {
        const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
        const float new_value = ShekelFunc(new_point, seed);

        if (new_value < best_f) {
            best_f = new_value;
            Extr.emplace_back(best_f);
            Extr.emplace_back(new_point);
        }

        // Move the interval with the largest R to the back; it is the one split.
        std::pop_heap(R.begin(), R.end(), ComparePtr);
        const Interval* const __restrict split_interval = R.back();

        const float new_x1 = split_interval->x1;
        const float new_x2 = split_interval->x2;
        const float len2 = new_x2 - new_point;
        const float len1 = new_point - new_x1;
        const float interval_len = len1 < len2 ? len1 : len2;

        if (interval_len < epsilon || ++iteration == static_cast<int>(global_iterations)) {
            Extr.emplace_back(static_cast<float>(iteration));
            Extr.emplace_back(interval_len);

            const size_t count = Extr.size();
            float* const buffer = static_cast<float*>(CoTaskMemAlloc(sizeof(float) * count));
            if (buffer != nullptr) {
                memcpy(buffer, Extr.data(), sizeof(float) * count);
            }
            *out_data = buffer;
            *out_len = buffer != nullptr ? count : 0u;
            return;
        }

        Interval* const __restrict curr = new Interval(new_x1, new_point, split_interval->y1, new_value, 1.0f);
        Interval* const __restrict curr1 = new Interval(new_point, new_x2, new_value, split_interval->y2, 1.0f);
        const float currM = curr->M > curr1->M ? curr->M : curr1->M;

        // Everything except the last slot, which still holds the split interval.
        const size_t live_count = R.size() - 1u;

        bool handled = false;
        if (mode) {
            if (len2 + len1 == dmax) {
                // The longest interval was split: find the new maximum length.
                dmax = len2 > len1 ? len2 : len1;
                size_t i = 0u;
#pragma loop(ivdep)
                while (i < live_count) {
                    const float len_item = R[i]->x2 - R[i]->x1;
                    if (len_item > dmax) dmax = len_item;
                    ++i;
                }
            }

            if ((threshold_03 > dmax && iteration % 3 == 0) || 10.0f * dmax < initial_length) {
                if (currM > Mmax) {
                    Mmax = currM;
                    m = r * Mmax;
                }
                const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
                const float alpha = fmaf(progress, progress, 1.0f);
                const float betta = 2.0f - alpha;
                const float MULTIPLIER = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
                const float GLOBAL_FACTOR = betta * global_coeff;

                curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
                curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));

                if (live_count < 64u) {
                    size_t i = 0u;
#pragma loop(ivdep)
                    while (i < live_count) {
                        R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
                        ++i;
                    }
                }
                else {
                    RecomputeR_AffineM_AVX2(R.data(), live_count, GLOBAL_FACTOR, alpha);
                }
                std::make_heap(R.begin(), R.end() - 1, ComparePtr);
                handled = true;
            }
        }
        if (!handled) {
            refresh_with_constant_m(curr, curr1, currM, live_count);
        }

        // Replace the split interval with its left half, then add the right half.
        R.back() = curr;
        std::push_heap(R.begin(), R.end(), ComparePtr);
        R.emplace_back(curr1);
        std::push_heap(R.begin(), R.end(), ComparePtr);

        const Interval* const __restrict top_ptr = R.front();
        x_Rmax_1 = top_ptr->x1;
        x_Rmax_2 = top_ptr->x2;
        y_Rmax_1 = top_ptr->y1;
        y_Rmax_2 = top_ptr->y2;
    }
}

// Two-process global search minimising RastriginFunc over the rectangle
// described by gActiveMap, using the 1D parameterisation provided by
// HitTest2D_analytic / FindX2D_analytic.
//
// N, seed           : accepted for interface compatibility; not used here.
// global_iterations : iteration budget (converted to int for the stop test).
// a, b, c, d        : rectangle bounds; the search runs on [a, b] shrunk by
//                     half a finest-level cell at both ends.
// r                 : reliability multiplier applied to the slope estimate.
// mode              : enables the adaptive (length-dependent) slope estimate.
// epsilon           : stop when the current best interval is shorter than this.
// out_data/out_len  : written only when `partner` is non-zero. They receive a
//                     CoTaskMemAlloc'ed array of (value, x1, x2) triples for
//                     every improvement, followed by the iteration count and
//                     the final interval length. If the allocation fails,
//                     *out_data is null and *out_len is 0.
//
// The process exchanges CtrlMsg with rank ^ 1. An exchange message carries the
// sender's best interval, which the receiver injects into its own heap. A
// terminate message (kind == false) from either side ends both searches.
//
// Inside the loop, after pop_heap, the interval being split still occupies the
// last slot of the vector until it is overwritten by its left half; it is
// therefore excluded from the dmax rescan and from every make_heap call.
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
    const float N, const float global_iterations,
    const float a, const float b, const float c, const float d, const float r,
    const bool mode, const float epsilon, const float seed,
    float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
    (void)N;
    (void)seed;

    // Reset the arena: intervals from a previous run are discarded.
    Slab* const __restrict slab = tls.local();
    slab->current = slab->base;

    int iteration = 0;
    int no_improve = 0;
    const int rank = g_world->rank();
    const int partner = rank ^ 1;

    // Half of a finest-level cell, as a fraction of the full extent.
    const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
    const float x_addition = (b - a) * inv_divider;
    const float y_addition = (d - c) * inv_divider;
    const float true_start = a + x_addition;
    const float true_end = b - x_addition;

    float x_Rmax_1 = true_start;
    float x_Rmax_2 = true_end;
    const float initial_length = true_end - true_start;
    float dmax = initial_length;
    const float threshold_03 = 0.3f * initial_length;
    const float inv_threshold_03 = 1.0f / threshold_03;
    const float start_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
    float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
    float y_Rmax_1 = start_val;
    float y_Rmax_2 = best_f;

    std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
    std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
    Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
    R.reserve(static_cast<size_t>(global_iterations) << 1u);
    R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));

    float Mmax = R.front()->M;
    float m = r * Mmax;

    // Give the initial interval a defined characteristic and make top_ptr valid
    // from the start: an exchange can be triggered on the very first iteration.
    if (m > 0.0f) {
        R.front()->ChangeCharacteristic(m);
    }
    else {
        R.front()->R = 0.0f;
    }
    const Interval* __restrict top_ptr = R.front();

    // Constant-m update shared by the non-adaptive paths.
    // If the pair's M exceeds Mmax, Mmax and m are raised. When the increase is
    // at least 15% of the old Mmax, the first `live_count` intervals are
    // recomputed with the new m and the heap over them is rebuilt.
    const auto refresh_with_constant_m =
        [&](Interval* const left, Interval* const right, const float pair_M, const size_t live_count) noexcept {
            bool rebuild_heap = false;
            if (pair_M > Mmax) {
                rebuild_heap = !(pair_M - Mmax < Mmax * 0.15f);
                Mmax = pair_M;
                m = r * Mmax;
            }
            left->ChangeCharacteristic(m);
            right->ChangeCharacteristic(m);
            if (!rebuild_heap) {
                return;
            }
            if (live_count < 64u) {
                size_t i = 0u;
#pragma loop(ivdep)
                while (i < live_count) {
                    R[i]->ChangeCharacteristic(m);
                    ++i;
                }
            }
            else {
                RecomputeR_ConstM_AVX2(R.data(), live_count, m);
            }
            std::make_heap(R.begin(), R.end() - 1, ComparePtr);
        };

    while (true) {
        const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
        // Exchange period. The raw value can truncate to 0, which would make the
        // modulo below divide by zero. Both neighbouring values (+1 and -1) mean
        // "exchange on every iteration", so 0 is mapped to 1.
        const int T_raw = static_cast<int>(fmaf(-expm1f(p), 253.0f, 263.0f));
        const int T = T_raw != 0 ? T_raw : 1;
        const bool stagnation = no_improve > 50 && iteration > 220;
        const bool want_xchg = ++iteration % T == 0 || stagnation;
        const float interval_len = x_Rmax_2 - x_Rmax_1;
        const bool want_term = interval_len < epsilon || iteration == static_cast<int>(global_iterations);

        if (want_xchg || want_term || g_world->iprobe(partner, 0)) {
            CtrlMsg out{};
            CtrlMsg in{};
            out.kind = !want_term;
            if (out.kind) {
                float s_x1, s_x2, e_x1, e_x2;
                HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
                HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
                out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
            }
            g_world->sendrecv(partner, 0, out, partner, 0, in);

            if (in.kind == false || out.kind == false) {
                if (partner) {
                    Extr.emplace_back(static_cast<float>(iteration));
                    Extr.emplace_back(interval_len);

                    const size_t count = Extr.size();
                    float* const buffer = static_cast<float*>(CoTaskMemAlloc(sizeof(float) * count));
                    if (buffer != nullptr) {
                        memcpy(buffer, Extr.data(), sizeof(float) * count);
                    }
                    *out_data = buffer;
                    *out_len = buffer != nullptr ? count : 0u;
                }
                return;
            }

            // Map the partner's best interval into this process's parameterisation.
            const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
            const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);

            Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
            injected->ChangeCharacteristic(m);
            const float k = stagnation ? fmaf((1.0f / expm1f(1.0f)) * 0.8f, expm1f(p), 0.4f) : fmaf((1.0f / expm1f(1.0f)) * 0.4f, expm1f(p), 0.8f);
            injected->R = in.xchg.Rtop * k;
            R.emplace_back(injected);
            std::push_heap(R.begin(), R.end(), ComparePtr);

            // The injected interval may now be the heap top. Refresh the cached
            // top so the trial point below is computed for the interval that
            // pop_heap will actually hand out.
            top_ptr = R.front();
            x_Rmax_1 = top_ptr->x1;
            x_Rmax_2 = top_ptr->x2;
            y_Rmax_1 = top_ptr->y1;
            y_Rmax_2 = top_ptr->y2;
        }

        const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
        float new_x1_val, new_x2_val;
        HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
        const float new_value = RastriginFunc(new_x1_val, new_x2_val);

        if (new_value < best_f) {
            best_f = new_value;
            Extr.emplace_back(best_f);
            Extr.emplace_back(new_x1_val);
            Extr.emplace_back(new_x2_val);
            no_improve = 0;
        }
        else {
            ++no_improve;
        }

        // Move the interval with the largest R to the back; it is the one split.
        std::pop_heap(R.begin(), R.end(), ComparePtr);
        Interval* const __restrict split_interval = R.back();

        const float segment_x1 = split_interval->x1;
        const float segment_x2 = split_interval->x2;
        const float len2 = segment_x2 - new_point;
        const float len1 = new_point - segment_x1;

        Interval* const __restrict curr = new Interval(segment_x1, new_point, split_interval->y1, new_value, 2.0f);
        Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, split_interval->y2, 2.0f);
        const float currM = curr->M > curr1->M ? curr->M : curr1->M;

        // Everything except the last slot, which still holds the split interval.
        const size_t live_count = R.size() - 1u;

        bool handled = false;
        if (mode) {
            if (len2 + len1 == dmax) {
                // The longest interval was split: find the new maximum length.
                dmax = len2 > len1 ? len2 : len1;
                size_t i = 0u;
#pragma loop(ivdep)
                while (i < live_count) {
                    const float len_item = R[i]->x2 - R[i]->x1;
                    if (len_item > dmax) dmax = len_item;
                    ++i;
                }
            }

            if ((threshold_03 > dmax && iteration % 3 == 0) || 10.0f * dmax < initial_length) {
                if (currM > Mmax) {
                    Mmax = currM;
                    m = r * Mmax;
                }
                const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
                const float alpha = fmaf(progress, progress, 1.0f);
                const float betta = 2.0f - alpha;
                const float MULTIPLIER = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
                const float GLOBAL_FACTOR = betta * global_coeff;

                curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
                curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));

                if (live_count < 64u) {
                    size_t i = 0u;
#pragma loop(ivdep)
                    while (i < live_count) {
                        R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
                        ++i;
                    }
                }
                else {
                    RecomputeR_AffineM_AVX2(R.data(), live_count, GLOBAL_FACTOR, alpha);
                }
                std::make_heap(R.begin(), R.end() - 1, ComparePtr);
                handled = true;
            }
        }
        if (!handled) {
            refresh_with_constant_m(curr, curr1, currM, live_count);
        }

        // Replace the split interval with its left half, then add the right half.
        R.back() = curr;
        std::push_heap(R.begin(), R.end(), ComparePtr);
        R.emplace_back(curr1);
        std::push_heap(R.begin(), R.end(), ComparePtr);

        top_ptr = R.front();
        x_Rmax_1 = top_ptr->x1;
        x_Rmax_2 = top_ptr->x2;
        y_Rmax_1 = top_ptr->y1;
        y_Rmax_2 = top_ptr->y2;
    }
}

// Worker loop: polls for a tag-0 message from rank 0 and, each time one
// arrives, consumes it and runs AGP_2D with a fixed parameter set, seeded from
// GetTickCount(). The loop never exits.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
    float* __restrict result_buf;
    size_t result_len;
    int start_token;

    for (;;) {
        if (!g_world->iprobe(0, 0)) {
            continue;
        }
        g_world->recv(0, 0, start_token);
        AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &result_buf, &result_len);
    }
}

// Sends the start signal (an int with value 0, tag 0) to rank 1 without
// blocking. The payload has static storage duration because a non-blocking
// send may read its buffer after this function has returned; a temporary
// would no longer exist by then.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
    static const int start_token = 0;
    g_world->isend(1, 0, start_token);
}

// Releases a result buffer returned through out_data by AGP_1D / AGP_2D
// (allocated there with CoTaskMemAlloc).
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* const __restrict p) noexcept
{
    CoTaskMemFree(p);
}
// The code above is the .cpp file of the DLL, which is used by a managed
// application. The code below is MyForm.h:
#pragma once

#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

#include <algorithm>

#include <vector>

#include <utility>

namespace TESTAGP {
using namespace System;
using namespace System::IO;

text
ref class MyForm : public System::Windows::Forms::Form {
public:
	// Builds the form and resolves the DLL exports it uses.
	// hLib is the module handle passed in by the caller (main); the looked-up
	// addresses are stored as-is, without a null check.
	MyForm(HMODULE hLib) : hLib(hLib) {
		InitializeComponent();

		f = reinterpret_cast<agp_c>(GetProcAddress(hLib, "AGP_2D"));
		pStart = reinterpret_cast<start_workers>(GetProcAddress(hLib, "AgpStartWorkers"));
		pFree = reinterpret_cast<free_agp>(GetProcAddress(hLib, "AGP_Free"));
		pInit = reinterpret_cast<PInit>(GetProcAddress(hLib, "AgpInit"));
	}

protected:
	// Destructor: deletes the `components` container held by this form.
	~MyForm()
	{
		delete components;
	}

private:
	HINSTANCE hLib = nullptr;
	typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*);
	typedef void(__cdecl* start_workers)();
	typedef void(__cdecl* free_agp)(float*);


	typedef int(__cdecl* PInit)(int, float, float, float, float);

	agp_c f = nullptr;
	start_workers pStart = nullptr;
	free_agp pFree = nullptr;
	PInit pInit = nullptr;

	System::Windows::Forms::Button^ button1;
	System::Windows::Forms::TextBox^ textBox2;
	System::Windows::Forms::DataVisualization::Charting::Chart^ chart2;
	System::Windows::Forms::Label^ label2;
	System::Windows::Forms::Label^ label3;
	System::Windows::Forms::TextBox^ textBox1;
	System::Windows::Forms::TextBox^ textBox3;
	System::Windows::Forms::TextBox^ textBox4;
	System::Windows::Forms::TextBox^ textBox5;
	System::Windows::Forms::TextBox^ textBox6;
	System::Windows::Forms::Label^ label6;
	System::Windows::Forms::Label^ label7;
	System::Windows::Forms::Label^ label8;
	System::Windows::Forms::Label^ label9;
	System::Windows::Forms::Label^ label10;
	System::Windows::Forms::Label^ label1;
	System::Windows::Forms::TextBox^ textBox7;
	System::Windows::Forms::TextBox^ textBox8;
	System::ComponentModel::Container^ components;

	System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) {

		chart2->Series[0]->Points->Clear();
		chart2->Series[1]->Points->Clear();
		chart2->Series[2]->Points->Clear();
		chart2->Series[3]->Points->Clear();

		static LARGE_INTEGER start, end;
		QueryPerformanceCounter(&start);

		float* buf = nullptr;
		size_t len = 0u;

		pStart();  // разбудили rank != 0
		f(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &buf, &len);

		if (buf != nullptr) {
			std::vector<float> Extr_2D(buf, buf + len);
			pFree(buf);

			// Для 2D ветки: извлекаем schetchick и interval_len
			textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick
			Extr_2D.pop_back();
			textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len
			Extr_2D.pop_back();

			// Извлекаем последнюю тройку (x2, x1, f)
			float x2 = Extr_2D.back();
			Extr_2D.pop_back();
			float x1 = Extr_2D.back();
			Extr_2D.pop_back();
			float f_val = Extr_2D.back();
			Extr_2D.pop_back();

			// Выводим координаты и значение функции
			textBox4->Text = Convert::ToString(x1);
			textBox3->Text = Convert::ToString(x2);
			textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции

			// Отображаем точку на графике
			chart2->Series[2]->Points->AddXY(x1, x2);

			// Обрабатываем остальные точки (в порядке: f, x1, x2)
			while (!Extr_2D.empty()) {
				float x2_point = Extr_2D.back();
				Extr_2D.pop_back();
				float x1_point = Extr_2D.back();
				Extr_2D.pop_back();
				float f_point = Extr_2D.back();
				Extr_2D.pop_back();

				// Добавляем точку на график (только координаты)
				chart2->Series[0]->Points->AddXY(x1_point, x2_point);
			}
		}

		QueryPerformanceCounter(&end);

		LARGE_INTEGER freq;
		QueryPerformanceFrequency(&freq);
		const int multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL;

		textBox5->Text = Convert::ToString(
			((end.QuadPart - start.QuadPart) * multiplier) >> 32
		) + " microseconds";

		//textBox2->Text = Convert::ToString(Extr_2D.back());
		//textBox1->Text = Convert::ToString(Extr_2D_LNA.back());
	}

Код MyForm.cpp:

#include "MyForm.h"

#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");

text
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

if (rank == 0) {
	Application::EnableVisualStyles();
	Application::SetCompatibleTextRenderingDefault(false);
	Application::Run(gcnew TESTAGP::MyForm(h));
}
else {
	AgpWaitStartAndRun();
}
return 0;

}

Сейчас я запускаю это для двух процессов через mpiexec, но для двумерного случая выгоднее использовать 4 процесса, где каждый процесс вычисляет на своей развёртке — верхней, нижней, левой или правой. Тогда только нулевой процесс по-прежнему будет взаимодействовать с интерфейсом и считать, а остальные 3 процесса будут считать независимо. Наилучший обмен данными получится, когда каждый процесс рассылает свои лучшие значения всем остальным. Также придётся модифицировать функции:

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int dummy;
float* __restrict buf;
size_t len;
while (true) {
if (g_world->iprobe(0, 0)) {
g_world->recv(0, 0, dummy);
AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &buf, &len);
}
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->isend(1, 0, 0);
}

Первая функция останется без изменений, так как сигнал о старте по-прежнему будет исходить от нулевого процесса. Вторую функцию придётся модифицировать, так как этот сигнал нужно отправлять всем остальным трём процессам. Сделай эти изменения и приведи полный модифицированный код для всех файлов. Учти, что важна максимальная скорость, поэтому используй все возможные оптимизации (как у меня, например, вычисление ранга партнёра через битовые операции). Ещё придётся заменить условие «если партнёр» на «если ранг == 0», так как теперь процессов будет больше. Приведи полный код; думай как можно дольше, чтобы дать лучший ответ — код должен быть максимально оптимизированным и быстрым.

Answer 1

Ниже — полностью готовая, оптимизированная версия под 4 процесса. Изменения:

- AgpInit: каждому рангу назначается своя развёртка Пеано (Top/Down/Left/Right).
- AGP_2D: обмен «лучшими» интервалами идёт со всеми (all-to-all) неблокирующими отправками; при получении kind=false — немедленное завершение; условие записи результата заменено на if (rank == 0).
- AgpStartWorkers: будит все остальные ранги (1..size-1) неблокирующими isend.
- Лёгкая настройка стартовых точек для 4 ориентаций (чтобы ранги начинали с разных сторон).
- Остальной высокопроизводительный код (SIMD, ETS-slab, heap) оставлен как есть.

TEST_FUNC.cpp (DLL)

cpp
#include "pch.h"
// остальные ваши include'ы, как были
#include <boost/mpi.hpp>
#include <boost/align/aligned_allocator.hpp>
#include <tbb/enumerable_thread_specific.h>
#include <vector>
#include <algorithm>
#include <cmath>
#include <cfloat>
#include <stdint.h>
#include <intrin.h>

// ======= ВЕСЬ ВАШ ИСХОДНЫЙ КОД МАКРОСОВ/СТРУКТУР/ТАБЛИЦ без изменений =======

// One xorshift32 step (shift triple 13/17/5) on `state`, then
// result_var = state * 2^-32.
//
// The shifts are carried out on an unsigned 32-bit copy: with a signed int the
// right shift is arithmetic (it smears the sign bit into the value) and the
// left shifts overflow, so the recurrence was not a real xorshift32. The
// scratch variable carries a macro-private name so that it cannot shadow a
// caller variable called `s`.
#define XOR_RAND(state, result_var) \
    do { \
        uint32_t _xr_s_ = static_cast<uint32_t>(state); \
        _xr_s_ ^= _xr_s_ << 13; \
        _xr_s_ ^= _xr_s_ >> 17; \
        _xr_s_ ^= _xr_s_ << 5; \
        (state) = _xr_s_; \
        (result_var) = (state) * 0x1.0p-32f; \
    } while(0)

// One xorshift32 step (shift triple 13/17/5) on `state`, then
// result_var = state * 2^-31 - 1 (computed with a single fmaf).
//
// The shifts are carried out on an unsigned 32-bit copy: with a signed int the
// right shift is arithmetic and the left shifts overflow, so the recurrence
// was not a real xorshift32. The scratch variable carries a macro-private name
// so that it cannot shadow a caller variable called `s`.
#define XOR_RAND_GRSH(state, result_var) \
    do { \
        uint32_t _xg_s_ = static_cast<uint32_t>(state); \
        _xg_s_ ^= _xg_s_ << 13; \
        _xg_s_ ^= _xg_s_ >> 17; \
        _xg_s_ ^= _xg_s_ << 5; \
        (state) = _xg_s_; \
        (result_var) = fmaf((state), 0x1.0p-31f, -1.0f); \
    } while(0)

// Polynomial cosine approximation: result_var receives an approximation of
// cos(x). |x| is reduced with fmodf into [0, 2*pi), mirrored into [0, pi], and,
// when the reduced angle is not below pi/2, reflected around pi/2 with the
// sign of the result flipped. The polynomial is 1 - t/2 + t^2/24 - t^3/720
// with t = angle^2, evaluated with fmaf.
#define FABE13_COS(x, result_var) \
    do { \
        float _fc_ang_ = fmodf(fabsf(x), 6.28318530718f); \
        if (_fc_ang_ > 3.14159265359f) { \
            _fc_ang_ = 6.28318530718f - _fc_ang_; \
        } \
        const bool _fc_neg_ = !(_fc_ang_ < 1.57079632679f); \
        if (_fc_neg_) { \
            _fc_ang_ = 3.14159265359f - _fc_ang_; \
        } \
        const float _fc_sq_ = _fc_ang_ * _fc_ang_; \
        const float _fc_qd_ = _fc_sq_ * _fc_sq_; \
        const float _fc_poly_ = fmaf(_fc_qd_, fmaf(_fc_sq_, -0.0013888889f, 0.0416666667f), fmaf(_fc_sq_, -0.5f, 1.0f)); \
        result_var = _fc_neg_ ? -_fc_poly_ : _fc_poly_; \
    } while(0)

// Computes polynomial approximations of sin and cos for the first n elements
// of `in`, writing them to `sin_out` and `cos_out`.
//
// When n >= 8 the first (n & ~7) elements are processed eight at a time with
// AVX/FMA intrinsics using the aligned load/store forms (_mm256_load_ps /
// _mm256_store_ps), so the three arrays presumably have to be 32-byte aligned
// - TODO confirm at the call sites. The remaining elements go through the
// scalar loop at the end, which uses the same constants.
//
// Both paths form q = (|x| * 1/(2*pi) + 12582912.0f) - 12582912.0f and
// r = |x| - 2*pi*q, then fold r with the pi and pi/2 comparisons before
// evaluating the cosine and sine polynomials in t2 = r*r.
// NOTE(review): the add/subtract of 12582912.0f looks like a round-to-nearest
// trick, in which case r can be negative; the folds below only act on
// r > pi and r > pi/2 - confirm the accuracy for negative r.
// NOTE(review): the macro declares locals named `i` and `limit` and uses `n`
// unparenthesised (`n & ~7`), so arguments using those names or a compound
// expression for n would misbehave - confirm against callers.
#define FABE13_SINCOS(in, sin_out, cos_out, n) \
    do { \
        int i = 0; \
        const int limit = n & ~7; \
        if(n >= 8) { \
            static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \
            static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \
            static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \
            static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \
            static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \
            static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f); \
            static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \
            static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \
            static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \
            while(i < limit) { \
                const __m256 vx = _mm256_load_ps(&in[i]); \
                const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); \
                __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \
                q = _mm256_sub_ps(q, BIAS); \
                const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); \
                const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); \
                const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); \
                const __m256 t2 = _mm256_mul_ps(r2, r2); \
                const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0); \
                const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2); \
                const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \
                const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \
                _mm256_store_ps(&cos_out[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip)); \
                _mm256_store_ps(&sin_out[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip)); \
                i += 8; \
            } \
        } \
        while(i < n) { \
            const float x = in[i]; \
            const float ax = fabsf(x); \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
            q -= 12582912.0f; \
            float r = fmaf(-6.28318530718f, q, ax); \
            const bool sflip = r > 3.14159265359f; \
            if(sflip) r = 6.28318530718f - r; \
            const bool cflip = r > 1.57079632679f; \
            if(cflip) r = 3.14159265359f - r; \
            const float t2 = r * r; \
            const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f); \
            const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r; \
            cos_out[i] = cflip ? -c : c; \
            sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
            ++i; \
        } \
    } while(0)

// Two-bit orientation codes stored in one byte; AgpInit passes one of them
// to Peano2DMap as the start type.
enum List : uint8_t {
    Top   = 0u,
    Down  = 1u,
    Left  = 2u,
    Right = 3u
};
// ... все таблицы шагов, структуры, RNG/функции Shekel/Rastrigin/Hill/Grishagin, Shag,
// RecomputeR_*_AVX2, HitTest2D_analytic, FindX2D_analytic — БЕЗ ИЗМЕНЕНИЙ ...

// --------- global MPI objects ----------
// Both pointers are assigned by AgpInit (heap-allocated there) and are read by
// the other exported functions; no code in this listing deletes them.
static const boost::mpi::environment* __restrict g_env;
static const boost::mpi::communicator* __restrict g_world;

// ===== Serializable messages =====
// Payload describing one interval: 2D coordinates of its start (s_*) and
// end (e_*) points plus the characteristic value of the sender's top interval.
__declspec(align(16)) struct CrossMsg final {
    float s_x1;
    float s_x2;
    float e_x1;
    float e_x2;
    float Rtop;

    // Serialization hook: feeds the five fields to the archive in declaration order.
    template<typename Ar>
    __declspec(noalias) __forceinline void serialize(Ar& ar, const unsigned int) noexcept {
        ar & s_x1;
        ar & s_x2;
        ar & e_x1;
        ar & e_x2;
        ar & Rtop;
    }
};

// Control envelope exchanged between ranks inside AGP_2D.
__declspec(align(16)) struct CtrlMsg final {
    bool kind;      // true -> carries data in xchg; false -> terminate
    CrossMsg xchg;

    // Serialization hook: writes/reads the flag first, then the payload.
    template<typename Ar>
    __declspec(noalias) __forceinline void serialize(Ar& ar, const unsigned int) noexcept {
        ar & kind;
        ar & xchg;
    }
};

// ================== AgpInit: four sweep orientations ==================
// Creates the MPI environment and communicator, rebuilds the global
// Peano2DMap in place for this rank's orientation and returns the rank.
//   peanoLevel  - level passed through to Peano2DMap
//   a, b, c, d  - bounds passed through to Peano2DMap
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline
int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept {
    g_env  = new boost::mpi::environment();
    g_world = new boost::mpi::communicator();

    const int rank = g_world->rank();

    // The List codes are exactly 0..3, so the low two bits of the rank already
    // are the orientation code and no branch is needed.
    static_assert(Top == 0b00u && Down == 0b01u && Left == 0b10u && Right == 0b11u,
                  "rank & 3 must map directly onto List");
    const uint8_t startType = static_cast<uint8_t>(rank & 3);

    new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, startType);
    return rank;
}

// ================== AGP_1D (без изменений) ==================
// ВЕСЬ ВАШ КОД AGP_1D — БЕЗ ИЗМЕНЕНИЙ

// ================== AGP_2D: all-to-all exchange ==================
// Runs the interval-subdivision search loop on the calling rank. Each
// iteration may (1) terminate, (2) broadcast this rank's top interval to all
// other ranks, (3) drain and inject intervals received from other ranks, and
// then (4) performs one subdivision step of its own.
//
// Parameters read by this body: global_iterations (iteration cap, also used
// to size the reserves), a/b and c/d (bounds used for the start points),
// r (multiplier applied to Mmax), mode (selects the dmax-tracking branch),
// epsilon (interval-length stop threshold). N and seed are not read here.
//
// Output: only when rank == 0, *out_data receives a CoTaskMemAlloc'ed copy of
// Extr (triples f, x1, x2 for every improvement) followed by the iteration
// counter and then the current interval length; *out_len is the float count.
// On other ranks out_data/out_len are left untouched.
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
    const float N, const float global_iterations,
    const float a, const float b, const float c, const float d, const float r,
    const bool mode, const float epsilon, const float seed,
    float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
    Slab* const __restrict slab = tls.local();
    slab->current = slab->base;

    int schetchick = 0;   // iteration counter
    int no_improve = 0;   // consecutive steps without a new best value
    const int rank = g_world->rank();
    const int world_size = g_world->size();

    const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
    const float x_addition = (b - a) * inv_divider;
    const float y_addition = (d - c) * inv_divider;
    const float true_start = a + x_addition;
    const float true_end   = b - x_addition;

    float x_Rmax_1 = true_start;
    float x_Rmax_2 = true_end;
    const float initial_length = true_end - true_start;
    float dmax = initial_length;
    const float threshold_03 = 0.3f * initial_length;
    const float inv_threshold_03 = 1.0f / threshold_03;

    // Different start points for the different orientations, chosen from the rank bits
    const bool bottomSide = ((rank & 3) == Top) || ((rank & 3) == Left);
    const float y_low  = c + y_addition;
    const float y_high = d - y_addition;

    const float sY = bottomSide ? y_low : y_high;
    const bool evenRank = (rank & 1) == 0;
    const float sX = evenRank ? true_start : true_end;
    const float eX = evenRank ? true_end   : true_start;

    const float start_val = RastriginFunc(sX, sY);
    float best_f = RastriginFunc(eX, sY);
    float y_Rmax_1 = start_val;
    float y_Rmax_2 = best_f;

    std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
    std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
    Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
    R.reserve(static_cast<size_t>(global_iterations) << 1u);
    R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
    const Interval* __restrict top_ptr = R.front(); // initialised right away

    float Mmax = R.front()->M;
    float m = r * Mmax;

    while (true) {
        const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
        const int T = static_cast<int>(fmaf(-expm1f(p), 253.0f, 263.0f));
        const bool stagnation = no_improve > 50 && schetchick > 220;
        const float interval_len = x_Rmax_2 - x_Rmax_1;
        const bool want_term = (interval_len < epsilon) || (schetchick == static_cast<int>(global_iterations));
        // schetchick is incremented here, and only when want_term is false (short-circuit).
        const bool want_xchg = (!want_term) && ((++schetchick % T == 0) || stagnation);

        // --- termination: notify every other rank and leave
        if (want_term) {
            CtrlMsg out{};
            out.kind = false; // terminate
            // NOTE(review): the request returned by isend is discarded and the
            // function returns right after - confirm the message buffer stays
            // valid until the sends complete.
            for (int dst = 0; dst < world_size; ++dst) {
                if (dst == rank) continue;
                g_world->isend(dst, 0, out);
            }
            if (rank == 0) {
                Extr.emplace_back(static_cast<float>(schetchick));
                Extr.emplace_back(interval_len);
                *out_len = Extr.size();
                // NOTE(review): the allocation result is not checked before memcpy.
                *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
                memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
            }
            return;
        }

        // --- proactive broadcast of this rank's best interval to everyone else
        if (want_xchg) {
            CtrlMsg out{};
            out.kind = true;
            float s_x1, s_x2, e_x1, e_x2;
            HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
            HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
            out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
            // NOTE(review): isend requests are discarded here as well.
            for (int dst = 0; dst < world_size; ++dst) {
                if (dst == rank) continue;
                g_world->isend(dst, 0, out);
            }
        }

        // --- drain every pending incoming message (any source, tag 0), non-blocking
        // NOTE(review): tag 0 is also the tag of the start token sent by
        // AgpStartWorkers - confirm the two message kinds cannot be mixed up.
        while (g_world->iprobe(boost::mpi::any_source, 0)) {
            CtrlMsg in{};
            g_world->recv(boost::mpi::any_source, 0, in);
            if (!in.kind) { // some rank has terminated
                if (rank == 0) {
                    Extr.emplace_back(static_cast<float>(schetchick));
                    Extr.emplace_back(interval_len);
                    *out_len = Extr.size();
                    *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
                    memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
                }
                return;
            }
            // inject the "best" interval received from the other rank
            const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
            const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
            Interval* const __restrict injected =
                new Interval(sx, ex,
                             RastriginFunc(in.xchg.s_x1, in.xchg.s_x2),
                             RastriginFunc(in.xchg.e_x1, in.xchg.e_x2),
                             2.0f);
            injected->ChangeCharacteristic(m);
            // The sender's Rtop is scaled by k and overrides the characteristic just computed.
            const float k = stagnation
                ? fmaf((1.0f / expm1f(1.0f)) * 0.8f, expm1f(p), 0.4f)
                : fmaf((1.0f / expm1f(1.0f)) * 0.4f, expm1f(p), 0.8f);
            injected->R = in.xchg.Rtop * k;
            R.emplace_back(injected);
            std::push_heap(R.begin(), R.end(), ComparePtr);
        }

        // --- this rank's own step
        const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
        float new_x1_val, new_x2_val;
        HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
        const float new_value = RastriginFunc(new_x1_val, new_x2_val);

        // Record every strict improvement as a triple (f, x1, x2).
        if (new_value < best_f) {
            best_f = new_value;
            Extr.emplace_back(best_f);
            Extr.emplace_back(new_x1_val);
            Extr.emplace_back(new_x2_val);
            no_improve = 0;
        } else {
            ++no_improve;
        }

        // Move the heap top to the back; its slot is reused for `curr` below.
        std::pop_heap(R.begin(), R.end(), ComparePtr);
        Interval* const __restrict promejutochny_otrezok = R.back();

        const float segment_x1 = promejutochny_otrezok->x1;
        const float segment_x2 = promejutochny_otrezok->x2;
        const float len2 = segment_x2 - new_point;
        const float len1 = new_point - segment_x1;

        // Split the popped interval at new_point into two children.
        Interval* const __restrict curr  = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
        Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
        const float currM = curr->M > curr1->M ? curr->M : curr1->M;
        const size_t r_size = R.size();

        if (mode) {
            // If the split interval was the longest one, rescan for the new maximum length.
            if (len2 + len1 == dmax) {
                dmax = len2 > len1 ? len2 : len1;
                size_t i = 0u;
#pragma loop(ivdep)
                while (i < r_size) {
                    const float len_item = R[i]->x2 - R[i]->x1;
                    if (len_item > dmax) dmax = len_item;
                    ++i;
                }
            }

            if (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) {
                if (currM > Mmax) {
                    Mmax = currM;
                    m = r * Mmax;
                }
                const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
                const float alpha = fmaf(progress, progress, 1.0f);
                const float betta = 2.0f - alpha;
                const float MULTIPLIER = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
                const float GLOBAL_FACTOR = betta * global_coeff;

                curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
                curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));

                // Recompute every stored characteristic: scalar loop for small heaps, AVX2 helper otherwise.
                if (r_size < 64u) {
                    size_t i = 0u;
#pragma loop(ivdep)
                    while (i < r_size) {
                        R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
                        ++i;
                    }
                } else {
                    RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
                }
                std::make_heap(R.begin(), R.end(), ComparePtr);
            } else {
                if (currM > Mmax) {
                    // Growth below 15% of Mmax: only the two new intervals use the new m.
                    if (currM - Mmax < Mmax * 0.15f) {
                        Mmax = currM; m = r * Mmax;
                        curr->ChangeCharacteristic(m);
                        curr1->ChangeCharacteristic(m);
                    } else {
                        // Larger growth: recompute all characteristics and rebuild the heap.
                        Mmax = currM; m = r * Mmax;
                        curr->ChangeCharacteristic(m);
                        curr1->ChangeCharacteristic(m);
                        if (r_size < 64u) {
                            size_t i = 0u;
#pragma loop(ivdep)
                            while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
                        } else {
                            RecomputeR_ConstM_AVX2(R.data(), r_size, m);
                        }
                        std::make_heap(R.begin(), R.end(), ComparePtr);
                    }
                } else {
                    curr->ChangeCharacteristic(m);
                    curr1->ChangeCharacteristic(m);
                }
            }
        } else {
            if (currM > Mmax) {
                // Growth below 15% of Mmax: only the two new intervals use the new m.
                if (currM - Mmax < Mmax * 0.15f) {
                    Mmax = currM; m = r * Mmax;
                    curr->ChangeCharacteristic(m);
                    curr1->ChangeCharacteristic(m);
                } else {
                    // Larger growth: recompute all characteristics and rebuild the heap.
                    Mmax = currM; m = r * Mmax;
                    curr->ChangeCharacteristic(m);
                    curr1->ChangeCharacteristic(m);
                    if (r_size < 64u) {
                        size_t i = 0u;
#pragma loop(ivdep)
                        while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
                    } else {
                        RecomputeR_ConstM_AVX2(R.data(), r_size, m);
                    }
                    std::make_heap(R.begin(), R.end(), ComparePtr);
                }
            } else {
                curr->ChangeCharacteristic(m);
                curr1->ChangeCharacteristic(m);
            }
        }

        // Replace the popped slot with the first child, then append the second.
        R.back() = curr;
        std::push_heap(R.begin(), R.end(), ComparePtr);
        R.emplace_back(curr1);
        std::push_heap(R.begin(), R.end(), ComparePtr);

        // Cache the new heap top for the next iteration.
        top_ptr = R.front();
        x_Rmax_1 = top_ptr->x1;
        x_Rmax_2 = top_ptr->x2;
        y_Rmax_1 = top_ptr->y1;
        y_Rmax_2 = top_ptr->y2;
    }
}

// ================== Worker loop ==================
// Loops forever: polls for a tag-0 message from rank 0, receives it as an int
// and then runs AGP_2D with the fixed argument set below. There is no exit
// path, so this function does not return.
// NOTE(review): AGP_2D also exchanges its control messages on tag 0 - confirm
// that a leftover control message from rank 0 cannot be picked up here as the
// int start token.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline
void AgpWaitStartAndRun() noexcept {
    int start_token;
    float* __restrict result_data;
    size_t result_count;
    for (;;) {
        // Non-blocking probe; keep spinning until a message is pending.
        if (!g_world->iprobe(0, 0)) {
            continue;
        }
        g_world->recv(0, 0, start_token);
        // The output pointer/length are handed to AGP_2D but not read afterwards here.
        AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &result_data, &result_count);
    }
}

// ================== Wake ALL workers ==================
// Sends the start token (one int, tag 0) to every rank from 1 to size-1.
//
// A blocking send of a named token is used instead of isend(r, 0, 0): the
// non-blocking form was handed a temporary whose lifetime ended before the
// transfer could complete, and every returned request was dropped without
// being waited on. send() returns only once the token buffer may be reused.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline
void AgpStartWorkers() noexcept {
    const int start_token = 0;
    const int world_size = g_world->size();
    for (int dst = 1; dst < world_size; ++dst) {
        g_world->send(dst, 0, start_token);
    }
}

// Releases a buffer with CoTaskMemFree; AGP_2D allocates its result buffer
// with CoTaskMemAlloc, so callers hand that buffer back through this export.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline
void AGP_Free(float* const __restrict p) noexcept
{
    CoTaskMemFree(static_cast<void*>(p));
}

MyForm.h (минимальные правки в комментариях; код совместим как есть)

cpp
#pragma once

#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

#include <algorithm>
#include <vector>
#include <utility>

namespace TESTAGP {
    using namespace System;
    using namespace System::IO;

    // Main window: on button click it wakes the worker ranks, runs AGP_2D from
    // the DLL on this process and displays the returned points and timing.
    ref class MyForm : public System::Windows::Forms::Form {
    public:
        // hLib is the handle of the already loaded DLL; the exports used by the
        // form are resolved here and validated before use in button1_Click.
        MyForm(HMODULE hLib) : hLib(hLib) {
            InitializeComponent();
            f      = reinterpret_cast<agp_c>(GetProcAddress(hLib, "AGP_2D"));
            pStart = reinterpret_cast<start_workers>(GetProcAddress(hLib, "AgpStartWorkers")); // wakes every rank != 0
            pFree  = reinterpret_cast<free_agp>(GetProcAddress(hLib, "AGP_Free"));
            pInit  = reinterpret_cast<PInit>(GetProcAddress(hLib, "AgpInit"));
        }

    protected:
        ~MyForm() { delete components; }

    private:
        HINSTANCE hLib = nullptr;
        typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*);
        typedef void(__cdecl* start_workers)();
        typedef void(__cdecl* free_agp)(float*);
        typedef int (__cdecl* PInit)(int, float, float, float, float);

        agp_c f = nullptr;
        start_workers pStart = nullptr;
        free_agp pFree = nullptr;
        PInit pInit = nullptr;

        System::Windows::Forms::Button^ button1;
        System::Windows::Forms::TextBox^ textBox2;
        System::Windows::Forms::DataVisualization::Charting::Chart^ chart2;
        System::Windows::Forms::Label^ label2;
        System::Windows::Forms::Label^ label3;
        System::Windows::Forms::TextBox^ textBox1;
        System::Windows::Forms::TextBox^ textBox3;
        System::Windows::Forms::TextBox^ textBox4;
        System::Windows::Forms::TextBox^ textBox5;
        System::Windows::Forms::TextBox^ textBox6;
        System::Windows::Forms::Label^ label6;
        System::Windows::Forms::Label^ label7;
        System::Windows::Forms::Label^ label8;
        System::Windows::Forms::Label^ label9;
        System::Windows::Forms::Label^ label10;
        System::Windows::Forms::Label^ label1;
        System::Windows::Forms::TextBox^ textBox7;
        System::Windows::Forms::TextBox^ textBox8;
        System::ComponentModel::Container^ components;

        // Click handler: clears the chart, starts the workers, runs AGP_2D,
        // unpacks the returned float buffer and shows the elapsed time.
        System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) {
            // A missing export would otherwise be called through a null pointer.
            if (f == nullptr || pStart == nullptr || pFree == nullptr) {
                return;
            }

            chart2->Series[0]->Points->Clear();
            chart2->Series[1]->Points->Clear();
            chart2->Series[2]->Points->Clear();
            chart2->Series[3]->Points->Clear();

            LARGE_INTEGER start, end;
            QueryPerformanceCounter(&start);

            float* buf = nullptr;
            size_t len = 0u;

            pStart(); // wake every rank != 0
            f(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, GetTickCount(), &buf, &len);

            if (buf != nullptr) {
                std::vector<float> Extr_2D(buf, buf + len);
                pFree(buf);

                // Two trailing scalars follow the triples.
                // NOTE(review): AGP_2D in the DLL listing appends the iteration
                // counter first and interval_len last, so the first back() is
                // interval_len - confirm which text box should show which value.
                if (Extr_2D.size() >= 2u) {
                    textBox7->Text = Convert::ToString(Extr_2D.back()); Extr_2D.pop_back();
                    textBox6->Text = Convert::ToString(Extr_2D.back()); Extr_2D.pop_back();
                }

                // Last recorded triple is stored as (f, x1, x2), so it is read
                // back in the order x2, x1, f. It is skipped when the run
                // recorded no improvement at all.
                if (Extr_2D.size() >= 3u) {
                    const float x2 = Extr_2D.back(); Extr_2D.pop_back();
                    const float x1 = Extr_2D.back(); Extr_2D.pop_back();
                    const float f_val = Extr_2D.back(); Extr_2D.pop_back();

                    textBox4->Text = Convert::ToString(x1);
                    textBox3->Text = Convert::ToString(x2);
                    textBox2->Text = Convert::ToString(f_val);

                    chart2->Series[2]->Points->AddXY(x1, x2);
                }

                // Remaining triples: only the coordinates are plotted. The size
                // check keeps a truncated tail from popping an empty vector.
                while (Extr_2D.size() >= 3u) {
                    const float x2_point = Extr_2D.back(); Extr_2D.pop_back();
                    const float x1_point = Extr_2D.back(); Extr_2D.pop_back();
                    Extr_2D.pop_back(); // function value, not displayed
                    chart2->Series[0]->Points->AddXY(x1_point, x2_point);
                }
            }

            QueryPerformanceCounter(&end);

            LARGE_INTEGER freq;
            QueryPerformanceFrequency(&freq);
            // Scale ticks to microseconds before dividing by the frequency. The
            // former (2^32 / freq) fixed-point factor truncated the ratio and
            // was narrowed to int, which overflows for low counter frequencies.
            const long long elapsed_us = (end.QuadPart - start.QuadPart) * 1'000'000LL / freq.QuadPart;

            textBox5->Text = Convert::ToString(elapsed_us) + " microseconds";
        }
    };
}

MyForm.cpp (как у вас)

cpp
#include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

typedef int  (__cdecl* PInit)(int, float, float, float, float);
typedef void (__cdecl* PStartWorkers)();

// Process entry point for every MPI rank: loads the DLL, initialises it and
// then either shows the form (rank 0) or enters the worker loop (other ranks).
// Returns 1 when the DLL or one of its exports cannot be found, 0 otherwise.
[STAThread]
int main() {
    HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
    if (h == nullptr) {
        return 1; // DLL not found: nothing can be called
    }

    auto AgpInit            = reinterpret_cast<PInit>(GetProcAddress(h, "AgpInit"));
    auto AgpWaitStartAndRun = reinterpret_cast<PStartWorkers>(GetProcAddress(h, "AgpWaitStartAndRun"));
    if (AgpInit == nullptr || AgpWaitStartAndRun == nullptr) {
        // Calling through a null export pointer would crash; bail out instead.
        FreeLibrary(h);
        return 1;
    }

    const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

    if (rank == 0) {
        Application::EnableVisualStyles();
        Application::SetCompatibleTextRenderingDefault(false);
        Application::Run(gcnew TESTAGP::MyForm(h));
    } else {
        AgpWaitStartAndRun();
    }
    return 0;
}

примечания по производительности

Для старта воркеров выбрана неблокирующая рассылка isend (3 коротких сообщения), чтобы UI/ранг-0 не блокировался.
В AGP_2D обмен сделан «рассыпной» (all-to-all) без коллективов — через isend + цикл iprobe/recv до опустошения очереди; так избегаем синхронизаций и «пилообразных» барьеров.
Условие вывода результатов теперь if (rank == 0) — как вы и просили.
При получении terminate выходим мгновенно; ранг-0 формирует буфер для UI.

Если захотите ещё сильнее агрессивно закрыть «окно» между отправкой terminate и выходом у ранга-0, можно перед return сделать короткую «сливку» входящих (while (iprobe(...)) recv(...)) — но практической выгоды обычно нет.

#include "pch.h" #define XOR_RAND(state, result_v...

질문

사고

Modifying MPI Code for 4 Processes

Adjusting Condition in AGP_2D

Modifying for 4 Processes

Adjusting AGP_2D Communication

Implementing All-to-All Communication

Implementing Efficient Message Exchange

Updating AGP_2D Logic

답변

TEST_FUNC.cpp (DLL)

MyForm.h (минимальные правки в комментариях; код совместим как есть)

MyForm.cpp (как у вас)

примечания по производительности

이 Q&A 공유하기