#include "pch.h" #define LCG_RAND(state) ...

Question

#include "pch.h"

/*
 * LCG_RAND(state) - advance a float pseudo-random state in place.
 *
 * `state` must be a modifiable float lvalue; it is read once and written once.
 * For a non-negative input the new state lies in [0, 1).
 *
 * Every line of the body ends with a line-continuation backslash so that the
 * whole do { } while (0) statement belongs to the macro; without them the
 * macro expands to nothing and the loop is left at file scope.
 */
#define LCG_RAND(state) \
    do { \
        float lcg_reduced_ = fmodf((state), 1.618033988749895f); \
        lcg_reduced_ = fmaf(lcg_reduced_, 1.618033988749895f, lcg_reduced_ * 0.5772156649f); \
        lcg_reduced_ -= (float)(int)(lcg_reduced_); /* keep the fractional part */ \
        lcg_reduced_ = fmaf(lcg_reduced_, lcg_reduced_ + 1.0f, 0.0f); \
        lcg_reduced_ -= (float)(int)(lcg_reduced_); /* keep the fractional part */ \
        (state) = lcg_reduced_; \
    } while (0)

/*
 * LCG_RAND_GRSH(state) - same recurrence as LCG_RAND, but the stored result
 * is rescaled with 2*u - 1, so a non-negative input yields a value in [-1, 1).
 *
 * `state` must be a modifiable float lvalue; it is read once and written once.
 * The line-continuation backslashes keep the whole do { } while (0) statement
 * inside the macro definition.
 */
#define LCG_RAND_GRSH(state) \
    do { \
        float lcg_reduced_ = fmodf((state), 1.618033988749895f); \
        lcg_reduced_ = fmaf(lcg_reduced_, 1.618033988749895f, lcg_reduced_ * 0.5772156649f); \
        lcg_reduced_ -= (float)(int)(lcg_reduced_); /* keep the fractional part */ \
        lcg_reduced_ = fmaf(lcg_reduced_, lcg_reduced_ + 1.0f, 0.0f); \
        lcg_reduced_ -= (float)(int)(lcg_reduced_); /* keep the fractional part */ \
        (state) = fmaf(lcg_reduced_, 2.0f, -1.0f); \
    } while (0)

/*
 * FABE13_COS(x, result_var) - low-order polynomial approximation of cos(x).
 *
 * x          : float expression, evaluated once.
 * result_var : modifiable float lvalue that receives the approximation.
 *
 * |x| is reduced with fmodf to [0, 2*pi), mirrored into [0, pi], and then
 * evaluated with 1 - t^2/2 + t^4/24 on [0, pi/2]; the upper half of the
 * range reuses the same polynomial with the sign flipped.
 *
 * The line-continuation backslashes keep the whole do { } while (0) statement
 * inside the macro definition.
 */
#define FABE13_COS(x, result_var) \
    do { \
        float fc_abs_ = fabsf((x)); \
        float fc_red_ = fmodf(fc_abs_, 6.28318530718f); \
        if (fc_red_ > 3.14159265359f) { \
            fc_red_ = 6.28318530718f - fc_red_; /* cos(2*pi - t) == cos(t) */ \
        } \
        if (fc_red_ < 1.57079632679f) { \
            float fc_sq_ = fc_red_ * fc_red_; \
            (result_var) = fmaf(fc_sq_, fmaf(fc_sq_, 0.0416666667f, -0.5f), 1.0f); \
        } else { \
            fc_red_ = 3.14159265359f - fc_red_; /* cos(t) == -cos(pi - t) */ \
            float fc_sq_ = fc_red_ * fc_red_; \
            (result_var) = -fmaf(fc_sq_, fmaf(fc_sq_, 0.0416666667f, -0.5f), 1.0f); \
        } \
    } while (0)

/*
 * FABE13_SINCOS(in, sin_out, cos_out, n) - approximate sin and cos of the
 * first n floats of `in`, writing the results to `sin_out` and `cos_out`.
 *
 * The first (n & ~7) elements are processed eight at a time with AVX/FMA
 * intrinsics using aligned loads and stores, so the three arrays must be
 * 32-byte aligned when n >= 8. The remaining elements use scalar code with
 * the same formulas.
 *
 * Range reduction: q = round(|x| / (2*pi)) is obtained with the
 * "add 1.5 * 2^23, subtract it again" trick, which rounds to NEAREST.
 * The remainder |x| - 2*pi*q therefore lies in [-pi, pi], while the
 * quadrant folding that follows expects [0, 2*pi). Negative remainders are
 * shifted up by 2*pi before folding; without that shift roughly half of all
 * inputs were evaluated far outside the polynomial's valid interval.
 *
 * Polynomials on [0, pi/2]: cos t ~ 1 - t^2/2 + t^4/24, sin t ~ t - t^3/6.
 *
 * The line-continuation backslashes keep the whole do { } while (0) statement
 * inside the macro definition.
 */
#define FABE13_SINCOS(in, sin_out, cos_out, n) \
    do { \
        int i = 0; \
        int limit = (n) & ~7; \
        if ((n) >= 8) { \
            static thread_local __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \
            static thread_local __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \
            static thread_local __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \
            static thread_local __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \
            static thread_local __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \
            static thread_local __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \
            static thread_local __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \
            static thread_local __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \
            static thread_local __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \
            static thread_local __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \
            static thread_local __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \
            while (i < limit) { \
                __m256 vx = _mm256_load_ps(&(in)[i]); \
                __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); /* |x| */ \
                __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \
                q = _mm256_sub_ps(q, BIAS); /* round(|x| / 2pi) */ \
                __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); /* in [-pi, pi] */ \
                /* move negative remainders into [pi, 2pi) */ \
                r = _mm256_add_ps(r, _mm256_and_ps(_mm256_cmp_ps(r, VEC_ZERO, _CMP_LT_OQ), VEC_TWOPI)); \
                __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); /* [0, pi] */ \
                __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); /* [0, pi/2] */ \
                __m256 t2 = _mm256_mul_ps(r2, r2); \
                __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0); \
                __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2); \
                __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \
                __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), \
                                             _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \
                cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip); \
                sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip); \
                _mm256_store_ps(&(cos_out)[i], cosv); \
                _mm256_store_ps(&(sin_out)[i], sinv); \
                i += 8; \
            } \
        } \
        while (i < (n)) { \
            float x = (in)[i]; \
            float ax = fabsf(x); \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
            q -= 12582912.0f; /* round(|x| / 2pi) */ \
            float r = fmaf(-6.28318530718f, q, ax); /* in [-pi, pi] */ \
            if (r < 0.0f) r += 6.28318530718f; /* move into [0, 2pi) */ \
            bool sflip = (r > 3.14159265359f); \
            if (sflip) r = 6.28318530718f - r; \
            bool cflip = (r > 1.57079632679f); \
            if (cflip) r = 3.14159265359f - r; \
            float t2 = r * r; \
            float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f); \
            float s = fmaf(t2, -0.16666666f, 1.0f) * r; \
            (cos_out)[i] = cflip ? -c : c; \
            (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
            ++i; \
        } \
    } while (0)

/*
 * One-dimensional test function built from ten inverse-quadratic wells.
 *
 * x    : evaluation point.
 * seed : initial state of the LCG_RAND generator; the same seed always
 *        produces the same ten wells.
 *
 * Each well consumes three generator draws, in this order: centre
 * (scaled by 10), width factor (scaled by 20, offset 5) and depth offset
 * (scaled by 0.2, offset 1). Returns the negated sum of the reciprocals.
 */
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
    float state = seed;
    float total = 0.0f;

    for (int well = 0; well < 10; ++well) {
        LCG_RAND(state);
        const float offset = fmaf(-state, 10.0f, x);   /* x - 10 * centre */

        LCG_RAND(state);
        const float width_draw = state;

        LCG_RAND(state);
        float denom = fmaf(fmaf(width_draw, 20.0f, 5.0f),
                           offset * offset,
                           fmaf(state, 0.2f, 1.0f));

        /* Keep the magnitude at least FLT_MIN so the division stays finite. */
        denom = copysignf(fmaxf(fabsf(denom), FLT_MIN), denom);
        total -= 1.0f / denom;
    }
    return total;
}

/*
 * Two-dimensional Rastrigin-style test function.
 *
 * Combines the squared radius x1^2 + x2^2 with the FABE13_COS approximation
 * of cos(2*pi*x1) + cos(2*pi*x2), and scales the result by an envelope that
 * decreases linearly with the squared radius.
 */
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
    float cos_a, cos_b;
    const float radius_sq = fmaf(x1, x1, x2 * x2);

    FABE13_COS(6.28318530717958647692f * x1, cos_a);
    FABE13_COS(6.28318530717958647692f * x2, cos_b);

    const float ripple = fmaf(cos_a + cos_b, 10.0f, -14.6f);
    const float envelope = fmaf(-radius_sq, 0.25f, 18.42f);
    return (radius_sq - ripple) * envelope;
}

/*
 * One-dimensional test function: a constant plus 14 sine/cosine harmonics
 * of 2*pi*x with pseudo-random coefficients.
 *
 * x    : evaluation point.
 * seed : initial LCG_RAND state; fixes all coefficients.
 *
 * Every coefficient is 2*u - 1.1 for a generator draw u. The first draw gives
 * the constant term; after that the harmonics are visited from the 14th down
 * to the 1st, each taking one draw for its sine and one for its cosine.
 * The scratch arrays are thread-local and 64-byte aligned because
 * FABE13_SINCOS uses aligned vector loads and stores.
 */
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
    static thread_local __declspec(align(64)) float angles[14u];
    static thread_local __declspec(align(64)) float sin_vals[14u];
    static thread_local __declspec(align(64)) float cos_vals[14u];

    const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
    for (int h = 0; h < 14; ++h) {
        angles[h] = start_angle * (h + 1);
    }
    FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

    float state = seed;
    LCG_RAND(state);
    float res = fmaf(state, 2.0f, -1.1f);

    for (int h = 13; h >= 0; --h) {
        LCG_RAND(state);
        const float sin_draw = state;
        LCG_RAND(state);
        res += fmaf(fmaf(sin_draw, 2.0f, -1.1f), sin_vals[h],
                    fmaf(state, 2.0f, -1.1f) * cos_vals[h]);
    }
    return res;
}

/*
 * Two-dimensional test function assembled from an 8 x 8 grid of terms.
 *
 * x1, x2 : evaluation point.
 * seed   : initial LCG_RAND_GRSH state; fixes all coefficients.
 *
 * For every pair (row, col) the squared sine of (row+1)*pi*x1 and the squared
 * cosine of (col+1)*pi*x2 are weighted by generator draws and accumulated
 * into two sums; the result is minus the Euclidean norm of those sums.
 * Rows are visited from 7 down to 0, columns from 0 up to 7, and the
 * generator is advanced in exactly the order shown - the order determines
 * the coefficients.
 */
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
    static thread_local __declspec(align(64)) float angles_j[8u];
    static thread_local __declspec(align(64)) float angles_k[8u];
    static thread_local __declspec(align(64)) float sin_j[8u], cos_j[8u];
    static thread_local __declspec(align(64)) float sin_k[8u], cos_k[8u];

#pragma loop(ivdep)
    for (int h = 0; h < 8; ++h) {
        const float harmonic = 3.14159265358979323846f * (h + 1);
        angles_j[h] = harmonic * x1;
        angles_k[h] = harmonic * x2;
    }
    FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
    FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

    float acc_a = 0.0f;
    float acc_b = 0.0f;
    float state = seed;

    for (int row = 7; row >= 0; --row) {
        const float sin_sq = sin_j[row] * sin_j[row];
        for (size_t col = 0u; col < 8u; ++col) {
            const float cos_sq = cos_k[col] * cos_k[col];

            LCG_RAND_GRSH(state);
            float side = state;            /* second stream, forked from the main one */
            LCG_RAND_GRSH(state);
            acc_a = fmaf(side, sin_sq, fmaf(state, cos_sq, acc_a));

            LCG_RAND_GRSH(side);
            LCG_RAND_GRSH(state);
            acc_b = fmaf(-side, cos_sq, fmaf(state, sin_sq, acc_b));
        }
    }
    return -sqrtf(fmaf(acc_a, acc_a, acc_b * acc_b));
}

/*
 * Computes the next trial point inside the interval [x1, x2].
 *
 * _m     : current slope estimate (divisor; must be non-zero).
 * x1, x2 : interval end points.
 * y1, y2 : function values at x1 and x2.
 * _N     : problem dimension selector; 1 and 2 have dedicated fast paths.
 * _r     : reliability parameter, used as a multiplier for _N != 1.
 *
 * Returns the interval midpoint shifted towards the end with the smaller
 * function value:
 *   _N == 1 : ((x1 + x2) - (y2 - y1) / _m) / 2
 *   else    : ((x1 + x2) - sign(y2 - y1) * _r * |y2 - y1|^_N / _m^_N) / 2
 *
 * The sign factor is taken from the sign bit of the difference with
 * copysignf (so +0 counts as positive and -0 as negative). The pasted
 * original cast the ADDRESS of `diff` to int instead of reading its bits.
 * The general branch raises |diff| to the power _N, matching the _N == 2
 * branch; powf on a negative base returns NaN for non-integer _N and applies
 * the sign twice for odd _N.
 */
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
    const float y2, const float _N, const float _r) noexcept
{
    const float diff = y2 - y1;
    const float sign_mult = copysignf(1.0f, -diff);   /* -1 for diff >= +0, +1 otherwise */
    const float ends_sum = x1 + x2;

    if (_N == 1.0f) {
        return fmaf(-(1.0f / _m), diff, ends_sum) * 0.5f;
    }
    if (_N == 2.0f) {
        return fmaf(sign_mult / (_m * _m), diff * diff * _r, ends_sum) * 0.5f;
    }
    return fmaf(sign_mult / powf(_m, _N), powf(fabsf(diff), _N) * _r, ends_sum) * 0.5f;
}

/*
 * Bump allocator with one pre-committed memory block per thread.
 *
 * Memory is obtained once per thread with VirtualAlloc and handed out in
 * 64-byte-aligned slices. Nothing is ever returned to the block: there is no
 * per-object free and no reset, so every `new` of a type that routes through
 * this manager permanently consumes space until the process exits.
 *
 * __declspec(noalias) has been dropped from the members that touch the
 * global/thread-specific state, because that attribute promises the function
 * neither reads nor writes such state.
 */
__declspec(align(64)) struct PreallocatedPeanoMemoryManager final {
    /* One thread's arena: [data, end) is committed, `current` is the bump pointer. */
    __declspec(align(64)) struct MemoryBlock final {
        char* __restrict data;
        char* __restrict current;
        const char* __restrict end;

        __declspec(noalias) __forceinline MemoryBlock() noexcept
            : data(nullptr), current(nullptr), end(nullptr)
        {
        }

        /*
         * Commits enough memory for `bytes` of payload: the request is padded
         * by 1 MiB + 62 and rounded down to a 1 MiB multiple, so at least
         * bytes + 63 bytes are available. On failure the block stays empty
         * and every later allocation from it returns nullptr.
         */
        const __forceinline void initialize(const size_t bytes) noexcept
        {
            const size_t aligned_need = (bytes + 1048638) & ~static_cast<size_t>(1048575);
            void* p = VirtualAlloc(nullptr, aligned_need,
                MEM_RESERVE | MEM_COMMIT,
                PAGE_READWRITE);
            if (p == nullptr) {
                data = nullptr;
                current = nullptr;
                end = nullptr;
                return;
            }

            data = reinterpret_cast<char*>(p);
            const uintptr_t base = reinterpret_cast<uintptr_t>(data);
            const uintptr_t aligned = (base + 63) & ~static_cast<uintptr_t>(63);
            current = reinterpret_cast<char*>(aligned);
            /* `end` marks the true end of the committed region so that the
               bounds check below covers everything that is safe to touch. */
            end = data + aligned_need;
        }

        /*
         * Returns a 64-byte-aligned slice of at least alloc_size bytes, or
         * nullptr when the block is empty or the slice would run past `end`.
         */
        __forceinline void* allocate_from_block(const size_t alloc_size) noexcept
        {
            const size_t n = (alloc_size + 63) & ~static_cast<size_t>(63);
            if (current == nullptr || n > static_cast<size_t>(end - current)) {
                return nullptr;
            }
            char* result = current;
            current += n;
            return result;
        }
    };

    tbb::enumerable_thread_specific<MemoryBlock> tls;

    /*
     * Byte budget for a curve of the given depth. The table covers depths
     * 7..13; smaller depths use the depth-7 budget and larger depths are
     * clamped to the last entry instead of indexing outside the table.
     */
    const __declspec(noalias) __forceinline size_t calculate_tree_size_bytes(const int max_depth) const noexcept
    {
        static const size_t bytes[] = {
            1398080u, 5592384u, 22369600u, 89478464u, 357913920u, 1431655744u, 5726623040u
        };
        const int last = static_cast<int>(sizeof(bytes) / sizeof(bytes[0])) - 1;
        int index = max_depth - 7;
        if (index < 0) {
            index = 0;
        }
        else if (index > last) {
            index = last;
        }
        return bytes[index];
    }

    /*
     * Replaces the per-thread blocks with fresh ones sized for max_depth.
     * Each thread's block is created lazily on its first allocation.
     * NOTE(review): blocks created by an earlier call are not released with
     * VirtualFree here - confirm this is only ever called once per process.
     */
    const __forceinline void preallocate_for_depth(const int max_depth) noexcept
    {
        const size_t required_size = calculate_tree_size_bytes(max_depth);
        tls.clear();
        tls = tbb::enumerable_thread_specific<MemoryBlock>(
            [required_size]() {
                MemoryBlock b;
                b.initialize(required_size);
                return b;
            }
        );
    }

    /* Allocates from the calling thread's block; nullptr when it is exhausted. */
    __forceinline void* allocate(const size_t size) noexcept
    {
        return tls.local().allocate_from_block(size);
    }
};

static PreallocatedPeanoMemoryManager peano_memory_manager;

/*
 * One search interval [x1, x2] with the function values at its ends.
 *
 *   delta_y  = y2 - y1
 *   N_factor = (x2 - x1)^(1/N)   (N == 1 and N == 2 use direct forms)
 *   M        = |delta_y| / N_factor
 *   R        = characteristic used as the heap key (see ComparePtr)
 *
 * Objects are allocated from peano_memory_manager and never freed
 * individually (there is no matching operator delete).
 */
__declspec(align(16)) struct Interval final {
    const float x1;
    const float x2;
    const float y1;
    const float y2;
    const float delta_y;
    const float N_factor;
    const float M;
    float R;

    /* Not noalias: the allocator advances shared per-thread state. */
    __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); }

    /*
     * Members are initialised in declaration order, so delta_y and N_factor
     * are already set when M is computed. R starts at 0 so that it never
     * holds an indeterminate value before ChangeCharacteristic is called.
     */
    __declspec(noalias) __forceinline Interval(const float _x1, const float _x2,
        const float _y1, const float _y2, const float _N) noexcept
        : x1(_x1), x2(_x2), y1(_y1), y2(_y2),
        delta_y(_y2 - _y1),
        N_factor(_N == 1.0f ? _x2 - _x1 :
            _N == 2.0f ? sqrtf(_x2 - _x1) :
            powf(_x2 - _x1, 1.0f / _N)),
        M(fabsf(delta_y) * (1.0f / N_factor)),
        R(0.0f)
    {
    }

    /* R = _m * N_factor + delta_y^2 / (_m * N_factor) - 2 * (y1 + y2). */
    __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
    {
        R = fmaf(
            -(y2 + y1),
            2.0f,
            fmaf(_m, N_factor, (delta_y * delta_y) * (1.0f / (_m * N_factor))));
    }
};

/*
 * Heap ordering for Interval pointers: `a` sorts before `b` when its
 * characteristic R is smaller, so the std heap algorithms keep the interval
 * with the largest R at the front.
 */
static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
    return b->R > a->R;
}

/*
 * Orientation tag of a PeanoCurve_2D cell; it selects the order in which the
 * four child quadrants are visited. Stored in one byte.
 * (The stray `const` in front of the enum declared no object and only
 * produced an "ignored qualifier" diagnostic, so it is removed.)
 */
enum List : uint8_t {
    Top = 0b00,
    Down = 0b01,
    Left = 0b10,
    Right = 0b11
};

/*
 * Node of a quadtree that maps a one-dimensional parameter onto the
 * rectangle [a, b] x [c, d] along a space-filling curve.
 *
 *   Type      : orientation of this cell (order in which children are visited)
 *   razvertka : number of subdivision levels below this node
 *   a, b      : range of the first coordinate;  x1 = (a + b) / 2
 *   c, d      : range of the second coordinate; x2 = (c + d) / 2
 *   DownLeft, TopLeft, TopRight, DownRight : child quadrants (nullptr at leaves)
 *
 * Nodes are allocated from peano_memory_manager and never freed individually.
 */
__declspec(align(16)) struct PeanoCurve_2D final {
    const List Type;
    const int razvertka;
    const float a;
    const float b;
    const float c;
    const float d;
    const float x1;
    const float x2;
    PeanoCurve_2D* __restrict DownLeft;
    PeanoCurve_2D* __restrict TopLeft;
    PeanoCurve_2D* __restrict TopRight;
    PeanoCurve_2D* __restrict DownRight;

    /* Not noalias: the allocator advances shared per-thread state. */
    __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); }

    /*
     * Builds this node and, recursively, _razvertka further levels.
     * While more than eight levels remain after this one, the four children
     * are built concurrently with tbb::parallel_invoke; below that they are
     * built sequentially. The lambdas capture by reference, which is safe
     * because parallel_invoke returns only after all four have finished.
     * Not noalias: construction allocates from the global memory manager.
     */
    __forceinline PeanoCurve_2D(
        const List _Type,
        int _razvertka,
        const float _a,
        const float _b,
        const float _c,
        const float _d) noexcept
        : Type(_Type),
        razvertka(_razvertka),
        a(_a),
        b(_b),
        c(_c),
        d(_d),
        x1(fmaf(0.5f, _a, 0.5f * _b)),
        x2(fmaf(0.5f, _c, 0.5f * _d)),
        DownLeft(nullptr),
        TopLeft(nullptr),
        TopRight(nullptr),
        DownRight(nullptr)
    {
        if (_razvertka-- != 0) {
            if (_razvertka > 8) {
                switch (_Type) {
                case Top:
                    tbb::parallel_invoke(
                        [&]() { DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2); },
                        [&]() { TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d); },
                        [&]() { TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d); },
                        [&]() { DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2); }
                    );
                    break;
                case Down:
                    tbb::parallel_invoke(
                        [&]() { TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d); },
                        [&]() { DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2); },
                        [&]() { DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2); },
                        [&]() { TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d); }
                    );
                    break;
                case Right:
                    tbb::parallel_invoke(
                        [&]() { DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2); },
                        [&]() { DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2); },
                        [&]() { TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d); },
                        [&]() { TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d); }
                    );
                    break;
                default:
                    tbb::parallel_invoke(
                        [&]() { TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d); },
                        [&]() { TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d); },
                        [&]() { DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2); },
                        [&]() { DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2); }
                    );
                }
            }
            else {
                switch (_Type) {
                case Top:
                    DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2);
                    TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d);
                    TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d);
                    DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2);
                    break;
                case Down:
                    TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d);
                    DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2);
                    DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2);
                    TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d);
                    break;
                case Right:
                    DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2);
                    DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2);
                    TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d);
                    TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d);
                    break;
                default:
                    TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d);
                    TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d);
                    DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2);
                    DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2);
                }
            }
        }
    }

    /*
     * Descends `razvertka` levels and returns the leaf cell that the curve
     * parameter x (measured on [a, b]) falls into.
     *
     * At level i the remaining offset is split into quarters of width
     * (b - a) / 4^i; the quarter index 0..3 together with the current cell's
     * orientation selects the child. Indices outside 0..2 take the last case.
     */
    const __declspec(noalias) __forceinline PeanoCurve_2D* __restrict HitTest_2D(float x) const noexcept
    {
        int i = 0;
        const int _razvertka = this->razvertka;
        int num;
        const float this_a = this->a;
        x -= this_a;
        const float b_minus_a = this->b - this_a;
        const float inv_b_minus_a = 1.0f / b_minus_a;
        const PeanoCurve_2D* __restrict Curr = this;
        while (i != _razvertka) {
            /* Was `1 << ++i + i`, which reads and modifies i without
               sequencing (undefined behaviour). The intended value is 4^i
               for the incremented i, matching the ldexp term below. */
            ++i;
            const int shift = 1 << (i << 1);
            num = static_cast<int>(shift * x * inv_b_minus_a);
            x = fmaf(-ldexp(1.0f, -(i << 1)) * num, b_minus_a, x);
            const List currType = Curr->Type;
            switch (num) {
            case 0:
                Curr = (currType == Top || currType == Right)
                    ? Curr->DownLeft
                    : Curr->TopRight;
                break;
            case 1:
                Curr = (currType == Top || currType == Left)
                    ? Curr->TopLeft
                    : Curr->DownRight;
                break;
            case 2:
                Curr = (currType == Top || currType == Right)
                    ? Curr->TopRight
                    : Curr->DownLeft;
                break;
            default:
                Curr = (currType == Top || currType == Left)
                    ? Curr->DownRight
                    : Curr->TopLeft;
            }
        }
        return const_cast<PeanoCurve_2D * __restrict>(Curr);
    }

    /*
     * Inverse lookup: returns the curve parameter of the cell that contains
     * the point (target_x1, target_x2).
     *
     * Starting from a, each level adds 0, 1/4, 1/2 or 3/4 of the current
     * cell's parameter width (b - a) / 4^level, depending on the quadrant the
     * point lies in and on the cell's orientation.
     */
    const __declspec(noalias) __forceinline float FindX_2D(const float target_x1, const float target_x2) const noexcept
    {
        int _razvertka = this->razvertka;
        int _razvertka1 = _razvertka;
        float x1, x2, x = this->a;
        const float b_minus_a = this->b - x;
        const PeanoCurve_2D* __restrict Curr = this;
        while (_razvertka != 0) {
            /* 2 * (levels already descended); the counter is decremented afterwards. */
            const int exponent = (_razvertka1 - _razvertka--) << 1;
            x1 = Curr->x1;
            x2 = Curr->x2;
            const List currType = Curr->Type;
            if (target_x1 > x1 && target_x2 > x2) {
                Curr = Curr->TopRight;
                if (currType == Top || currType == Right) {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x);
                }
            }
            else if (target_x1 < x1 && target_x2 > x2) {
                Curr = Curr->TopLeft;
                if (currType == Top || currType == Left) {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x);
                }
                else {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
                }
            }
            else if (target_x1 < x1 && target_x2 < x2) {
                Curr = Curr->DownLeft;
                if (currType == Down || currType == Left) {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x);
                }
            }
            else {
                Curr = Curr->DownRight;
                if (currType == Top || currType == Left) {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
                }
                else {
                    x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x);
                }
            }
        }
        return x;
    }
};

/* Per-thread result log: improvements found by the solver, followed by a two-value trailer. */
static thread_local std::vector<float, tbb::scalable_allocator<float>> Extr;
/* Per-thread max-heap of intervals, ordered by ComparePtr. */
static thread_local std::vector<Interval*, tbb::scalable_allocator<Interval*>> R;
/* MPI environment and communicator; created in AgpInit, destroyed in AgpFinalize. */
static boost::mpi::environment* g_env;
static boost::mpi::communicator* g_world;
/* Raw storage for the root curve node; AgpInit constructs it in place. */
static std::aligned_storage_t<sizeof(PeanoCurve_2D), alignof(PeanoCurve_2D)> curveStorage;
static PeanoCurve_2D& Curve = reinterpret_cast<PeanoCurve_2D&>(curveStorage);
/* Result of the most recent iprobe (the template argument was lost in the paste). */
static boost::optional<boost::mpi::status> probe_status;

/*
 * One-time start-up of the library.
 *
 * Reserves allocator memory for a curve of depth peanoLevel, creates the MPI
 * environment and communicator, and constructs the root curve node in place
 * over [a, b] x [c, d]. Rank 0 builds the curve with orientation Top, every
 * other rank with orientation Down.
 *
 * Returns the MPI rank of the calling process.
 */
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
    peano_memory_manager.preallocate_for_depth(peanoLevel);
    g_env = ::new boost::mpi::environment();
    g_world = ::new boost::mpi::communicator();

    const int rank = g_world->rank();
    const List root_type = (rank != 0) ? List::Down : List::Top;
    ::new (&Curve) PeanoCurve_2D(root_type, peanoLevel, a, b, c, d);
    return rank;
}

/*
 * Shuts down MPI: destroys the communicator first, then the environment.
 *
 * Both globals are reset to nullptr afterwards so that a repeated call is a
 * harmless no-op (deleting nullptr does nothing) and no dangling pointer is
 * left behind. __declspec(noalias) was removed because the function modifies
 * global state.
 */
extern "C" __declspec(dllexport) __forceinline void AgpFinalize() noexcept
{
    delete g_world;
    g_world = nullptr;
    delete g_env;
    g_env = nullptr;
}

/*
 * Message exchanged between the two partner ranks: the 2-D coordinates of
 * the start and end points of the sender's best interval and that interval's
 * characteristic. Field order matters - the struct is brace-initialised
 * positionally and serialised in this order.
 */
__declspec(align(16)) struct CrossMsg final {
    float s_x1;
    float s_x2;
    float e_x1;
    float e_x2;
    float Rtop;

    /* Boost.Serialization hook: streams every field, in declaration order. */
    template <typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& __restrict const ar, const int) noexcept
    {
        ar & s_x1;
        ar & s_x2;
        ar & e_x1;
        ar & e_x2;
        ar & Rtop;
    }
};

extern "C" __declspec(dllexport) __declspec(noalias)
void Base_LNA_1_2_Mer_AGP(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;
float divider = ldexpf(1.0f, (Curve.razvertka << 1) + 1);
float inv_divider = 1.0f / divider;
float x_addition = (b - a) * inv_divider;
float y_addition = (d - c) * inv_divider;
float true_start = a + x_addition;
float true_end = b - x_addition;
float x_Rmax_1, x_Rmax_2;
float initial_length, dmax, threshold_03, inv_threshold_03;
float start_val, best_f, y_Rmax_1, y_Rmax_2;
float Mmax, m;
int schetchick = 0, mcQueenSpeed = 1;
float new_point, new_value;
const PeanoCurve_2D* __restrict pc;
float new_x1, new_x2;
Interval* __restrict promejutochny_otrezok;
Interval* __restrict curr;
Interval* __restrict curr1;
float currM, len1, len2, len_item;
size_t r_size;
float progress, alpha, betta, MULTIPLIER, global_coeff, GLOBAL_FACTOR;
Interval* __restrict top_ptr;
float interval_len;
int dummy;
const PeanoCurve_2D* __restrict p1L;
const PeanoCurve_2D* __restrict p2L;
CrossMsg outbound, inbound{};
Interval* __restrict injected;
float cooling;
int T;
float k;
int tag;

text
if (N == 1.0f) {
	initial_length = b - a;
	dmax = initial_length;
	threshold_03 = 0.3f * initial_length;
	inv_threshold_03 = 1.0f / threshold_03;
	start_val = ShekelFunc(a, seed);
	best_f = ShekelFunc(b, seed);
	x_Rmax_1 = a;
	x_Rmax_2 = b;
	y_Rmax_1 = start_val;
	y_Rmax_2 = best_f;
	Extr.reserve(static_cast<size_t>(global_iterations) * 4u);
	Extr.clear();
	R.reserve(static_cast<size_t>(global_iterations) * 2u);
	R.clear();
	R.emplace_back(new Interval(a, b, start_val, best_f, N));
	std::push_heap(R.begin(), R.end(), ComparePtr);
	Mmax = R.front()->M;
	m = r * Mmax;
	while (true) {
		new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);
		new_value = ShekelFunc(new_point, seed);
		if (new_value < best_f) {
			best_f = new_value;
			Extr.emplace_back(best_f);
			Extr.emplace_back(new_point);
		}
		std::pop_heap(R.begin(), R.end(), ComparePtr);
		promejutochny_otrezok = R.back();
		R.pop_back();
		new_x1 = promejutochny_otrezok->x1;
		new_x2 = promejutochny_otrezok->x2;
		len2 = new_x2 - new_point;
		len1 = new_point - new_x1;
		interval_len = (len1 < len2) ? len1 : len2;
		if (++schetchick == static_cast<int>(global_iterations) || interval_len < epsilon) {
			if (rank == 0) {
				Extr.emplace_back(static_cast<float>(schetchick));
				Extr.emplace_back(interval_len);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}
		curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, N);
		curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, N);
		currM = (curr->M > curr1->M) ? curr->M : curr1->M;
		r_size = R.size();
		if ((len2 + len1) == dmax) {
			dmax = (len2 > len1) ? len2 : len1;
			size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
else {
x_Rmax_1 = true_start;
x_Rmax_2 = true_end;
initial_length = true_end - true_start;
dmax = initial_length;
threshold_03 = 0.3f * initial_length;
inv_threshold_03 = 1.0f / threshold_03;
start_val = rank ? RastriginFunc(true_end, d - y_addition)
: RastriginFunc(true_start, c + y_addition);
y_Rmax_1 = start_val;
best_f = rank ? RastriginFunc(true_start, d - y_addition)
: RastriginFunc(true_end, c + y_addition);
y_Rmax_2 = best_f;
Extr.reserve(static_cast<size_t>(global_iterations) * 4u);
Extr.clear();
R.reserve(static_cast<size_t>(global_iterations) * 2u);
R.clear();
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N));
Mmax = R.front()->M;
m = r * Mmax;
while (true) {
if (++schetchick == static_cast<int>(global_iterations)) {
g_world->send(partner, 2, 0);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
cooling = ldexpf(1.0f, -(1.0f / 138.63f) * schetchick);
T = static_cast<int>(fmaf(20.0f, cooling, 10.0f));
k = fmaf(0.2f, cooling, 0.7f);
if (schetchick % (T - 1) == 0) {
probe_status = g_world->iprobe(partner, boost::mpi::any_tag);
if (probe_status) {
tag = probe_status->tag();
if (tag == 0) {
g_world->recv(partner, 0, mcQueenSpeed);
}
else if (tag == 2) {
g_world->recv(partner, 2, dummy);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
}
}
if (!(schetchick % T) == mcQueenSpeed) {
top_ptr = R.front();
p1L = Curve.HitTest_2D(top_ptr->x1);
p2L = Curve.HitTest_2D(top_ptr->x2);
outbound = CrossMsg{ p1L->x1, p1L->x2, p2L->x1, p2L->x2, top_ptr->R };
if (mcQueenSpeed) {
g_world->send(partner, 0, 0);
}
else {
mcQueenSpeed = 1;
}
g_world->send(partner, 1, outbound);
g_world->recv(partner, 1, inbound);
injected = new Interval(Curve.FindX_2D(inbound.s_x1, inbound.s_x2), Curve.FindX_2D(inbound.e_x1, inbound.e_x2), RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), N);
injected->R = inbound.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);
pc = Curve.HitTest_2D(new_point);
new_x1 = pc->x1;
new_x2 = pc->x2;
new_value = RastriginFunc(new_x1, new_x2);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_x1);
Extr.emplace_back(new_x2);
}
std::pop_heap(R.begin(), R.end(), ComparePtr);
promejutochny_otrezok = R.back();
R.pop_back();
curr = new Interval(promejutochny_otrezok->x1, new_point, promejutochny_otrezok->y1, new_value, N);
curr1 = new Interval(new_point, promejutochny_otrezok->x2, new_value, promejutochny_otrezok->y2, N);
currM = (curr->M > curr1->M) ? curr->M : curr1->M;
len2 = promejutochny_otrezok->x2 - new_point;
len1 = new_point - promejutochny_otrezok->x1;
r_size = R.size();
if ((len2 + len1) == dmax) {
dmax = (len2 > len1) ? len2 : len1;
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
interval_len = top_ptr->x2 - top_ptr->x1;
if (interval_len < epsilon) {
g_world->send(partner, 2, 0);
if (partner) {
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
}

/*
 * Releases a buffer with CoTaskMemFree. Intended for the result buffer that
 * Base_LNA_1_2_Mer_AGP returns through out_data, which is obtained with
 * CoTaskMemAlloc.
 */
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void Base_LNA_1_2_Mer_AGP_Free(float* p) noexcept
{
CoTaskMemFree(p);
}

/*
 * Sends the integer 1 with message tag 0 to rank 1. AgpWaitStartAndRun on
 * the receiving side treats the value 1 as "run the solver".
 * The destination rank is hard-coded to 1.
 */
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->send(1, 0, 1);
}

/*
 * Sends the integer 2 with message tag 0 to rank 1. AgpWaitStartAndRun on
 * the receiving side treats the value 2 as "leave the wait loop".
 * The destination rank is hard-coded to 1.
 */
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStopWorkers() noexcept
{
g_world->send(1, 0, 2);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int command = 0;
while (true) {
probe_status = g_world->iprobe(0, 0);
if (probe_status) {
g_world->recv(0, 0, command);
}
if (command == 1) {
float* __restrict dummy = nullptr;
size_t dummy_len = 0u;
Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
2.5f, false, 0.001f, GetTickCount(),
&dummy, &dummy_len);
}
else if (command == 2) {
break;
}
}
return;
} в этом коде у меня сейчас проблема не с mpi коммуникацией а в том что при первом нажатии на кнопку всё срабатывает нормально но затем происходит что-то странное и значения кажутся некорректными - мусорорм - думаю это из-за того что нужно обнулять указатели статические после использования - возвращать состояние к корректному как при первом запуске, напоминаю код в MyForm.cpp #include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

// Function-pointer types for the entry points resolved from the DLL with GetProcAddress.
// PInit: (int, float, float, float, float) -> int; used for "AgpInit".
typedef int(__cdecl* PInit)(int, float, float, float, float);
// PFin: no arguments, no result; used for "AgpFinalize".
typedef void(__cdecl* PFin)();
// PStartWorkers: no arguments, no result; used for "AgpWaitStartAndRun".
typedef void(__cdecl* PStartWorkers)();

// Host entry point, started once per MPI rank.
// Loads TEST_FUNC.dll, resolves its entry points and calls AgpInit.
// Rank 0 then runs the Windows Forms UI; every other rank waits for commands
// in AgpWaitStartAndRun. Both paths end by calling AgpFinalize and unloading
// the DLL.
// Returns 0 on success and 1 when the DLL or one of its exports is missing
// (previously a null function pointer would have been called).
[STAThread]
int main() {
    HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
    if (h == nullptr) {
        return 1;
    }

    auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
    auto AgpFinalize = (PFin)GetProcAddress(h, "AgpFinalize");
    auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
    if (AgpInit == nullptr || AgpFinalize == nullptr || AgpWaitStartAndRun == nullptr) {
        FreeLibrary(h);
        return 1;
    }

    const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

    if (rank == 0) {
        Application::EnableVisualStyles();
        Application::SetCompatibleTextRenderingDefault(false);
        Application::Run(gcnew TESTAGP::MyForm(h));
    }
    else {
        AgpWaitStartAndRun();
    }

    AgpFinalize();
    FreeLibrary(h);
    return 0;
}
код в MyForm.h
ref class MyForm : public System::Windows::Forms::Form {
public:
// Builds the form and resolves the DLL exports the form calls later.
// hLib: module handle of the already loaded DLL (main passes its own handle in).
// GetProcAddress results are stored as-is; nothing here checks them for nullptr.
MyForm(HMODULE hLib) : hLib(hLib) {
    InitializeComponent();

    // Worker control exports.
    pStart = reinterpret_cast<start_workers>(GetProcAddress(hLib, "AgpStartWorkers"));
    pStop = reinterpret_cast<stop_workers>(GetProcAddress(hLib, "AgpStopWorkers"));

    // Search routine and the routine that releases its result buffer.
    f = reinterpret_cast<agp_c>(GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP"));
    pFree = reinterpret_cast<free_agp>(GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP_Free"));
}

text
protected:
	// Calls the stop export (when it was resolved) and disposes the
	// designer-owned component container.
	~MyForm() {
		// pStop keeps its nullptr initializer when GetProcAddress did not find
		// "AgpStopWorkers"; calling through it would crash during form teardown.
		if (pStop != nullptr) {
			pStop();
		}
		delete components;
	}

private:
	HINSTANCE hLib = nullptr;
	typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*);
	typedef void(__cdecl* start_workers)();
	typedef void(__cdecl* free_agp)(float*);
	typedef void(__cdecl* stop_workers)();

	agp_c f = nullptr;
	start_workers pStart = nullptr;
	free_agp pFree = nullptr;
	stop_workers pStop = nullptr;

	System::Windows::Forms::Button^ button1;
	System::Windows::Forms::TextBox^ textBox2;
	System::Windows::Forms::DataVisualization::Charting::Chart^ chart2;
	System::Windows::Forms::Label^ label2;
	System::Windows::Forms::Label^ label3;
	System::Windows::Forms::TextBox^ textBox1;
	System::Windows::Forms::TextBox^ textBox3;
	System::Windows::Forms::TextBox^ textBox4;
	System::Windows::Forms::TextBox^ textBox5;
	System::Windows::Forms::TextBox^ textBox6;
	System::Windows::Forms::Label^ label6;
	System::Windows::Forms::Label^ label7;
	System::Windows::Forms::Label^ label8;
	System::Windows::Forms::Label^ label9;
	System::Windows::Forms::Label^ label10;
	System::Windows::Forms::Label^ label1;
	System::Windows::Forms::TextBox^ textBox7;
	System::Windows::Forms::TextBox^ textBox8;
	System::ComponentModel::Container^ components;

	System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) {
		chart2->Series[0]->Points->Clear();
		chart2->Series[1]->Points->Clear();
		chart2->Series[2]->Points->Clear();
		chart2->Series[3]->Points->Clear();

		static LARGE_INTEGER start, end;
		QueryPerformanceCounter(&start);

		float* buf = nullptr;
		size_t len = 0u;

		pStart();  // разбудили rank != 0
		f(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &buf, &len);

		if (buf != nullptr) {
			std::vector<float> Extr_2D(buf, buf + len);
			pFree(buf);

			// Для 2D ветки: извлекаем schetchick и interval_len
			textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick
			Extr_2D.pop_back();
			textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len
			Extr_2D.pop_back();

			// Извлекаем последнюю тройку (x2, x1, f)
			float x2 = Extr_2D.back();
			Extr_2D.pop_back();
			float x1 = Extr_2D.back();
			Extr_2D.pop_back();
			float f_val = Extr_2D.back();
			Extr_2D.pop_back();

			// Выводим координаты и значение функции
			textBox4->Text = Convert::ToString(x1);
			textBox3->Text = Convert::ToString(x2);
			textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции

			// Отображаем точку на графике
			chart2->Series[2]->Points->AddXY(x1, x2);

			// Обрабатываем остальные точки (в порядке: f, x1, x2)
			while (!Extr_2D.empty()) {
				float x2_point = Extr_2D.back();
				Extr_2D.pop_back();
				float x1_point = Extr_2D.back();
				Extr_2D.pop_back();
				float f_point = Extr_2D.back();
				Extr_2D.pop_back();

				// Добавляем точку на график (только координаты)
				chart2->Series[0]->Points->AddXY(x1_point, x2_point);
			}
		}

		QueryPerformanceCounter(&end);

		LARGE_INTEGER freq;
		QueryPerformanceFrequency(&freq);
		const int multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL;

		textBox5->Text = Convert::ToString(
			((end.QuadPart - start.QuadPart) * multiplier) >> 32
		) + " microseconds";

Answer 1

Похоже, «мусор» после второго нажатия прилетает не из-за MPI, а из-за вашего собственного аллокатора и общего глобального состояния между запусками.

Коротко, что происходит:

Вы используете один общий пул peano_memory_manager и для построения дерева Пеано (PeanoCurve_2D), и для рабочих интервалов (Interval).
Пул – это тупой bump-allocator: allocate_from_block() только сдвигает current, границ не проверяет и никогда не откатывается.
Память для пула выделяется один раз в AgpInit() и больше не трогается. На первом запуске вы израсходовали кусок на дерево и интервалы, на втором запуске начинаете выделять дальше (над областью, где уже ничего не гарантируется). Отсюда – странные/рандомные значения.
Плюс, в воркере (ранг ≠ 0) вы получаете из Base_LNA_1_2_Mer_AGP буфер dummy, но не освобождаете его — при каждом «Start» утекает память. Логику это сразу не «ломает», но усугубляет поведение.

Ниже — минимальные правки, которые возвращают состояние «как при первом запуске», но не трогают память с деревом.

Что именно правим

1) Добавляем «метку» после построения дерева и сброс к этой метке перед каждым запуском

Идея: запоминаем high-water-mark у каждого MemoryBlock после строительства дерева; перед каждым Base_LNA_1_2_Mer_AGP() откатываем current к этой метке (то есть заново используем область для интервалов), не трогая память с узлами дерева.

cpp
// В вашем MemoryBlock:
// Pointer-advance allocator over a single VirtualAlloc region.
// Allocations only move `current` forward; nothing is freed individually.
// mark_tree_done()/reset_to_mark() save a position and later rewind to it, so
// everything allocated after the mark can be reused while everything before
// it stays untouched.
struct MemoryBlock final {
    char* data;             // address returned by VirtualAlloc (nullptr on failure)
    char* current;          // next byte handed out by allocate_from_block
    const char* end;        // one past the last usable byte
    char* mark_after_tree;  // position saved by mark_tree_done()

    // Reserves and commits the region and places `current` on a 64-byte boundary.
    // bytes: usable capacity requested; the region itself is rounded up to a
    // 1 MiB multiple with room for the 64-byte alignment slack.
    // On VirtualAlloc failure every pointer is left null, so later allocations
    // report failure instead of handing out addresses near zero.
    __forceinline void initialize(size_t bytes) noexcept {
        const size_t aligned_need = (bytes + 1048638) & ~static_cast<size_t>(1048575);
        void* p = VirtualAlloc(nullptr, aligned_need, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
        data = reinterpret_cast<char*>(p);
        if (data == nullptr) {
            current = nullptr;
            end = nullptr;
            mark_after_tree = nullptr;
            return;
        }
        const uintptr_t base = reinterpret_cast<uintptr_t>(data);
        const uintptr_t aligned = (base + 63) & ~static_cast<uintptr_t>(63);
        current = reinterpret_cast<char*>(aligned);
        end = current + bytes;
        mark_after_tree = current; // until mark_tree_done() is called, the mark is the start
    }

    // Remembers the current position as the rewind target.
    __forceinline void mark_tree_done() noexcept {
        mark_after_tree = current;
    }

    // Rewinds to the remembered position; memory below the mark is not touched.
    __forceinline void reset_to_mark() noexcept {
        current = mark_after_tree;
    }

    // Hands out `alloc_size` bytes rounded up to a multiple of 64.
    // Returns nullptr (after a debugger break in Debug builds) when the block
    // was never initialized or the request does not fit; `current` is left
    // unchanged in that case, so the block never runs past `end`.
    __forceinline void* allocate_from_block(size_t alloc_size) noexcept {
        const size_t n = (alloc_size + 63) & ~static_cast<size_t>(63);
        if (current == nullptr || n > static_cast<size_t>(end - current)) {
            #ifdef _DEBUG
            __debugbreak();
            #endif
            return nullptr;
        }
        char* result = current;
        current += n;
        return result;
    }
};

В менеджере добавим удобные обёртки:

cpp
struct PreallocatedPeanoMemoryManager {
    tbb::enumerable_thread_specific<MemoryBlock> tls;

    void mark_tree_done_for_current_thread() noexcept {
        tls.local().mark_tree_done();
    }
    void reset_for_current_thread() noexcept {
        tls.local().reset_to_mark();
    }
};

Где вызывать:

В AgpInit(...) — после ::new (&Curve) PeanoCurve_2D(...):

cpp
// Дерево построено -> для текущего (главного) потока фиксируем метку.
peano_memory_manager.mark_tree_done_for_current_thread();

Не переживайте насчёт TBB: узлы дерева действительно создавались в разных потоках, но интервалы вы создаёте в том же потоке, где зовёте Base_LNA_1_2_Mer_AGP (ранг 0). Нам важно откатывать именно локальный MemoryBlock этого потока, чтобы итерировать область «под интервалы». Память других TLS-блоков, на которых строились части дерева, мы не трогаем.

В самом начале Base_LNA_1_2_Mer_AGP(...):

cpp
// Перед каждым запуском алгоритма очищаем «рабочую» арену интервалов:
peano_memory_manager.reset_for_current_thread();

// А эти вектора у вас уже очищаются .clear(), что нормально:
Extr.clear();
R.clear();

Этого достаточно, чтобы каждый запуск начинался в «чистой» зоне для Interval, не повреждая дерево.

Альтернатива (тоже рабочая): дать Interval свой отдельный аллокатор (второй глобальный менеджер или просто обычный new). Это изолирует «временные» объекты от дерева вообще. Если захотите — подскажу, как быстро вынести Interval::operator new на второй пул/на tbb::scalable_allocator.

2) Починить утечку в воркере

В AgpWaitStartAndRun() вы не освобождаете буфер, возвращаемый из Base_LNA_1_2_Mer_AGP. Нужно сразу звать экспорт Base_LNA_1_2_Mer_AGP_Free:

cpp
else if (command == 1) {
    float* __restrict dummy = nullptr;
    size_t dummy_len = 0;
    Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
                         2.5f, false, 0.001f, GetTickCount(),
                         &dummy, &dummy_len);
    if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy);  // <-- обязательно
}

Без этого при каждом старте ранга ≠ 0 утекает кусок памяти.

3) Небольшие, но полезные точечные фиксы

В 2D-ветке после первого R.emplace_back(...) хорошо бы сразу привести R к куче (для единообразия):
```cpp
R.clear();
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N));
std::push_heap(R.begin(), R.end(), ComparePtr); // optional; harmless with a single element
```
(С одним элементом поведение и так корректное, но это уберёт риск, если кто-то позже вставит код между emplace_back и первым pop_heap.)
В условии обменов есть неочевидный приоритет !:
```cpp
// Before:
// NOTE(review): `!` binds tighter than `==`, so this compares the boolean
// "(schetchick % T) is zero" with mcQueenSpeed.
if (!(schetchick % T) == mcQueenSpeed) {
// What is usually intended:
// NOTE(review): this compares the remainder itself with mcQueenSpeed and is
// not equivalent to the form above — confirm which condition is wanted.
if ((schetchick % T) != mcQueenSpeed) {
}
```
Если логика у вас именно «раз в T шагов при несоответствии скорости», перепишите как выше. С нынешним вариантом легко попасть в «дребезг» условий.

Shag(): вы читаете биты diff через *(const int*)&diff — это UB по strict-aliasing. Безопаснее:

cpp
// Take the raw bit pattern of `diff` without pointer punning.
// _mm_castps_si128 + _mm_cvtsi128_si32 reinterpret the bits; _mm_cvtss_si32
// would instead convert the float's VALUE to an integer, which does not yield
// the IEEE sign bit.
const uint32_t sign = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(_mm_set_ss(diff)))) & 0x80000000u;
// 0x3F800000 is the bit pattern of 1.0f; OR-ing in the flipped sign bit gives
// +1.0f or -1.0f with the sign opposite to that of `diff`.
const uint32_t bits = 0x3F800000u | (sign ^ 0x80000000u);
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(static_cast<int>(bits))));

4) В UI всё ок, но порядок распаковки оставьте как есть

Вы правильно обнуляете график и освобождаете буфер через pFree(buf) после копирования в std::vector. В 2D-ветке порядок у вас f, x1, x2 — вы читаете задом-наперед, это корректно.

Что это даёт

Каждое нажатие «Старт» начинает расчёт с того же состояния рабочей памяти (интервалы создаём поверх «чистой» арены), дерево остаётся неизменным.
Устраняется накопительный эффект от bump-allocator’а и связанные «рандомные» значения.
Воркеры не текут по памяти.

Если хотите самый простой (и радикально безопасный) вариант — уберите перегруженный operator new у Interval и дайте интервалам обычные new/delete (или tbb::scalable_allocator). Тогда никаких сбросов вообще не нужно — пул останется только под дерево, и он неизменяем между запусками. Но показанный выше «mark/reset» даёт те же гарантии с минимальными правками.

#include "pch.h" #define LCG_RAND(state) ...

问题

思考

Identifying memory and calculation issues

Addressing memory allocation issues

Addressing memory safety and thread handling

Handling memory management and allocations

Proposing improvements for memory allocation strategy

Analyzing code behavior and memory management

Investigating memory and thread allocation