#include "pch.h" #define LCG_RAND(state) \ do { \ ...

Question

#include "pch.h"
#define LCG_RAND(state)
do {
float reduced = fmodf((state), 1.618033988749895f);
reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f);
reduced -= (float)(int)(reduced);
reduced = fmaf(reduced, reduced + 1.0f, 0.0f);
reduced -= (float)(int)(reduced);
(state) = reduced;
} while(0)
#define LCG_RAND_GRSH(state)
do {
float reduced = fmodf((state), 1.618033988749895f);
reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f);
reduced -= (float)(int)(reduced);
reduced = fmaf(reduced, reduced + 1.0f, 0.0f);
reduced -= (float)(int)(reduced);
(state) = fmaf(reduced, 2.0f, -1.0f);
} while(0)
#define FABE13_COS(x, result_var)
do {
float abs_val = fabsf((x));
float reduced = fmodf(abs_val, 6.28318530718f);
if (reduced > 3.14159265359f) {
reduced = 6.28318530718f -reduced;
}
if (reduced < 1.57079632679f) {
float val2 = reduced * reduced;
result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
reduced = 3.14159265359f -reduced;
float val2 = reduced * reduced;
result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
} while(0)
#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
int limit = (n) & ~15;
if ((n) >= 16) {
const __m512 VEC_TWOPI = _mm512_set1_ps(6.28318530718f);
const __m512 VEC_PI = _mm512_set1_ps(3.14159265359f);
const __m512 VEC_PI_2 = _mm512_set1_ps(1.57079632679f);
const __m512 INV_TWOPI = _mm512_set1_ps(0.15915494309189535f);
const __m512 BIAS = _mm512_set1_ps(12582912.0f);
const __m512 VEC_COS_P3 = _mm512_set1_ps(0.0416666667f);
const __m512 VEC_COS_P1 = _mm512_set1_ps(-0.5f);
const __m512 VEC_COS_P0 = _mm512_set1_ps(1.0f);
const __m512 VEC_SIN_P1 = _mm512_set1_ps(-0.16666666f);
const __m512 VEC_SIN_P0 = _mm512_set1_ps(1.0f);
const __m512 VEC_NEG1 = _mm512_set1_ps(-1.0f);
const __m512 VEC_ZERO = _mm512_setzero_ps();
while (i < limit) {
__m512 vx = _mm512_load_ps(&(in)[i]);
__m512 vax = _mm512_abs_ps(vx);
__m512 q = _mm512_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm512_sub_ps(q, BIAS);
__m512 r = _mm512_fnmadd_ps(VEC_TWOPI, q, vax);
__m512 r1 = _mm512_min_ps(r, _mm512_sub_ps(VEC_TWOPI, r));
__m512 r2 = _mm512_min_ps(r1, _mm512_sub_ps(VEC_PI, r1));
__mmask16 m_pi = _mm512_cmp_ps_mask(r, VEC_PI, _CMP_GT_OQ);
__mmask16 m_pi_2 = _mm512_cmp_ps_mask(r1, VEC_PI_2, _CMP_GT_OQ);
__mmask16 m_negx = _mm512_cmp_ps_mask(vx, VEC_ZERO, _CMP_LT_OQ);
__mmask16 m_sinflip = _kxnor_mask16(m_negx, m_pi);
__m512 t2 = _mm512_mul_ps(r2, r2);
__m512 vcos = _mm512_fmadd_ps(t2, _mm512_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);
__m512 vsin = _mm512_mul_ps(_mm512_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);
vcos = _mm512_mask_mul_ps(vcos, m_pi_2, vcos, VEC_NEG1);
vsin = _mm512_mask_mul_ps(vsin, m_sinflip, vsin, VEC_NEG1);
_mm512_store_ps(&(cos_out)[i], vcos);
_mm512_store_ps(&(sin_out)[i], vsin);
i += 16;
}
}
while (i < (n)) {
float x = (in)[i];
float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
bool sflip = (r > 3.14159265359f);
if (sflip) r = 6.28318530718f - r;
bool cflip = (r > 1.57079632679f);
if (cflip) r = 3.14159265359f - r;
float t2 = r * r;
float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);
float s = fmaf(t2, -0.16666666f, 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while(0)

static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
uint64_t i = 0u;
float current_state = seed, current_res, res = 0.0f;

text
while (i < 10u) {
	LCG_RAND(current_state);
	const float x_part = fmaf(-current_state, 10.0f, x);
	LCG_RAND(current_state);
	current_res = current_state;
	LCG_RAND(current_state);
	float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f),
		x_part * x_part,
		fmaf(current_state, 0.2f, 1.0f));
	delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
	res -= 1.0f / delimiter;
	++i;
}

return res;

}

static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int64_t j = 0;
__declspec(align(64u)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;

#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * (j + 1);
++j;
}

text
__declspec(align(64u)) float sin_vals[14u];
__declspec(align(64u)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

float current_state = seed;
LCG_RAND(current_state);
float res = fmaf(current_state, 2.0f, -1.1f);

--j;
while (j >= 0) {
	LCG_RAND(current_state);
	float tmp_state = current_state;
	LCG_RAND(current_state);
	res += fmaf(fmaf(tmp_state, 2.0f, -1.1f), sin_vals[j],
		fmaf(current_state, 2.0f, -1.1f) * cos_vals[j]);
	--j;
}

return res;

}

static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int64_t j = 0;
__declspec(align(64u)) float angles_j[8u];
__declspec(align(64u)) float angles_k[8u];

#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * (j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}

text
__declspec(align(64u)) float sin_j[8u], cos_j[8u];
__declspec(align(64u)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed;
while (j >= 0) {
	size_t k = 0u;
	while (k < 8u) {
		const float sin_term = sin_j[j] * sin_j[j];
		const float cos_term = cos_k[k] * cos_k[k];

		LCG_RAND_GRSH(current_state);
		float tmp_state = current_state;
		LCG_RAND_GRSH(current_state);
		part1 = fmaf(tmp_state, sin_term,
			fmaf(current_state, cos_term, part1));
		LCG_RAND_GRSH(tmp_state);
		LCG_RAND_GRSH(current_state);
		part2 = fmaf(-tmp_state, cos_term,
			fmaf(current_state, sin_term, part2));
		++k;
	}
	--j;
}

return -sqrtf(fmaf(part1, part1, part2 * part2));

}

static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const uint64_t _N, const float _r) noexcept
{
const float diff = y2 - y1;
return _N == 1u
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2u
? diff > 0.0f
? fmaf(-(1.0f / (_m * _m)), (diff) * (diff)_r, x1 + x2) * 0.5f
: fmaf(1.0f / (_m * _m), (diff) * (diff)_r, x1 + x2) * 0.5f
: diff > 0.0f
? fmaf(-(1.0f / powf(_m, _N)), powf(diff, _N) * _r, x1 + x2) * 0.5f
: fmaf(1.0f / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}

__declspec(align(64u)) struct Interval final {

public:
const float* x1;
const float* x2;
const float* y1;
const float* y2;
const float* delta_y;
const float* N_factor;
const float* M;
float* R;

text
__declspec(noalias) __forceinline Interval(const float* const _x1, const float* const _x2,
	const float* const _y1, const float* const _y2, const float* const _N) noexcept
{
	x1 = _x1;
	x2 = _x2;
	y1 = _y1;
	y2 = _y2;
	delta_y = new float(*y2 - *y1);
	N_factor = new float(*_N == 1.0f ? *_x2 - *_x1 :
		*_N == 2.0f ? sqrtf(*_x2 - *_x1) :
		powf(*_x2 - *_x1, 1.0f / *_N));
	M = new float(fabsf(*delta_y) * (1.0f / *N_factor));
	R = new float(0.0f);
}

__declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
{
	*R = fmaf(
		-(*y2 + *y1),
		2.0f,
		fmaf(_m, *N_factor, (*delta_y * *delta_y) * (1.0f / (_m * *N_factor))));

}

};

static __declspec(noalias) __forceinline bool ComparePtr(const std::unique_ptr<Interval>& __restrict const a, const std::unique_ptr<Interval>& __restrict const b) noexcept
{
return a->R < b->R;
}

const __declspec(noalias) __forceinline PeanoCurve_2D* PeanoCurve_2D::HitTest_2D(float x) const noexcept
{
int i = 0;
const int _razvertka = this->razvertka;
int num;
const float this_a = this->a;
x -= this_a;
const float b_minus_a = this->b - this_a;
const float inv_b_minus_a = 1.0f / b_minus_a;
const PeanoCurve_2D* Curr = this;

text
while (i != _razvertka) {
	const int shift = 1u << ++i + i;
	num = shift * x * inv_b_minus_a;
	x = fmaf(-ldexp(1.0f, -(i << 1)) * num, b_minus_a, x);
	const List currType = Curr->Type;

	switch (num) {
	case 0u:
		Curr = (currType == Top || currType == Right)
			? Curr->DownLeft.get()
			: Curr->TopRight.get();
		break;
	case 1u:
		Curr = (currType == Top || currType == Left)
			? Curr->TopLeft.get()
			: Curr->DownRight.get();
		break;
	case 2u:
		Curr = (currType == Top || currType == Right)
			? Curr->TopRight.get()
			: Curr->DownLeft.get();
		break;
	default:
		Curr = (currType == Top || currType == Left)
			? Curr->DownRight.get()
			: Curr->TopLeft.get();
	}
}

return Curr;

}

const __declspec(noalias) __forceinline float PeanoCurve_2D::FindX_2D(const float target_x1, const float target_x2) const noexcept
{
int _razvertka = this->razvertka;
int _razvertka1 = _razvertka;
float x1, x2, x = this->a;
const float b_minus_a = this->b - x;
const PeanoCurve_2D* Curr = this;

text
while (_razvertka != 0u) {
	const int exponent = _razvertka1 - _razvertka-- << 1u;
	x1 = Curr->x1;
	x2 = Curr->x2;
	const List currType = Curr->Type;

	if (target_x1 > x1 && target_x2 > x2) {
		Curr = Curr->TopRight.get();
		if (currType == Top || currType == Right) {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x);
		}
	}
	else if (target_x1 < x1 && target_x2 > x2) {
		Curr = Curr->TopLeft.get();
		if (currType == Top || currType == Left) {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x);
		}
		else {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
		}
	}
	else if (target_x1 < x1 && target_x2 < x2) {
		Curr = Curr->DownLeft.get();
		if (currType == Down || currType == Left) {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x);
		}
	}
	else {
		Curr = Curr->DownRight.get();
		if (currType == Top || currType == Left) {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
		}
		else {
			x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x);
		}
	}
}

return x;

}

static std::unique_ptrboost::mpi::environment g_env;
static std::unique_ptrboost::mpi::communicator g_world;
static std::unique_ptr<PeanoCurve_2D> Curve, Curve_Inverted;

extern "C" __declspec(dllexport) int AgpInit(int peanoLevel, float a, float b, float c, float d) {
if (!g_env) g_env = std::make_uniqueboost::mpi::environment(); // MPI_Init()
if (!g_world) g_world = std::make_uniqueboost::mpi::communicator(); // MPI_COMM_WORLD
int rank = g_world->rank();
if (rank == 0) Curve = std::make_unique<PeanoCurve_2D>(List::Top, peanoLevel, a, b, c, d);
else if (rank == 1) Curve_Inverted = std::make_unique<PeanoCurve_2D>(List::Down, peanoLevel, a, b, c, d);
return rank;
}

extern "C" __declspec(dllexport) void AgpFinalize() {
Curve_Inverted.reset();
Curve.reset();
g_world.reset();
g_env.reset(); // MPI_Finalize()
}

__declspec(align(16u)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& __restrict ar, const uint64_t) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

extern "C" __declspec(dllexport) __declspec(noalias) void Base_LNA_1_2_Mer_AGP(
const float N, const float global_iterations, const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** out_data, uint64_t* out_len) noexcept
{
if (N == 1.0f) {
const float initial_length = b - a;
float dmax = initial_length;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float schetchick = 0.0f;

text
	float x_Rmax_1 = a;
	float x_Rmax_2 = b;
	float y_Rmax_1 = start_val;
	float y_Rmax_2 = best_f;

	std::vector<float> Extr;
	Extr.reserve(global_iterations * 4u);
	std::vector<std::unique_ptr<Interval>> R;
	R.reserve(global_iterations * 2u);

	// Создаем начальный интервал
	R.emplace_back(std::make_unique<Interval>(&a, &b, &start_val, &best_f, &N));
	std::push_heap(R.begin(), R.end(), ComparePtr);
	float Mmax = *R.front()->M;
	float m = r * Mmax;

	const float threshold_03 = 0.3f * initial_length;
	const float inv_threshold_03 = 1.0f / threshold_03;

	float x1, y1, x2, y2, new_point, new_value;

	while (true) {
		new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);
		new_value = ShekelFunc(new_point, seed);

		if (new_value < best_f) {
			best_f = new_value;
			Extr.emplace_back(best_f);
			Extr.emplace_back(new_point);
		}

		// Извлекаем максимальный интервал
		std::pop_heap(R.begin(), R.end(), ComparePtr);
		//const std::unique_ptr<Interval> promejutochny_otrezok = std::move(R.back());
		const std::unique_ptr<Interval>& promejutochny_otrezok = R.back();
		x1 = *promejutochny_otrezok->x1;
		y1 = *promejutochny_otrezok->y1;
		x2 = *promejutochny_otrezok->x2;
		y2 = *promejutochny_otrezok->y2;
		R.pop_back();
		const size_t r_size = R.size();

		const float len2 = x2 - new_point;
		const float len1 = new_point - x1;
		const float dmin = len1 < len2 ? len1 : len2;

		// Условие завершения
		if (++schetchick == global_iterations || dmin < epsilon) {
			if (g_world->rank() == 0) {
				Extr.emplace_back(schetchick);
				Extr.emplace_back(dmin);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}

		// Создаем новые интервалы
		std::unique_ptr<Interval> curr = std::make_unique<Interval>(&x1, &new_point, &y1, &new_value, &N);
		std::unique_ptr<Interval> curr1 = std::make_unique<Interval>(&new_point, &x2, &new_value, &y2, &N);
		const float currM = *curr->M > *curr1->M ? *curr->M : *curr1->M;

		if (len2 + len1 == dmax) {
			dmax = len2 > len1 ? len2 : len1;

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
const float len = *R[i]->x2 - *R[i]->x1;
if (len > dmax) {
dmax = len;
}
}
}

text
		if (mode) {
			if (threshold_03 > dmax && fmodf(schetchick, 3.0f) == 0.0f || 10.0f * dmax < initial_length) {
				if (currM > Mmax) {
					Mmax = currM;
					m = r * Mmax;
				}
				const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				const float alpha = fmaf(progress, progress, 1.0f);
				const float betta = 2.0f - alpha;

				const float MULTIPLIER = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
				const float GLOBAL_FACTOR = betta * global_coeff;

				curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, *curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, *curr1->M * alpha));

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
const float len_item = *R[i]->x2 - *R[i]->x1;
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len_item, *R[i]->M * alpha));
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
		// Добавляем новые интервалы
		R.emplace_back(std::move(curr));
		std::push_heap(R.begin(), R.end(), ComparePtr);
		R.emplace_back(std::move(curr1));
		std::push_heap(R.begin(), R.end(), ComparePtr);

		// Обновляем x_Rmax и y_Rmax
		const std::unique_ptr<Interval>& top = R.front();
		x_Rmax_1 = *top->x1;
		x_Rmax_2 = *top->x2;
		y_Rmax_1 = *top->y1;
		y_Rmax_2 = *top->y2;
	}
}
else {
	const int rank = g_world->rank();
	const int partner = (rank == 0) ? 1 : 0;
	const PeanoCurve_2D* __restrict LocCurve =
		rank ? Curve_Inverted.get() : Curve.get();
	const float divider = ldexpf(1.0f, (LocCurve->razvertka << 1u) + 1u);
	const float inv_divider = 1.0f / divider;
	const float x_addition = (b - a) * inv_divider;
	const float y_addition = (d - c) * inv_divider;
	const float true_start = a + x_addition;
	float x_Rmax_1 = true_start;
	const float true_end = b - x_addition;
	float x_Rmax_2 = true_end;
	const float initial_length = true_end - true_start;
	const float threshold_03 = 0.3f * initial_length;
	float dmax = initial_length;
	const float inv_threshold_03 = 1.0f / threshold_03;
	const float end_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
	float y_Rmax_1 = end_val;
	float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
	float y_Rmax_2 = best_f;

	float   schetchick = 0.0f;
	float mcQueenSpeed = 1.0f;

	std::vector<float> Extr;
	Extr.reserve(global_iterations * 4u);
	std::vector<std::unique_ptr<Interval>> R;
	R.reserve(global_iterations * 2u);

	// Создаем начальный интервал
	R.emplace_back(std::make_unique<Interval>(&true_start, &true_end, &end_val, &best_f, &N));
	float Mmax = *R.front()->M;
	float m = r * Mmax;

	while (true) {
		g_world->irecv(partner, 0, mcQueenSpeed);
		unsigned int dummy;
		boost::mpi::request recv_req = g_world->irecv(partner, 2, dummy);
		boost::optional<boost::mpi::status> status = recv_req.test();
		if (status) {
			if (partner) {
				const std::unique_ptr<Interval>& front = R.front();
				Extr.emplace_back(schetchick);
				Extr.emplace_back(front->x2 - front->x1);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			//g_world->barrier();
			return;
		}
		if (++schetchick == global_iterations) {
			g_world->isend(partner, 2, 0u);
			if (partner) {
				const std::unique_ptr<Interval>& front = R.front();
				Extr.emplace_back(schetchick);
				Extr.emplace_back(front->x2 - front->x1);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			//g_world->barrier();
			return;
		}

		// «Охлаждение» и период T (как в вашей 2D-версии)
		const float cooling = ldexpf(1.0f, -schetchick * (1.0f / 138.63f));
		const float T = floorf(fmaf(20.0f, cooling, 10.0f));
		const float    k = fmaf(0.2f, cooling, 0.7f);

		if (!(fmodf(schetchick, T)) == mcQueenSpeed) {
			// Берём top без копирования (константная ссылка на уникальный указатель)
			const std::unique_ptr<Interval>& topL = R.front();

			// HitTest для концов лучшего интервала
			const PeanoCurve_2D* __restrict p1L = LocCurve->HitTest_2D(*topL->x1);
			const PeanoCurve_2D* __restrict p2L = LocCurve->HitTest_2D(*topL->x2);

			const CrossMsg outbound{ p1L->x1, p1L->x2, p2L->x1, p2L->x2, *topL->R };
			CrossMsg inbound{};

			if (mcQueenSpeed) {
				g_world->isend(partner, 0, 0.0f);
			}
			else {
				mcQueenSpeed = 1.0f;
			}

			//g_world->sendrecv(partner, 1, outbound, partner, 1, inbound);
			///g_world->isend(partner, 1, outbound);
			boost::mpi::request reqs[2];
			reqs[0] = g_world->isend(partner, 1, outbound);
			reqs[1] = g_world->irecv(partner, 1, inbound);
			boost::mpi::wait_all(reqs, reqs + 2);
			///g_world->irecv(partner, 1, inbound);

			// Проекция удалённых точек обратно в параметр x по локальной кривой
			const float newParam1_remote1 = LocCurve->FindX_2D(inbound.s_x1, inbound.s_x2);
			const float newParam2_remote1 = LocCurve->FindX_2D(inbound.e_x1, inbound.e_x2);
			const float newParam1_remote2 = RastriginFunc(inbound.s_x1, inbound.s_x2);
			const float newParam2_remote2 = RastriginFunc(inbound.e_x1, inbound.e_x2);

			// Вставляем «инъекцию» с пониженной характеристикой
			std::unique_ptr<Interval> injected = std::make_unique<Interval>(&newParam1_remote1, &newParam2_remote1, &newParam1_remote2, &newParam2_remote2, &N);
			*injected->R = inbound.Rtop * k;

			R.emplace_back(std::move(injected));
			std::push_heap(R.begin(), R.end(), ComparePtr);
		}

		// ===== Локальный шаг LNA в параметре кривой =====
		const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);

		// Координаты по кривой Пеано
		const PeanoCurve_2D* pc = LocCurve->HitTest_2D(new_point);
		const float new_x1 = pc->x1;
		const float new_x2 = pc->x2;

		const float new_value = RastriginFunc(new_x1, new_x2);

		// Обновление экстремумов (формат записи — как решено в 2D-ветке)
		if (new_value < best_f) {
			best_f = new_value;
			Extr.emplace_back(best_f);
			Extr.emplace_back(new_x1);
			Extr.emplace_back(new_x2);
		}

		// ===== Разбиение лучшего интервала =====
		std::pop_heap(R.begin(), R.end(), ComparePtr);
		const std::unique_ptr<Interval> promejutochny_otrezok = std::move(R.back());
		R.pop_back();

		// Создаём подынтервалы с прямой инициализацией
		std::unique_ptr<Interval> curr = std::make_unique<Interval>(promejutochny_otrezok->x1, &new_point, promejutochny_otrezok->y1, &new_value, &N);
		std::unique_ptr<Interval> curr1 = std::make_unique<Interval>(&new_point, promejutochny_otrezok->x2, &new_value, promejutochny_otrezok->y2, &N);

		const float currM = *curr->M > *curr1->M ? *curr->M : *curr1->M;

		const float len2 = *promejutochny_otrezok->x2 - new_point;
		const float len1 = new_point - *promejutochny_otrezok->x1;
		const size_t r_size = R.size();

		// Обновление dmax — один проход без лишних вызовов
		if (len2 + len1 == dmax) {
			dmax = (len2 > len1 ? len2 : len1);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
const float len = *R[i]->x2 - *R[i]->x1;
if (len > dmax) dmax = len;
}
}

text
		// ===== Перенастройка характеристик R (как в 1D-ветке, но для N=2) =====
		if (mode) {
			if ((threshold_03 > dmax && fmodf(schetchick, 3.0f) == 0.0f) || (10.0f * dmax < initial_length)) {
				if (currM > Mmax) {
					Mmax = currM;
					m = r * Mmax;
				}
				const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				const float alpha = fmaf(progress, progress, 1.0f);
				const float betta = 2.0f - alpha;
				const float MULTIPLIER = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
				const float GLOBAL_FACTOR = betta * global_coeff;

				curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, *curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, *curr1->M * alpha));

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, *R[i]->x2 - *R[i]->x1, *R[i]->M * alpha));
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
		// ===== Добавление подынтервалов и единоразовая перестройка кучи =====
		R.emplace_back(std::move(curr));
		R.emplace_back(std::move(curr1));
		std::push_heap(R.begin(), R.end(), ComparePtr); // достаточно одного push_heap после двух вставок

		// Проверка критерия сходимости по «лучшему» интервалу
		const std::unique_ptr<Interval>& top = R.front();
		const float interval_len = *top->x2 - *top->x1;
		if (interval_len < epsilon) {
			g_world->isend(partner, 2, 0u);
			if (partner) {
				Extr.emplace_back(schetchick);
				Extr.emplace_back(interval_len);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			//g_world->barrier();
			return;
		}

		// Обновляем x_Rmax и y_Rmax (константная ссылка на top, чтобы избежать копии)
		x_Rmax_1 = *top->x1;
		x_Rmax_2 = *top->x2;
		y_Rmax_1 = *top->y1;
		y_Rmax_2 = *top->y2;
	}
}

}

extern "C" __declspec(dllexport) void Base_LNA_1_2_Mer_AGP_Free(float* p) {
CoTaskMemFree(p);
}

// В DLL
extern "C" __declspec(dllexport) void AgpStartWorkers() {
float go = 1.0f;
MPI_Bcast(&go, 1, MPI_INT, /root=/0, MPI_COMM_WORLD);
}

extern "C" __declspec(dllexport) void AgpWaitStartAndRun() {
int go = 0;
MPI_Bcast(&go, 1, MPI_INT, /root=/0, MPI_COMM_WORLD); // блокируется, пока мастер не пошлёт
if (go == 1) {
float* dummy = nullptr; uint64_t dummy_len = 0;
Base_LNA_1_2_Mer_AGP(1.0f, 1000.0f, 0.0f, 10.0f, -2.2f, 1.8f,
2.5f, false, 0.001f, GetTickCount(),
&dummy, &dummy_len);
if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy);
}
} при запуске на двух процессах через mpiexec при запуске основного приложения с загрузкой этой dll получаю что в вектор экстрэмумов заносится только 2-5 значений максимум чаще 3, причём если значения параметров адекватные то значение целевой функции невозможное, похоже проводится только первые пару итераций а потом алгоритм завершается, на выходе я получаю точность по выходу точно нулевую что говорит либо о том что сразу же было попадание в одну и ту же точку либо о том что возможно данные повреждены иногда при другом коде я получал ошибку обычно это свидетельствует о том что другие данные повреждены - не могу отказаться от использования указателей в классе интервал, не могу использовать вектор интервалов а не вектор умных указателей на интервалы, мне кажется что-то не то происходит с памятью или с указателями, может они хранят не то что ожидалось, dll в управляемом коде используется c++/clr

Answer 1

Похоже, у вас тут наложилось сразу несколько “убийц” стабильности: висячие указатели на стековые переменные, неверный компаратор для кучи и несогласованный тип при MPI_Bcast. Из-за этого heap ведёт себя хаотично, интервалы получают мусор, а алгоритм заканчивается практически сразу с “невозможными” значениями.

Ниже — конкретные места и минимальные правки, которые действительно лечат проблему, не отказываясь от указателей и сохраняя vector<unique_ptr<Interval>>.

1) Висячие указатели в `Interval`

Вы храните в Interval сырьевые указатели x1/x2/y1/y2 на чужие float’ы (часто локальные временные переменные). Например:

cpp
auto curr = std::make_unique<Interval>(&x1, &new_point, &y1, &new_value, &N);

x1, y1, new_point, new_value — локальные переменные текущей итерации; после выхода из блока их адреса становятся недействительными, а Interval продолжает ими пользоваться → UB/мусор/NaN/ранний выход.

Исправление (не отказываясь от указателей)

Пусть Interval владеет своими копиями скаляров и хранит указатели на эти копии:

cpp
__declspec(align(64u)) struct Interval final {
public:
    float* x1; float* x2; float* y1; float* y2;
    float* delta_y;
    float* N_factor;
    float* M;
    float* R;

    __declspec(noalias) __forceinline
    Interval(const float* _x1, const float* _x2,
             const float* _y1, const float* _y2,
             const float* _N) noexcept
    {
        x1 = new float(*_x1);
        x2 = new float(*_x2);
        y1 = new float(*_y1);
        y2 = new float(*_y2);

        delta_y = new float(*y2 - *y1);
        const float dx = *x2 - *x1;
        N_factor = new float((*_N == 1.0f) ? dx
                          : (*_N == 2.0f) ? sqrtf(dx)
                                          : powf(dx, 1.0f / *_N));
        M = new float(fabsf(*delta_y) * (1.0f / *N_factor));
        R = new float(0.0f);
    }

    ~Interval() noexcept {
        delete x1; delete x2; delete y1; delete y2;
        delete delta_y; delete N_factor; delete M; delete R;
    }

    __declspec(noalias) __forceinline
    void ChangeCharacteristic(const float _m) noexcept {
        *R = fmaf(-(*y2 + *y1), 2.0f,
             fmaf(_m, *N_factor, (*delta_y * *delta_y) * (1.0f / (_m * *N_factor))));
    }
};

Так вы по-прежнему используете указатели внутри Interval, но больше не ссылаетесь на чужую/временную память. Это устранит “случайные нули”, NaN и “невозможные” значения.

NB: у вас ещё и явные утечки (new float(...) без деструктора). Деструктор выше их закрывает.

2) Неверный компаратор для кучи

Сейчас:

cpp
static __forceinline bool ComparePtr(const std::unique_ptr<Interval>& a,
                                     const std::unique_ptr<Interval>& b) noexcept
{
    return a->R < b->R; // <-- сравнение адресов, а не значений!
}

Это сравнивает адреса float*, а не содержимое. Куча становится “рандомной”.

Исправление

cpp
static __forceinline bool ComparePtr(const std::unique_ptr<Interval>& a,
                                     const std::unique_ptr<Interval>& b) noexcept
{
    return *a->R < *b->R; // делаем max-heap по значению R
}

Сразу после каждого создания интервала не забывайте вызвать ChangeCharacteristic(...) (у вас это уже делается в логике перестройки).

3) Аккуратнее с `pop_heap`/`pop_back`

В ветке N == 1 вы делаете:

cpp
std::pop_heap(R.begin(), R.end(), ComparePtr);
const std::unique_ptr<Interval>& promejutochny_otrezok = R.back();
// читаете поля...
R.pop_back();

Формально вы читаете до pop_back(), поэтому сейчас везёт, но безопаснее вынести владение:

cpp
std::pop_heap(R.begin(), R.end(), ComparePtr);
auto promej = std::move(R.back());
R.pop_back();

float x1 = *promej->x1;
float y1 = *promej->y1;
float x2 = *promej->x2;
float y2 = *promej->y2;

auto curr  = std::make_unique<Interval>(&x1, &new_point, &y1, &new_value, &N);
auto curr1 = std::make_unique<Interval>(&new_point, &x2, &new_value, &y2, &N);

Так вы не держите ссылку на элемент, который вот-вот удалите из вектора, и (с учётом п.1) конструктор скопирует значения, а не адреса временных переменных.

Аналогично подправьте ветку N != 1 (там вы уже делаете auto promejutochny_otrezok = std::move(R.back()); — это правильно).

4) Несогласованный тип в `MPI_Bcast`

В AgpStartWorkers вы рассылаете float go, но указываете MPI_INT:

cpp
extern "C" __declspec(dllexport) void AgpStartWorkers() {
    float go = 1.0f;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD); // тип не совпадает
}

А в приёмнике:

cpp
int go = 0;
MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (go == 1) { ... }

Из-за этого на получателях в go оказывается битовая маска 0x3f800000 (1.0f), условие go == 1 ложно, и ветка с запуском вычислений просто не срабатывает (или срабатывает “рандомно”). Это очень похоже на ваши симптомы “идёт 2–3 итерации и всё”.

Исправление (любой единый тип)

Вариант с int:

cpp
extern "C" __declspec(dllexport) void AgpStartWorkers() {
    int go = 1;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
}

extern "C" __declspec(dllexport) void AgpWaitStartAndRun() {
    int go = 0;
    MPI_Bcast(&go, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (go == 1) {
        // ...
    }
}

Или используйте везде float + MPI_FLOAT. Главное — полная согласованность типа/сигнатуры на всех рангах.

5) Лишний `irecv` без ожидания/хранения запроса

Здесь:

cpp
g_world->irecv(partner, 0, mcQueenSpeed); // request выкидывается, ничем не управляется

Такой небезопасный вызов лучше убрать или сохранять request и выполнять test()/wait(), иначе можно получить “висящие” непредсказуемые приёмы. Это не корень вашей проблемы, но источник нестабильности.

6) Условие с T

cpp
const float T = floorf(fmaf(20.0f, cooling, 10.0f));
const float k = fmaf(0.2f, cooling, 0.7f);

if (!(fmodf(schetchick, T)) == mcQueenSpeed) { ... }

!(fmodf(...)) — это булево, потом сравнивается с mcQueenSpeed (float). Срабатывает, только когда остаток строго 0 и mcQueenSpeed == 1.0f. Если вам нужна проверка “каждые T итераций”, напишите проще и корректнее:

cpp
if (fmodf(schetchick, T) == 0.0f) {
    // ...
}

7) Пара замечаний по стилю/надёжности

С std::vector + куча проще и надёжнее заменить связку make_heap/push_heap/pop_heap на std::priority_queue<unique_ptr<Interval>, vector<unique_ptr<Interval>>, ComparePtr>. Ошибок меньше — логика та же.
Проверьте, что вы всегда вызываете ChangeCharacteristic(m) для новых интервалов до того, как класть их в кучу (у вас это в ветках делается, но легко пропустить при правках).
Для отлова подобных багов сильно помогает AddressSanitizer (MSVC: x64, /fsanitize=address) и статические анализаторы (MSVC /analyze, clang-tidy). На Windows это уже давно работает.

Коротко: что, скорее всего, ломает всё прямо сейчас

ComparePtr сравнивает адресы, а не значения → кривая куча.
Interval хранит указатели на временные переменные → висячие ссылки и мусор.
MPI_Bcast рассылает float, но объявлен как MPI_INT → воркеры не стартуют как задумано.

Исправьте эти три пункта — и, с очень высокой вероятностью, “3 значения экстрэмумов” и “немедленное завершение” уйдут. Если после этого останутся странности, пришлите минимальный лог (первые 10–20 итераций: *top->R, *top->x1/x2, best_f, длина интервала), я по числам быстро подскажу, где ещё тонко.

#include "pch.h" #define LCG_RAND(state) \ do { \ ...

Question

Thought

Evaluating code behavior

Identifying bugs and memory issues

Identifying heap issues and bugs

Identifying MPI broadcasting issues

Addressing pointer issues and bugs

Resolving pointer and memory issues

Summarizing main issues in the code

Answer

1) Висячие указатели в `Interval`

Исправление (не отказываясь от указателей)

2) Неверный компаратор для кучи

Исправление

3) Аккуратнее с `pop_heap`/`pop_back`

4) Несогласованный тип в `MPI_Bcast`

Исправление (любой единый тип)

5) Лишний `irecv` без ожидания/хранения запроса

6) Условие с T

7) Пара замечаний по стилю/надёжности

Коротко: что, скорее всего, ломает всё прямо сейчас

Share this Q&A

Question

Thought

Evaluating code behavior

Identifying bugs and memory issues

Identifying heap issues and bugs

Identifying MPI broadcasting issues

Addressing pointer issues and bugs

Resolving pointer and memory issues

Summarizing main issues in the code

Answer

1) Висячие указатели в Interval

Исправление (не отказываясь от указателей)

2) Неверный компаратор для кучи

Исправление

3) Аккуратнее с pop_heap/pop_back

4) Несогласованный тип в MPI_Bcast

Исправление (любой единый тип)

5) Лишний irecv без ожидания/хранения запроса

6) Условие с T

7) Пара замечаний по стилю/надёжности

Коротко: что, скорее всего, ломает всё прямо сейчас

Share this Q&A

1) Висячие указатели в `Interval`

3) Аккуратнее с `pop_heap`/`pop_back`

4) Несогласованный тип в `MPI_Bcast`

5) Лишний `irecv` без ожидания/хранения запроса