const int rank = g_world->rank(); const int pa...

Question

text
	const int rank = g_world->rank();
	const int partner = (rank == 0) ? 1 : 0;
	const PeanoCurve_2D* __restrict LocCurve =
		rank ? Curve_Inverted.get() : Curve.get();
	const float divider = ldexpf(1.0f, (LocCurve->razvertka << 1u) + 1u);
	const float inv_divider = 1.0f / divider;
	const float x_addition = (b - a) * inv_divider;
	const float y_addition = (d - c) * inv_divider;
	const float true_start = a + x_addition;
	float x_Rmax_1 = true_start;
	const float true_end = b - x_addition;
	float x_Rmax_2 = true_end;
	const float initial_length = true_end - true_start;
	const float threshold_03 = 0.3f * initial_length;
	float dmax = initial_length;
	const float inv_threshold_03 = 1.0f / threshold_03;
	const float end_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
	float y_Rmax_1 = end_val;
	float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
	float y_Rmax_2 = best_f;

	float   schetchick = 0.0f;

	std::vector<float> Extr;
	Extr.reserve(global_iterations * 4u);
	std::vector<std::unique_ptr<Interval>> R;
	R.reserve(global_iterations * 2u);

	// Создаем начальный интервал
	R.emplace_back(std::make_unique<Interval>(&true_start, &true_end, &end_val, &best_f, &N));
	float Mmax = *R.front()->M;
	float m = r * Mmax;

	float new_point, new_value;

	std::unique_ptr<Interval> promejutochny_otrezok;

	while (true) {
		boost::optional<boost::mpi::status> probe_status = g_world->iprobe(partner, 2);
		if (probe_status) {
			// Если есть сообщение с тегом 2, выполняем блокирующий прием
			unsigned int dummy;
			g_world->recv(partner, 2, dummy);
			if (partner) {
				const std::unique_ptr<Interval>& front = R.front();
				Extr.emplace_back(schetchick);
				Extr.emplace_back(*front->x2 - *front->x1);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}
		// Iteration budget exhausted: notify the partner rank (tag 2) and, on the
		// rank whose partner id is non-zero, export the collected results.
		if (++schetchick == global_iterations) {
			g_world->send(partner, 2, 0u);
			if (partner) {
				const std::unique_ptr<Interval>& front = R.front();
				Extr.emplace_back(schetchick);
				// x1/x2 are pointers: dereference them so the interval length is
				// stored, not the distance between two unrelated heap addresses.
				Extr.emplace_back(*front->x2 - *front->x1);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}

		// ===== Локальный шаг LNA в параметре кривой =====
		new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);

		// Координаты по кривой Пеано
		const PeanoCurve_2D* pc = LocCurve->HitTest_2D(new_point);
		const float new_x1 = pc->x1;
		const float new_x2 = pc->x2;

		new_value = RastriginFunc(new_x1, new_x2);

		// Обновление экстремумов (формат записи — как решено в 2D-ветке)
		if (new_value < best_f) {
			best_f = new_value;
			Extr.emplace_back(best_f);
			Extr.emplace_back(new_x1);
			Extr.emplace_back(new_x2);
		}

		// ===== Разбиение лучшего интервала =====
		std::pop_heap(R.begin(), R.end(), ComparePtr);
		//const std::unique_ptr<Interval> promejutochny_otrezok = std::move(R.back());
		promejutochny_otrezok = std::move(R.back());
		R.pop_back();

		// Создаём подынтервалы с прямой инициализацией
		std::unique_ptr<Interval> curr = std::make_unique<Interval>(promejutochny_otrezok->x1, &new_point, promejutochny_otrezok->y1, &new_value, &N);
		std::unique_ptr<Interval> curr1 = std::make_unique<Interval>(&new_point, promejutochny_otrezok->x2, &new_value, promejutochny_otrezok->y2, &N);

		const float currM = *curr->M > *curr1->M ? *curr->M : *curr1->M;

		const float len2 = *promejutochny_otrezok->x2 - new_point;
		const float len1 = new_point - *promejutochny_otrezok->x1;
		const size_t r_size = R.size();

		// Обновление dmax — один проход без лишних вызовов
		if (len2 + len1 == dmax) {
			dmax = (len2 > len1 ? len2 : len1);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
const float len = *R[i]->x2 - *R[i]->x1;
if (len > dmax) dmax = len;
}
}

text
		// ===== Перенастройка характеристик R (как в 1D-ветке, но для N=2) =====
		if (mode) {
			if ((threshold_03 > dmax && fmodf(schetchick, 3.0f) == 0.0f) || (10.0f * dmax < initial_length)) {
				if (currM > Mmax) {
					Mmax = currM;
					m = r * Mmax;
				}
				const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				const float alpha = fmaf(progress, progress, 1.0f);
				const float betta = 2.0f - alpha;
				const float MULTIPLIER = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
				const float GLOBAL_FACTOR = betta * global_coeff;

				curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, *curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, *curr1->M * alpha));

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, *R[i]->x2 - *R[i]->x1, *R[i]->M * alpha));
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
		// ===== Insert both sub-intervals, restoring the heap after each one =====
		// std::push_heap requires [first, last - 1) to already be a heap, so it
		// has to run once per appended element; a single call after two
		// insertions would leave the first new element unsifted.
		R.emplace_back(std::move(curr));
		std::push_heap(R.begin(), R.end(), ComparePtr);
		R.emplace_back(std::move(curr1));
		std::push_heap(R.begin(), R.end(), ComparePtr);

		// Проверка критерия сходимости по «лучшему» интервалу
		const std::unique_ptr<Interval>& top = R.front();
		const float interval_len = *top->x2 - *top->x1;
		if (interval_len < epsilon) {
			g_world->send(partner, 2, 0u);
			if (partner) {
				Extr.emplace_back(schetchick);
				Extr.emplace_back(interval_len);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}

		// Обновляем x_Rmax и y_Rmax (константная ссылка на top, чтобы избежать копии)
		x_Rmax_1 = *top->x1;
		x_Rmax_2 = *top->x2;
		y_Rmax_1 = *top->y1;
		y_Rmax_2 = *top->y2;
	} = это двумерная ветка моего экспортируемого алгоритма extern "C" __declspec(dllexport) __declspec(noalias) void Base_LNA_1_2_Mer_AGP(
const float N, const float global_iterations, const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** out_data, uint64_t* out_len) noexcept, используемый функционал: #include "pch.h"

// Advances `state` (a float lvalue) in place to the next value of a
// hand-rolled pseudo-random sequence. The integer part is removed twice via a
// truncating int cast, so only the fractional part is written back.
// Every line of the body except the last needs a line continuation; without
// them only the empty `#define` line would belong to the macro.
#define LCG_RAND(state) \
do { \
	float reduced = fmodf((state), 1.618033988749895f); \
	reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f); \
	reduced -= (float)(int)(reduced); \
	reduced = fmaf(reduced, reduced + 1.0f, 0.0f); \
	reduced -= (float)(int)(reduced); \
	(state) = reduced; \
} while(0)
// Same recurrence as LCG_RAND, but the fractional result is rescaled with
// 2 * reduced - 1 before being written back to `state` (a float lvalue).
// Every line of the body except the last needs a line continuation.
#define LCG_RAND_GRSH(state) \
do { \
	float reduced = fmodf((state), 1.618033988749895f); \
	reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f); \
	reduced -= (float)(int)(reduced); \
	reduced = fmaf(reduced, reduced + 1.0f, 0.0f); \
	reduced -= (float)(int)(reduced); \
	(state) = fmaf(reduced, 2.0f, -1.0f); \
} while(0)
// Low-order cosine approximation: writes an estimate of cos(x) to
// `result_var`. |x| is reduced modulo 2*pi and folded into [0, pi/2], where
// the polynomial 1 - t^2/2 + t^4/24 is evaluated; the sign is negated for the
// second quarter of the folded range.
// Every line of the body except the last needs a line continuation.
#define FABE13_COS(x, result_var) \
do { \
	float abs_val = fabsf((x)); \
	float reduced = fmodf(abs_val, 6.28318530718f); \
	if (reduced > 3.14159265359f) { \
		reduced = 6.28318530718f - reduced; \
	} \
	if (reduced < 1.57079632679f) { \
		float val2 = reduced * reduced; \
		result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f); \
	} else { \
		reduced = 3.14159265359f - reduced; \
		float val2 = reduced * reduced; \
		result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f); \
	} \
} while(0)
// Low-order sine/cosine approximation for `n` floats read from `in`; results
// are written to `sin_out` and `cos_out`.
//   * When n >= 16, full groups of 16 elements are processed with AVX-512
//     intrinsics using aligned loads/stores, so the three arrays must be
//     64-byte aligned on that path.
//   * The remaining elements (all of them when n < 16) go through the scalar
//     tail, which applies the same polynomials.
// 12582912.0f is 1.5 * 2^23: adding and subtracting it rounds ax / (2*pi) to
// an integer.
// The sine sign is flipped when exactly one of "x < 0" and "r > pi" holds.
// The scalar tail expresses this with XOR, so the vector path must use a
// mask XOR as well (XNOR would negate the sine of ordinary positive inputs).
// NOTE(review): because the quotient is rounded to nearest, r can be negative
// (down to about -pi); for r below -pi/2 neither fold applies and the
// polynomials are evaluated far from zero. Confirm whether that accuracy loss
// is acceptable before relying on inputs beyond pi/2.
// Every line of the body except the last needs a line continuation.
#define FABE13_SINCOS(in, sin_out, cos_out, n) \
do { \
	int i = 0; \
	int limit = (n) & ~15; \
	if ((n) >= 16) { \
		const __m512 VEC_TWOPI = _mm512_set1_ps(6.28318530718f); \
		const __m512 VEC_PI = _mm512_set1_ps(3.14159265359f); \
		const __m512 VEC_PI_2 = _mm512_set1_ps(1.57079632679f); \
		const __m512 INV_TWOPI = _mm512_set1_ps(0.15915494309189535f); \
		const __m512 BIAS = _mm512_set1_ps(12582912.0f); \
		const __m512 VEC_COS_P3 = _mm512_set1_ps(0.0416666667f); \
		const __m512 VEC_COS_P1 = _mm512_set1_ps(-0.5f); \
		const __m512 VEC_COS_P0 = _mm512_set1_ps(1.0f); \
		const __m512 VEC_SIN_P1 = _mm512_set1_ps(-0.16666666f); \
		const __m512 VEC_SIN_P0 = _mm512_set1_ps(1.0f); \
		const __m512 VEC_NEG1 = _mm512_set1_ps(-1.0f); \
		const __m512 VEC_ZERO = _mm512_setzero_ps(); \
		while (i < limit) { \
			__m512 vx = _mm512_load_ps(&(in)[i]); \
			__m512 vax = _mm512_abs_ps(vx); \
			__m512 q = _mm512_fmadd_ps(vax, INV_TWOPI, BIAS); \
			q = _mm512_sub_ps(q, BIAS); \
			__m512 r = _mm512_fnmadd_ps(VEC_TWOPI, q, vax); \
			__m512 r1 = _mm512_min_ps(r, _mm512_sub_ps(VEC_TWOPI, r)); \
			__m512 r2 = _mm512_min_ps(r1, _mm512_sub_ps(VEC_PI, r1)); \
			__mmask16 m_pi = _mm512_cmp_ps_mask(r, VEC_PI, _CMP_GT_OQ); \
			__mmask16 m_pi_2 = _mm512_cmp_ps_mask(r1, VEC_PI_2, _CMP_GT_OQ); \
			__mmask16 m_negx = _mm512_cmp_ps_mask(vx, VEC_ZERO, _CMP_LT_OQ); \
			__mmask16 m_sinflip = _kxor_mask16(m_negx, m_pi); \
			__m512 t2 = _mm512_mul_ps(r2, r2); \
			__m512 vcos = _mm512_fmadd_ps(t2, _mm512_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0); \
			__m512 vsin = _mm512_mul_ps(_mm512_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2); \
			vcos = _mm512_mask_mul_ps(vcos, m_pi_2, vcos, VEC_NEG1); \
			vsin = _mm512_mask_mul_ps(vsin, m_sinflip, vsin, VEC_NEG1); \
			_mm512_store_ps(&(cos_out)[i], vcos); \
			_mm512_store_ps(&(sin_out)[i], vsin); \
			i += 16; \
		} \
	} \
	while (i < (n)) { \
		float x = (in)[i]; \
		float ax = fabsf(x); \
		float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
		q -= 12582912.0f; \
		float r = fmaf(-6.28318530718f, q, ax); \
		bool sflip = (r > 3.14159265359f); \
		if (sflip) r = 6.28318530718f - r; \
		bool cflip = (r > 1.57079632679f); \
		if (cflip) r = 3.14159265359f - r; \
		float t2 = r * r; \
		float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f); \
		float s = fmaf(t2, -0.16666666f, 1.0f) * r; \
		(cos_out)[i] = cflip ? -c : c; \
		(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
		++i; \
	} \
} while(0)

// Evaluates a sum of ten reciprocal terms at `x`. The coefficients of every
// term are produced by three consecutive LCG_RAND draws starting from `seed`,
// so the same seed always describes the same function.
// Returns the negated sum of 1 / delimiter over the ten terms.
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
	float current_state = seed;
	float res = 0.0f;

	for (int term = 0; term < 10; ++term) {
		// The draw order is part of the function definition: offset first,
		// then the quadratic coefficient, then the constant part.
		LCG_RAND(current_state);
		const float x_part = fmaf(-current_state, 10.0f, x);
		LCG_RAND(current_state);
		const float current_res = current_state;
		LCG_RAND(current_state);
		float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f),
			x_part * x_part,
			fmaf(current_state, 0.2f, 1.0f));
		// Keep |delimiter| at least FLT_MIN (sign preserved) before dividing.
		delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
		res -= 1.0f / delimiter;
	}

	return res;
}

// Two-argument objective used by the 2D branch.
// With s = x1^2 + x2^2 and the FABE13_COS estimates of cos(2*pi*x1) and
// cos(2*pi*x2), the result is
//   (s - (10 * (cos1 + cos2) - 14.6)) * (18.42 - 0.25 * s).
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
	constexpr float kTwoPi = 6.28318530717958647692f;

	float cos_first;
	FABE13_COS(kTwoPi * x1, cos_first);
	float cos_second;
	FABE13_COS(kTwoPi * x2, cos_second);

	const float sq_norm = fmaf(x1, x1, x2 * x2);
	const float oscillation = fmaf(cos_first + cos_second, 10.0f, -14.6f);
	const float envelope = fmaf(-sq_norm, 0.25f, 18.42f);
	return (sq_norm - oscillation) * envelope;
}

// Evaluates a 14-harmonic trigonometric sum at `x`. Harmonic k (1..14) uses
// the angle 2*pi*x*k. All coefficients have the form 2*state - 1.1, where
// `state` comes from consecutive LCG_RAND draws starting at `seed`. The first
// draw gives the constant term; the harmonics are then visited from the
// highest to the lowest, each consuming two draws (sine coefficient first).
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
	__declspec(align(64u)) float angles[14u];
	const float start_angle = 6.28318530717958647692f * x;

#pragma loop(ivdep)
	for (int64_t j = 0; j < 14; ++j) {
		angles[j] = start_angle * (j + 1);
	}

	__declspec(align(64u)) float sin_vals[14u];
	__declspec(align(64u)) float cos_vals[14u];
	FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

	float current_state = seed;
	LCG_RAND(current_state);
	float res = fmaf(current_state, 2.0f, -1.1f);

	for (int64_t j = 13; j >= 0; --j) {
		LCG_RAND(current_state);
		const float tmp_state = current_state;
		LCG_RAND(current_state);
		res += fmaf(fmaf(tmp_state, 2.0f, -1.1f), sin_vals[j],
			fmaf(current_state, 2.0f, -1.1f) * cos_vals[j]);
	}

	return res;
}

// Evaluates an 8x8 double trigonometric sum at (x1, x2) and returns
// -sqrt(part1^2 + part2^2). Index j (7 down to 0) selects sin^2(pi*(j+1)*x1)
// and index k (0..7) selects cos^2(pi*(k+1)*x2). The four coefficients of
// every (j, k) pair come from LCG_RAND_GRSH draws that start at `seed`, so
// the draw order below is part of the function definition.
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
	__declspec(align(64u)) float angles_j[8u];
	__declspec(align(64u)) float angles_k[8u];

#pragma loop(ivdep)
	for (int64_t j = 0; j < 8; ++j) {
		const float pj_mult = 3.14159265358979323846f * (j + 1);
		angles_j[j] = pj_mult * x1;
		angles_k[j] = pj_mult * x2;
	}

	__declspec(align(64u)) float sin_j[8u], cos_j[8u];
	__declspec(align(64u)) float sin_k[8u], cos_k[8u];
	FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
	FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

	float part1 = 0.0f;
	float part2 = 0.0f;
	float current_state = seed;
	for (int64_t j = 7; j >= 0; --j) {
		// Depends on j only, so it is computed once per outer iteration.
		const float sin_term = sin_j[j] * sin_j[j];
		for (size_t k = 0u; k < 8u; ++k) {
			const float cos_term = cos_k[k] * cos_k[k];

			LCG_RAND_GRSH(current_state);
			float tmp_state = current_state;
			LCG_RAND_GRSH(current_state);
			part1 = fmaf(tmp_state, sin_term,
				fmaf(current_state, cos_term, part1));
			// tmp_state is advanced on its own copy; current_state keeps
			// driving the main sequence.
			LCG_RAND_GRSH(tmp_state);
			LCG_RAND_GRSH(current_state);
			part2 = fmaf(-tmp_state, cos_term,
				fmaf(current_state, sin_term, part2));
		}
	}

	return -sqrtf(fmaf(part1, part1, part2 * part2));
}

// Computes the next trial point inside the interval [x1, x2].
//   _m     - current slope estimate used as the divisor
//   y1, y2 - function values at x1 and x2
//   _N     - dimension: 1 uses the linear formula, 2 the squared one, any
//            other value the general powf form
//   _r     - multiplier applied to the powered difference (unused for _N == 1)
// For _N == 1 the result is 0.5 * (x1 + x2 - (y2 - y1) / _m).
// For _N >= 2 the result is 0.5 * (x1 + x2 -/+ _r * (y2 - y1)^_N / _m^_N),
// subtracting when y2 > y1 and adding otherwise.
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
	const float y2, const uint64_t _N, const float _r) noexcept
{
	const float diff = y2 - y1;
	const float sum = x1 + x2;

	if (_N == 1u) {
		return fmaf(-(1.0f / _m), diff, sum) * 0.5f;
	}

	float inv_pow;   // 1 / _m^_N
	float diff_pow;  // (y2 - y1)^_N
	if (_N == 2u) {
		inv_pow = 1.0f / (_m * _m);
		diff_pow = diff * diff;
	}
	else {
		inv_pow = 1.0f / powf(_m, _N);
		diff_pow = powf(diff, _N);
	}

	return fmaf(diff > 0.0f ? -inv_pow : inv_pow, diff_pow * _r, sum) * 0.5f;
}

// One search interval together with its derived quantities.
// The public members stay pointers so existing callers can keep writing
// `*interval->x1`, but they now point at storage inside the object itself
// instead of eight separate heap allocations. That removes the throwing
// `new` calls from the noexcept constructor and leaves nothing to free.
// The type is neither copyable nor movable, so the self-referencing pointers
// remain valid for the whole lifetime of the object.
__declspec(align(64u)) struct Interval final {

public:
	const float* x1;        // left end of the interval
	const float* x2;        // right end of the interval
	const float* y1;        // function value at x1
	const float* y2;        // function value at x2
	const float* delta_y;   // y2 - y1
	const float* N_factor;  // (x2 - x1)^(1/N)
	const float* M;         // |delta_y| / N_factor
	float* R;               // characteristic, set by ChangeCharacteristic()

	// Copies the pointed-to values; the arguments are not retained.
	// *_N is the dimension: 1 and 2 are special-cased, any other value goes
	// through powf.
	__declspec(noalias) __forceinline Interval(const float* const _x1, const float* const _x2,
		const float* const _y1, const float* const _y2, const float* const _N) noexcept
		: x1(&x1_), x2(&x2_), y1(&y1_), y2(&y2_),
		delta_y(&delta_y_), N_factor(&N_factor_), M(&M_), R(&R_),
		x1_(*_x1), x2_(*_x2), y1_(*_y1), y2_(*_y2),
		delta_y_(y2_ - y1_),
		N_factor_(*_N == 1.0f ? x2_ - x1_ :
			*_N == 2.0f ? sqrtf(x2_ - x1_) :
			powf(x2_ - x1_, 1.0f / *_N)),
		M_(fabsf(delta_y_) * (1.0f / N_factor_)),
		R_(0.0f)
	{
	}

	// Recomputes the characteristic for the slope estimate `_m`:
	//   R = _m * N_factor + delta_y^2 / (_m * N_factor) - 2 * (y1 + y2)
	__declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
	{
		R_ = fmaf(
			-(y2_ + y1_),
			2.0f,
			fmaf(_m, N_factor_, (delta_y_ * delta_y_) * (1.0f / (_m * N_factor_))));
	}

	~Interval() noexcept = default;

	Interval(const Interval&) = delete;
	Interval& operator=(const Interval&) = delete;

private:
	// Backing storage for the public pointers. Declared after them, so the
	// pointers are set first and the values are then filled in this order.
	float x1_;
	float x2_;
	float y1_;
	float y2_;
	float delta_y_;
	float N_factor_;
	float M_;
	float R_;
};

// Heap ordering predicate: `a` sorts before `b` when its characteristic R is
// smaller, so the std heap algorithms keep the largest R at the front.
// A reference cannot itself be const-qualified in standard C++, so the
// trailing `const` after `&` is dropped; the referenced unique_ptr stays const.
static __declspec(noalias) __forceinline bool ComparePtr(const std::unique_ptr<Interval>& __restrict a, const std::unique_ptr<Interval>& __restrict b) noexcept
{
	return *a->R < *b->R;
}

// Walks the curve tree from this node down `razvertka` levels and returns the
// node that the curve parameter `x` (taken relative to this->a) falls into.
// At every level the remaining offset is split into quarters; the quarter
// index together with the orientation of the current node selects the child.
const __declspec(noalias) __forceinline PeanoCurve_2D* PeanoCurve_2D::HitTest_2D(float x) const noexcept
{
	const int depth = this->razvertka;
	const float origin = this->a;
	const float b_minus_a = this->b - origin;
	const float inv_b_minus_a = 1.0f / b_minus_a;
	const PeanoCurve_2D* Curr = this;
	x -= origin;

	for (int level = 1; level <= depth; ++level) {
		// The original `1u << ++i + i` both modified and read `i` without
		// sequencing, which is undefined behaviour. The intended value is
		// 4^level, matching the 2^(-2*level) factor used right below.
		const int bits = level << 1;
		const int cells = 1 << bits;
		const int num = static_cast<int>(cells * x * inv_b_minus_a);
		// Remove the part of the offset that the chosen quarter accounts for.
		x = fmaf(-ldexpf(1.0f, -bits) * num, b_minus_a, x);
		const List currType = Curr->Type;

		switch (num) {
		case 0:
			Curr = (currType == Top || currType == Right)
				? Curr->DownLeft.get()
				: Curr->TopRight.get();
			break;
		case 1:
			Curr = (currType == Top || currType == Left)
				? Curr->TopLeft.get()
				: Curr->DownRight.get();
			break;
		case 2:
			Curr = (currType == Top || currType == Right)
				? Curr->TopRight.get()
				: Curr->DownLeft.get();
			break;
		default:
			Curr = (currType == Top || currType == Left)
				? Curr->DownRight.get()
				: Curr->TopLeft.get();
		}
	}

	return Curr;
}

// Inverse lookup: descends `razvertka` levels towards the quadrant that
// contains (target_x1, target_x2) and accumulates the matching curve
// parameter, starting from this->a. At level L (counted from 0) a quarter of
// the parameter range is worth 0.25 * 2^(-2L) * (b - a).
// NOTE(review): the TopRight and DownLeft branches add an offset for only one
// pair of orientations, and DownLeft tests `Down || Left` where the other
// branches test `Top || ...`. This is preserved as found; confirm against the
// curve construction.
const __declspec(noalias) __forceinline float PeanoCurve_2D::FindX_2D(const float target_x1, const float target_x2) const noexcept
{
	const int depth = this->razvertka;
	float x = this->a;
	const float b_minus_a = this->b - x;
	const PeanoCurve_2D* Curr = this;

	for (int level = 0; level < depth; ++level) {
		// Same value as the original `(_razvertka1 - _razvertka--) << 1u`.
		const float scale = ldexpf(1.0f, -(level << 1));
		const float x1 = Curr->x1;
		const float x2 = Curr->x2;
		const List currType = Curr->Type;

		if (target_x1 > x1 && target_x2 > x2) {
			Curr = Curr->TopRight.get();
			if (currType == Top || currType == Right) {
				x = fmaf(scale * 0.5f, b_minus_a, x);
			}
		}
		else if (target_x1 < x1 && target_x2 > x2) {
			Curr = Curr->TopLeft.get();
			if (currType == Top || currType == Left) {
				x = fmaf(scale * 0.25f, b_minus_a, x);
			}
			else {
				x = fmaf(scale * 0.75f, b_minus_a, x);
			}
		}
		else if (target_x1 < x1 && target_x2 < x2) {
			Curr = Curr->DownLeft.get();
			if (currType == Down || currType == Left) {
				x = fmaf(scale * 0.5f, b_minus_a, x);
			}
		}
		else {
			// Also reached when a target coordinate equals the node centre.
			Curr = Curr->DownRight.get();
			if (currType == Top || currType == Left) {
				x = fmaf(scale * 0.75f, b_minus_a, x);
			}
			else {
				x = fmaf(scale * 0.25f, b_minus_a, x);
			}
		}
	}

	return x;
}

static std::unique_ptrboost::mpi::environment g_env;
static std::unique_ptrboost::mpi::communicator g_world;
static std::unique_ptr<PeanoCurve_2D> Curve, Curve_Inverted; extern "C" __declspec(dllexport) int AgpInit(int peanoLevel, float a, float b, float c, float d) {
if (!g_env) g_env = std::make_uniqueboost::mpi::environment(); // MPI_Init()
if (!g_world) g_world = std::make_uniqueboost::mpi::communicator(); // MPI_COMM_WORLD
int rank = g_world->rank();
if (rank == 0) Curve = std::make_unique<PeanoCurve_2D>(List::Top, peanoLevel, a, b, c, d);
else if (rank == 1) Curve_Inverted = std::make_unique<PeanoCurve_2D>(List::Down, peanoLevel, a, b, c, d);
return rank;
}

// Releases everything AgpInit created. The communicator is dropped before the
// environment, because destroying the environment is what finalises MPI.
extern "C" __declspec(dllexport) void AgpFinalize() {
	Curve_Inverted.reset();
	Curve.reset();
	g_world.reset();
	g_env.reset();
}

// Frees a result buffer returned through `out_data` by Base_LNA_1_2_Mer_AGP
// (that buffer is allocated with CoTaskMemAlloc).
extern "C" __declspec(dllexport) void Base_LNA_1_2_Mer_AGP_Free(float* p) {
	CoTaskMemFree(p);
}

// In the DLL: broadcasts the start flag (value 1) from rank 0 over
// MPI_COMM_WORLD. The matching MPI_Bcast is posted by AgpWaitStartAndRun.
extern "C" __declspec(dllexport) void AgpStartWorkers() {
	int start_flag = 1;
	MPI_Bcast(&start_flag, 1, MPI_INT, 0, MPI_COMM_WORLD);
}

// Worker-side entry point: blocks in MPI_Bcast until rank 0 broadcasts the
// start flag (see AgpStartWorkers), then runs the 2D search with fixed
// parameters and frees whatever result buffer comes back.
extern "C" __declspec(dllexport) void AgpWaitStartAndRun() {
	int go = 0;
	MPI_Bcast(&go, 1, MPI_INT, /*root=*/0, MPI_COMM_WORLD); // blocks until the master broadcasts
	if (go == 1) {
		float* dummy = nullptr;
		uint64_t dummy_len = 0;
		Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
			2.5f, false, 0.001f, static_cast<float>(GetTickCount()),
			&dummy, &dummy_len);
		if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy);
	}
}
, код MyForm.cpp: [STAThread]
// Host entry point. Every MPI rank loads the DLL and calls AgpInit. Rank 0
// then runs the WinForms UI, and every other rank waits for the start signal
// and runs the computation without a UI.
// NOTE(review): if the form is closed without a run ever being started, the
// worker is still blocked in its broadcast when rank 0 reaches AgpFinalize.
// Confirm how shutdown is meant to be signalled.
int main() {
	HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
	// Check the module handle before it is handed to GetProcAddress.
	if (!h) { MessageBox::Show(L"Не удалось инициализировать DLL/MPI"); return 1; }

	auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
	auto AgpFinalize = (PFin)GetProcAddress(h, "AgpFinalize");
	if (!AgpInit || !AgpFinalize) {
		MessageBox::Show(L"Не удалось инициализировать DLL/MPI");
		FreeLibrary(h);
		return 1;
	}

	const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

	if (rank == 0) {
		Application::EnableVisualStyles();
		Application::SetCompatibleTextRenderingDefault(false);
		Application::Run(gcnew TESTAGP::MyForm(h));
	}
	else {
		// Worker process: waits for the "start" command and computes without a UI.
		auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
		if (AgpWaitStartAndRun) AgpWaitStartAndRun();
	}

	AgpFinalize();
	FreeLibrary(h);
	return 0;
}
, MyForm.h: ref class MyForm : public System::Windows::Forms::Form {
public:
MyForm(HMODULE hLib) : hLib(hLib) { // Передаем дескриптор из main
InitializeComponent();
// Загружаем функции из DLL
f = (agp_c)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP");
ffree = (agp_free)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP_Free");
pStart = (start_workers)GetProcAddress(hLib, "AgpStartWorkers");
}

text
protected:
	~MyForm() {
		delete components;
	}

private:
	HINSTANCE hLib = nullptr;
	typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, uint64_t*);
	typedef void(__cdecl* agp_free)(float*);
	typedef void(__cdecl* start_workers)();

	agp_c f = nullptr;
	agp_free ffree = nullptr;
	start_workers pStart = nullptr;

	System::Windows::Forms::Button^ button1;
	System::Windows::Forms::TextBox^ textBox2;
	System::Windows::Forms::DataVisualization::Charting::Chart^ chart2;
	System::Windows::Forms::Label^ label2;
	System::Windows::Forms::Label^ label3;
	System::Windows::Forms::TextBox^ textBox1;
	System::Windows::Forms::TextBox^ textBox3;
	System::Windows::Forms::TextBox^ textBox4;
	System::Windows::Forms::TextBox^ textBox5;
	System::Windows::Forms::TextBox^ textBox6;
	System::Windows::Forms::Label^ label6;
	System::Windows::Forms::Label^ label7;
	System::Windows::Forms::Label^ label8;
	System::Windows::Forms::Label^ label9;
	System::Windows::Forms::Label^ label10;
	System::Windows::Forms::Label^ label1;
	System::Windows::Forms::TextBox^ textBox7;
	System::Windows::Forms::TextBox^ textBox8;
	System::ComponentModel::Container^ components;

	System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) {
		chart2->Series[0]->Points->Clear();
		chart2->Series[1]->Points->Clear();
		chart2->Series[2]->Points->Clear();
		chart2->Series[3]->Points->Clear();

		static LARGE_INTEGER start, end;
		QueryPerformanceCounter(&start);

		const float seed = GetTickCount();

		float* buf = nullptr;
		uint64_t len = 0u;

		if (pStart) pStart();  // разбудили rank != 0
		f(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, seed, &buf, &len);

		if (buf != nullptr) {
			std::vector<float> Extr_2D(buf, buf + len);
			ffree(buf);

			// Для 2D ветки: извлекаем schetchick и interval_len
			textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick
			Extr_2D.pop_back();
			textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len
			Extr_2D.pop_back();

			// Извлекаем последнюю тройку (x2, x1, f)
			float x2 = Extr_2D.back();
			Extr_2D.pop_back();
			float x1 = Extr_2D.back();
			Extr_2D.pop_back();
			float f_val = Extr_2D.back();
			Extr_2D.pop_back();

			// Выводим координаты и значение функции
			textBox4->Text = Convert::ToString(x1);
			textBox3->Text = Convert::ToString(x2);
			textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции

			// Отображаем точку на графике
			chart2->Series[2]->Points->AddXY(x1, x2);

			// Обрабатываем остальные точки (в порядке: f, x1, x2)
			while (!Extr_2D.empty()) {
				float x2_point = Extr_2D.back();
				Extr_2D.pop_back();
				float x1_point = Extr_2D.back();
				Extr_2D.pop_back();
				float f_point = Extr_2D.back();
				Extr_2D.pop_back();

				// Добавляем точку на график (только координаты)
				chart2->Series[0]->Points->AddXY(x1_point, x2_point);
			}
		}

		QueryPerformanceCounter(&end);

		LARGE_INTEGER freq;
		QueryPerformanceFrequency(&freq);
		const uint64_t multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL;

		textBox5->Text = Convert::ToString(
			((end.QuadPart - start.QuadPart) * multiplier) >> 32
		) + " microseconds"; когда запускаю приложение через mpiexec : C:\Users\maxim>mpiexec -n 2 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.Runtime.InteropServices.SEHException: Внешний компонент создал исключение.
в main() в D:\TEST_AGP\MyForm.cpp:строка 28
в mainCRTStartup()

job aborted:
[ranks] message

[0] terminated

[1] process exited without calling finalize

---- error analysis -----

[1] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc000001d

---- error analysis ----- то есть у меня есть dll библиотека которую использует управляемое приложение, и библиотека и управляемое приложение используют mpi - подскажи в чём может быть проблема

Answer 1

Код падает с 0xC000001D — это STATUS_ILLEGAL_INSTRUCTION (“недопустимая инструкция”). В вашем проекте есть очень вероятная причина: вы используете AVX-512-интринсики (_mm512_*) в FABE13_SINCOS. Если процессор или ОС не поддерживают AVX-512 (а в клиентских Windows AVX-512 часто отключён даже на CPU, которые его умеют), любое выполнение такой инструкции приводит ровно к этому исключению.

Даже если у вас n < 16 и вы “логически” не заходите в AVX-512-ветку, при сборке всего TU с /arch:AVX512 компилятор может авто-векторизовать другие циклы под zmm-регистры. В итоге падение может случаться сразу при первом заходе в DLL (как у вас — на AgpInit(...) в обоих процессах под mpiexec).

Как быстро подтвердить гипотезу

Пересоберите DLL (и EXE, если там те же TU) без AVX-512:
MSVC: Project Properties → C/C++ → Code Generation → Enable Enhanced Instruction Set → No Enhanced Instructions/AVX2.
Либо просто удалите/закомментируйте AVX-512-интринсики и пересоберите.
Запустите снова:
```bat
mpiexec -n 2 D:\TEST_AGP\x64\Release\TEST_AGP.exe
```
Если падение исчезло — причина именно в AVX-512.

Правильное исправление

Сделайте рантайм-диспетчеризацию и оставьте по умолчанию скаляр/AVX2.

cpp
#include <immintrin.h>
#include <intrin.h>

// Returns true only when AVX-512F code may safely be executed: the CPU
// reports the feature and the OS has enabled the XMM, YMM, opmask and ZMM
// register state in XCR0.
static bool has_avx512() {
    int info[4];
    __cpuidex(info, 0, 0);
    if (info[0] < 7) return false;                    // CPUID leaf 7 is not available

    // XGETBV itself raises an illegal-instruction fault unless the OS has set
    // CR4.OSXSAVE, which CPUID.1:ECX bit 27 reports. Check it before _xgetbv.
    __cpuidex(info, 1, 0);
    const bool osxsave = (info[2] & (1 << 27)) != 0;
    if (!osxsave) return false;

    __cpuidex(info, 7, 0);
    const bool avx512f = (info[1] & (1 << 16)) != 0;  // CPUID.(7,0):EBX bit 16 = AVX-512F
    if (!avx512f) return false;

    // XCR0 bits: 1 = XMM, 2 = YMM, 5 = opmask, 6 = ZMM_Hi256, 7 = Hi16_ZMM.
    const unsigned long long required =
        0x6ULL | (1ULL << 5) | (1ULL << 6) | (1ULL << 7);
    return (_xgetbv(0) & required) == required;
}

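И в FABE13_SINCOS включайте AVX-512-ветку только если has_avx512() && (n >= 16), иначе используйте уже имеющуюся скалярную часть. Идеально — вынести AVX-512-реализацию в отдельный .cpp, собрать только его с /arch:AVX512, а основной код — с /arch:AVX2. В MSVC встроенного механизма диспетчеризации по CPU нет, поэтому там выбор ветки делается вручную (например, через указатель на функцию, выбранный один раз по результату has_avx512()). В компиляторах Intel C++ и Clang для этого есть расширение cpu_dispatch/cpu_specific; схематично (имена целевых процессоров ниже условные, точные см. в документации компилятора):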

cpp
__declspec(cpu_specific(avx512f)) void FABE13_SINCOS_impl_avx512(...);
__declspec(cpu_specific(avx2))    void FABE13_SINCOS_impl_avx2(...);
__declspec(cpu_specific(sse2))    void FABE13_SINCOS_impl_sse2(...);

__declspec(cpu_dispatch(avx512f, avx2, sse2))
void FABE13_SINCOS(...);

Ещё что проверить (вдобавок к AVX-512)

Сигнатуры экспортов. Для Base_LNA_1_2_Mer_AGP у вас extern "C" __declspec(dllexport) void ..., значит имя без манглинга. Тип в форме должен соответствовать ровно:
```cpp
typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float,
                             bool, float, float, float**, uint64_t*);
```
(9 float + bool до двух указателей). Если случайно указали __stdcall или иное количество аргументов — будет краш.
AgpFinalize: убедитесь, что там действительно вызывается g_env.reset(); (чтобы сделать MPI_Finalize()), а не оборванный код — в вашем фрагменте строка с g_ обрезана.
Смешение Boost.MPI и “чистого” MPI — допустимо, но используйте один и тот же рантайм (MS-MPI) и одинаковый коммуникатор (MPI_COMM_WORLD у вас в порядке). Последовательность у вас правильная: AgpInit() → environment (MPI_Init), затем в воркере AgpWaitStartAndRun() блокируется на MPI_Bcast, мастер будит через AgpStartWorkers().

Мини-чек-лист

Пересобрать без AVX-512 → проверить.
Если всё ОК — вернуть AVX-512 только с рантайм-проверкой и/или отдельным TU.
Дважды проверить соответствие typedef/GetProcAddress и extern "C".
В AgpFinalize() гарантировать вызов MPI_Finalize через g_env.reset().

С высокой вероятностью корень проблемы — именно AVX-512. После отключения или грамотной диспетчеризации код под mpiexec -n 2 должен перестать падать.

const int rank = g_world->rank(); const int pa...

Вопрос

Мысль

Diagnosing MPI app crash

Understanding AVX-512 crash behavior

Investigating calling convention mismatch

Checking function arguments

Investigating the crash

Identifying possible crash causes

Diagnosing illegal instruction error

Ответ

Как быстро подтвердить гипотезу

Правильная фиксация

Ещё что проверить (вдобавок к AVX-512)

Мини-чек-лист

Поделиться этим Q&A