const int rank = g_world->rank(); const int pa...

Создано: 29 сентября 2025 г.

Отвечено с использованием GPT-5 Thinking от Chat01

Вопрос

text
const int rank = g_world->rank(); const int partner = (rank == 0) ? 1 : 0; const PeanoCurve_2D* __restrict LocCurve = rank ? Curve_Inverted.get() : Curve.get(); const float divider = ldexpf(1.0f, (LocCurve->razvertka << 1u) + 1u); const float inv_divider = 1.0f / divider; const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; float x_Rmax_1 = true_start; const float true_end = b - x_addition; float x_Rmax_2 = true_end; const float initial_length = true_end - true_start; const float threshold_03 = 0.3f * initial_length; float dmax = initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float end_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float y_Rmax_1 = end_val; float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_2 = best_f; float schetchick = 0.0f; std::vector<float> Extr; Extr.reserve(global_iterations * 4u); std::vector<std::unique_ptr<Interval>> R; R.reserve(global_iterations * 2u); // Создаем начальный интервал R.emplace_back(std::make_unique<Interval>(&true_start, &true_end, &end_val, &best_f, &N)); float Mmax = *R.front()->M; float m = r * Mmax; float new_point, new_value; std::unique_ptr<Interval> promejutochny_otrezok; while (true) { boost::optional<boost::mpi::status> probe_status = g_world->iprobe(partner, 2); if (probe_status) { // Если есть сообщение с тегом 2, выполняем блокирующий прием unsigned int dummy; g_world->recv(partner, 2, dummy); if (partner) { const std::unique_ptr<Interval>& front = R.front(); Extr.emplace_back(schetchick); Extr.emplace_back(*front->x2 - *front->x1); *out_len = Extr.size(); *out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } if (++schetchick == global_iterations) { g_world->send(partner, 2, 0u); if 
(partner) { const std::unique_ptr<Interval>& front = R.front(); Extr.emplace_back(schetchick); Extr.emplace_back(front->x2 - front->x1); *out_len = Extr.size(); *out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } // ===== Локальный шаг LNA в параметре кривой ===== new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r); // Координаты по кривой Пеано const PeanoCurve_2D* pc = LocCurve->HitTest_2D(new_point); const float new_x1 = pc->x1; const float new_x2 = pc->x2; new_value = RastriginFunc(new_x1, new_x2); // Обновление экстремумов (формат записи — как решено в 2D-ветке) if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1); Extr.emplace_back(new_x2); } // ===== Разбиение лучшего интервала ===== std::pop_heap(R.begin(), R.end(), ComparePtr); //const std::unique_ptr<Interval> promejutochny_otrezok = std::move(R.back()); promejutochny_otrezok = std::move(R.back()); R.pop_back(); // Создаём подынтервалы с прямой инициализацией std::unique_ptr<Interval> curr = std::make_unique<Interval>(promejutochny_otrezok->x1, &new_point, promejutochny_otrezok->y1, &new_value, &N); std::unique_ptr<Interval> curr1 = std::make_unique<Interval>(&new_point, promejutochny_otrezok->x2, &new_value, promejutochny_otrezok->y2, &N); const float currM = *curr->M > *curr1->M ? *curr->M : *curr1->M; const float len2 = *promejutochny_otrezok->x2 - new_point; const float len1 = new_point - *promejutochny_otrezok->x1; const size_t r_size = R.size(); // Обновление dmax — один проход без лишних вызовов if (len2 + len1 == dmax) { dmax = (len2 > len1 ? len2 : len1);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
const float len = *R[i]->x2 - *R[i]->x1;
if (len > dmax) dmax = len;
}
}

text
// ===== Перенастройка характеристик R (как в 1D-ветке, но для N=2) ===== if (mode) { if ((threshold_03 > dmax && fmodf(schetchick, 3.0f) == 0.0f) || (10.0f * dmax < initial_length)) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } const float progress = fmaf(-inv_threshold_03, dmax, 1.0f); const float alpha = fmaf(progress, progress, 1.0f); const float betta = 2.0f - alpha; const float MULTIPLIER = (1.0f / dmax) * Mmax; const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); const float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, *curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, *curr1->M * alpha));

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, *R[i]->x2 - *R[i]->x1, *R[i]->M * alpha));
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;

text
curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);

#pragma loop(ivdep)
for (size_t i = 0u; i < r_size; ++i) {
R[i]->ChangeCharacteristic(m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
// ===== Добавление подынтервалов и единоразовая перестройка кучи ===== R.emplace_back(std::move(curr)); R.emplace_back(std::move(curr1)); std::push_heap(R.begin(), R.end(), ComparePtr); // достаточно одного push_heap после двух вставок // Проверка критерия сходимости по «лучшему» интервалу const std::unique_ptr<Interval>& top = R.front(); const float interval_len = *top->x2 - *top->x1; if (interval_len < epsilon) { g_world->send(partner, 2, 0u); if (partner) { Extr.emplace_back(schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } // Обновляем x_Rmax и y_Rmax (константная ссылка на top, чтобы избежать копии) x_Rmax_1 = *top->x1; x_Rmax_2 = *top->x2; y_Rmax_1 = *top->y1; y_Rmax_2 = *top->y2; } = это двумерная ветка моего экспортируемого алгоритма extern "C" __declspec(dllexport) __declspec(noalias) void Base_LNA_1_2_Mer_AGP( const float N, const float global_iterations, const float a, const float b, const float c, const float d, const float r, const bool mode, const float epsilon, const float seed, float** out_data, uint64_t* out_len) noexcept, используемый функционал: #include "pch.h"

// Advances `state` (a float lvalue) in place to the next value of a cheap
// pseudo-random sequence built from fmodf/fmaf and two fractional-part steps.
// For a finite, non-negative input the new state lies in [0, 1).
// NOTE: `state` is evaluated more than once; pass a plain variable.
// The trailing backslashes are required: without them the macro expands to
// nothing and the do/while below ends up at file scope.
#define LCG_RAND(state) \
    do { \
        float reduced = fmodf((state), 1.618033988749895f); \
        reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f); \
        reduced -= (float)(int)(reduced); /* keep the fractional part */ \
        reduced = fmaf(reduced, reduced + 1.0f, 0.0f); \
        reduced -= (float)(int)(reduced); /* keep the fractional part */ \
        (state) = reduced; \
    } while (0)
// Same generator step as LCG_RAND, but the value written back to `state` is
// mapped with 2*v - 1, so for a finite, non-negative input the new state lies
// in [-1, 1).
// NOTE: `state` is evaluated more than once; pass a plain variable.
// The trailing backslashes are required for the body to belong to the macro.
#define LCG_RAND_GRSH(state) \
    do { \
        float reduced = fmodf((state), 1.618033988749895f); \
        reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f); \
        reduced -= (float)(int)(reduced); /* keep the fractional part */ \
        reduced = fmaf(reduced, reduced + 1.0f, 0.0f); \
        reduced -= (float)(int)(reduced); /* keep the fractional part */ \
        (state) = fmaf(reduced, 2.0f, -1.0f); \
    } while (0)
// Low-order polynomial approximation of cos(x), stored into `result_var`.
// |x| is reduced modulo 2*pi with fmodf, folded into [0, pi] and then into
// [0, pi/2]; the value is 1 - t^2/2 + t^4/24 on the folded argument, negated
// for the second quadrant.
// The trailing backslashes are required for the body to belong to the macro.
#define FABE13_COS(x, result_var) \
    do { \
        float abs_val = fabsf((x)); \
        float reduced = fmodf(abs_val, 6.28318530718f); \
        if (reduced > 3.14159265359f) { \
            reduced = 6.28318530718f - reduced; /* cos(2*pi - t) == cos(t) */ \
        } \
        if (reduced < 1.57079632679f) { \
            float val2 = reduced * reduced; \
            (result_var) = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f); \
        } else { \
            reduced = 3.14159265359f - reduced; /* cos(pi - t) == -cos(t) */ \
            float val2 = reduced * reduced; \
            (result_var) = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f); \
        } \
    } while (0)
// Approximates sin and cos of the `n` floats in `in`, writing them to
// `sin_out` and `cos_out`.
//
// Blocks of 16 elements are processed with AVX-512 intrinsics (only entered
// when n >= 16); the remaining elements use the scalar tail. The vector path
// uses aligned loads/stores, so all three arrays must be 64-byte aligned when
// n >= 16. The AVX-512 code is compiled into every expansion even if it is
// never executed.
//
// Range reduction: q = round(|x| / (2*pi)) via the 1.5*2^23 bias trick, then
// r = |x| - 2*pi*q. Because q is rounded to nearest, r may be negative; it is
// shifted by 2*pi into [0, 2*pi) before folding, which the folding logic
// (r > pi, r > pi/2) assumes. The sine sign is flipped when exactly one of
// "x < 0" and "r > pi" holds (XOR) in both the vector and the scalar path.
//
// The trailing backslashes are required for the body to belong to the macro.
#define FABE13_SINCOS(in, sin_out, cos_out, n) \
    do { \
        int i = 0; \
        int limit = (n) & ~15; \
        if ((n) >= 16) { \
            const __m512 VEC_TWOPI = _mm512_set1_ps(6.28318530718f); \
            const __m512 VEC_PI = _mm512_set1_ps(3.14159265359f); \
            const __m512 VEC_PI_2 = _mm512_set1_ps(1.57079632679f); \
            const __m512 INV_TWOPI = _mm512_set1_ps(0.15915494309189535f); \
            const __m512 BIAS = _mm512_set1_ps(12582912.0f); \
            const __m512 VEC_COS_P3 = _mm512_set1_ps(0.0416666667f); \
            const __m512 VEC_COS_P1 = _mm512_set1_ps(-0.5f); \
            const __m512 VEC_COS_P0 = _mm512_set1_ps(1.0f); \
            const __m512 VEC_SIN_P1 = _mm512_set1_ps(-0.16666666f); \
            const __m512 VEC_SIN_P0 = _mm512_set1_ps(1.0f); \
            const __m512 VEC_NEG1 = _mm512_set1_ps(-1.0f); \
            const __m512 VEC_ZERO = _mm512_setzero_ps(); \
            while (i < limit) { \
                __m512 vx = _mm512_load_ps(&(in)[i]); \
                __m512 vax = _mm512_abs_ps(vx); \
                __m512 q = _mm512_fmadd_ps(vax, INV_TWOPI, BIAS); \
                q = _mm512_sub_ps(q, BIAS); \
                __m512 r = _mm512_fnmadd_ps(VEC_TWOPI, q, vax); \
                /* bring a negative remainder back into [0, 2*pi) */ \
                r = _mm512_mask_add_ps(r, _mm512_cmp_ps_mask(r, VEC_ZERO, _CMP_LT_OQ), r, VEC_TWOPI); \
                __m512 r1 = _mm512_min_ps(r, _mm512_sub_ps(VEC_TWOPI, r)); \
                __m512 r2 = _mm512_min_ps(r1, _mm512_sub_ps(VEC_PI, r1)); \
                __mmask16 m_pi = _mm512_cmp_ps_mask(r, VEC_PI, _CMP_GT_OQ); \
                __mmask16 m_pi_2 = _mm512_cmp_ps_mask(r1, VEC_PI_2, _CMP_GT_OQ); \
                __mmask16 m_negx = _mm512_cmp_ps_mask(vx, VEC_ZERO, _CMP_LT_OQ); \
                /* XOR, matching the scalar tail: flip when exactly one condition holds */ \
                __mmask16 m_sinflip = _kxor_mask16(m_negx, m_pi); \
                __m512 t2 = _mm512_mul_ps(r2, r2); \
                __m512 vcos = _mm512_fmadd_ps(t2, _mm512_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0); \
                __m512 vsin = _mm512_mul_ps(_mm512_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2); \
                vcos = _mm512_mask_mul_ps(vcos, m_pi_2, vcos, VEC_NEG1); \
                vsin = _mm512_mask_mul_ps(vsin, m_sinflip, vsin, VEC_NEG1); \
                _mm512_store_ps(&(cos_out)[i], vcos); \
                _mm512_store_ps(&(sin_out)[i], vsin); \
                i += 16; \
            } \
        } \
        while (i < (n)) { \
            float x = (in)[i]; \
            float ax = fabsf(x); \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
            q -= 12582912.0f; \
            float r = fmaf(-6.28318530718f, q, ax); \
            if (r < 0.0f) r += 6.28318530718f; /* bring remainder into [0, 2*pi) */ \
            bool sflip = (r > 3.14159265359f); \
            if (sflip) r = 6.28318530718f - r; \
            bool cflip = (r > 1.57079632679f); \
            if (cflip) r = 3.14159265359f - r; \
            float t2 = r * r; \
            float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f); \
            float s = fmaf(t2, -0.16666666f, 1.0f) * r; \
            (cos_out)[i] = cflip ? -c : c; \
            (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
            ++i; \
        } \
    } while (0)

/// Evaluates a 1-D Shekel-type test function at `x`.
///
/// The result is the negated sum of ten terms 1 / (k * (x - c)^2 + w), where
/// c, k and w of every term are derived from three consecutive LCG_RAND draws
/// seeded with `seed` (c = 10*u1, k = 20*u2 + 5, w = 0.2*u3 + 1).
///
/// @param x     point at which the function is evaluated
/// @param seed  initial generator state; the same seed gives the same function
/// @return      function value
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
    float current_state = seed;
    float res = 0.0f;

    for (uint64_t i = 0u; i < 10u; ++i) {
        LCG_RAND(current_state);
        const float x_part = fmaf(-current_state, 10.0f, x);

        LCG_RAND(current_state);
        const float current_res = current_state;

        LCG_RAND(current_state);
        float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f), x_part * x_part,
                               fmaf(current_state, 0.2f, 1.0f));
        // Clamp the magnitude away from zero so the division below stays finite.
        delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
        res -= 1.0f / delimiter;
    }
    return res;
}

/// Evaluates the file's 2-D Rastrigin-style test function at (x1, x2).
///
/// With s = x1^2 + x2^2 and cosines of 2*pi*x1 and 2*pi*x2 approximated by
/// FABE13_COS, the value is (s - (10*(c1 + c2) - 14.6)) * (18.42 - 0.25*s).
///
/// @param x1  first coordinate
/// @param x2  second coordinate
/// @return    function value
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
    float cos_a, cos_b;
    FABE13_COS(6.28318530717958647692f * x1, cos_a);
    FABE13_COS(6.28318530717958647692f * x2, cos_b);

    const float sq_sum = fmaf(x1, x1, x2 * x2);
    const float oscillation = fmaf(cos_a + cos_b, 10.0f, -14.6f);
    const float envelope = fmaf(-sq_sum, 0.25f, 18.42f);
    return (sq_sum - oscillation) * envelope;
}

/// Evaluates a 1-D Hill-type test function (a 14-harmonic trigonometric sum).
///
/// Harmonic k (k = 1..14) uses the angle 2*pi*x*k. Coefficients are drawn from
/// LCG_RAND seeded with `seed` and mapped with 2*u - 1.1; the constant term is
/// drawn first, then the harmonics are consumed from the highest index to the
/// lowest (sine coefficient first, cosine coefficient second).
///
/// @param x     point at which the function is evaluated
/// @param seed  initial generator state
/// @return      function value
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
    __declspec(align(64u)) float angles[14u];
    const float start_angle = 6.28318530717958647692f * x;

#pragma loop(ivdep)
    for (int64_t j = 0; j < 14; ++j) {
        angles[j] = start_angle * (j + 1);
    }

    __declspec(align(64u)) float sin_vals[14u];
    __declspec(align(64u)) float cos_vals[14u];
    FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

    float current_state = seed;
    LCG_RAND(current_state);
    float res = fmaf(current_state, 2.0f, -1.1f);

    // Walk the harmonics from index 13 down to 0; the order fixes which random
    // draw is paired with which harmonic.
    for (int64_t j = 13; j >= 0; --j) {
        LCG_RAND(current_state);
        const float tmp_state = current_state;
        LCG_RAND(current_state);
        res += fmaf(fmaf(tmp_state, 2.0f, -1.1f), sin_vals[j],
                    fmaf(current_state, 2.0f, -1.1f) * cos_vals[j]);
    }
    return res;
}

/// Evaluates the file's 2-D Grishagin-style test function at (x1, x2).
///
/// Angles pi*k*x1 and pi*k*x2 (k = 1..8) are passed through FABE13_SINCOS.
/// For every pair (j, k), iterated with j from 7 down to 0 and k from 0 to 7,
/// four LCG_RAND_GRSH draws weight sin^2(pi*(j+1)*x1) and cos^2(pi*(k+1)*x2)
/// into two accumulators; the result is -sqrt(part1^2 + part2^2).
///
/// @param x1    first coordinate
/// @param x2    second coordinate
/// @param seed  initial generator state
/// @return      function value (never positive)
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
    __declspec(align(64u)) float angles_j[8u];
    __declspec(align(64u)) float angles_k[8u];

#pragma loop(ivdep)
    for (int64_t j = 0; j < 8; ++j) {
        const float pj_mult = 3.14159265358979323846f * (j + 1);
        angles_j[j] = pj_mult * x1;
        angles_k[j] = pj_mult * x2;
    }

    __declspec(align(64u)) float sin_j[8u], cos_j[8u];
    __declspec(align(64u)) float sin_k[8u], cos_k[8u];
    FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
    FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

    float part1 = 0.0f;
    float part2 = 0.0f;
    float current_state = seed;

    for (int64_t j = 7; j >= 0; --j) {
        // Depends only on j, so it is computed once per outer iteration.
        const float sin_term = sin_j[j] * sin_j[j];
        for (size_t k = 0u; k < 8u; ++k) {
            const float cos_term = cos_k[k] * cos_k[k];

            LCG_RAND_GRSH(current_state);
            float tmp_state = current_state;
            LCG_RAND_GRSH(current_state);
            part1 = fmaf(tmp_state, sin_term, fmaf(current_state, cos_term, part1));

            // tmp_state is advanced from its own previous value here,
            // independently of current_state.
            LCG_RAND_GRSH(tmp_state);
            LCG_RAND_GRSH(current_state);
            part2 = fmaf(-tmp_state, cos_term, fmaf(current_state, sin_term, part2));
        }
    }
    return -sqrtf(fmaf(part1, part1, part2 * part2));
}

/// Computes the next trial point for the interval [x1, x2].
///
/// With diff = y2 - y1:
///   - _N == 1: 0.5 * (x1 + x2 - diff / _m)
///   - _N == 2: 0.5 * (x1 + x2 -/+ _r * diff^2 / _m^2)
///   - otherwise: 0.5 * (x1 + x2 -/+ _r * powf(diff, _N) / powf(_m, _N))
/// where the correction term is subtracted when diff > 0 and added otherwise.
///
/// @param _m  slope estimate used to scale the correction (must be non-zero)
/// @param x1  left end of the interval
/// @param x2  right end of the interval
/// @param y1  function value at x1
/// @param y2  function value at x2
/// @param _N  problem dimension selecting the formula
/// @param _r  multiplier applied to the correction term for _N != 1
/// @return    the new trial point
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
    const float y2, const uint64_t _N, const float _r) noexcept
{
    const float diff = y2 - y1;
    const float x_sum = x1 + x2;

    if (_N == 1u) {
        return fmaf(-(1.0f / _m), diff, x_sum) * 0.5f;
    }

    float inv_m_pow;
    float diff_pow;
    if (_N == 2u) {
        // Plain multiplications instead of powf for the common 2-D case.
        inv_m_pow = 1.0f / (_m * _m);
        diff_pow = diff * diff;
    } else {
        inv_m_pow = 1.0f / powf(_m, _N);
        diff_pow = powf(diff, _N);
    }

    const float signed_inv = diff > 0.0f ? -inv_m_pow : inv_m_pow;
    return fmaf(signed_inv, diff_pow * _r, x_sum) * 0.5f;
}

/// One search interval together with its derived quantities.
///
/// Every field is a separately heap-allocated float owned by the object and
/// released in the destructor. Callers in this file dereference the fields
/// directly (for example `*iv->x2 - *iv->x1`), so the pointer members are part
/// of the interface. The type is non-copyable.
__declspec(align(64u)) struct Interval final {

public:
    const float* x1;        ///< left end of the interval
    const float* x2;        ///< right end of the interval
    const float* y1;        ///< function value at x1
    const float* y2;        ///< function value at x2
    const float* delta_y;   ///< y2 - y1
    const float* N_factor;  ///< (x2 - x1)^(1/N)
    const float* M;         ///< |delta_y| / N_factor
    float* R;               ///< characteristic; 0 until ChangeCharacteristic is called

    /// Copies the four endpoint values and precomputes delta_y, N_factor and M.
    /// N == 1 and N == 2 avoid powf (identity and sqrtf respectively).
    /// The initialisers rely on the member declaration order above.
    __declspec(noalias) __forceinline Interval(const float* const _x1, const float* const _x2,
        const float* const _y1, const float* const _y2, const float* const _N) noexcept
        : x1(new float(*_x1)),
          x2(new float(*_x2)),
          y1(new float(*_y1)),
          y2(new float(*_y2)),
          delta_y(new float(*y2 - *y1)),
          N_factor(new float(*_N == 1.0f ? *_x2 - *_x1
                             : *_N == 2.0f ? sqrtf(*_x2 - *_x1)
                                           : powf(*_x2 - *_x1, 1.0f / *_N))),
          M(new float(fabsf(*delta_y) * (1.0f / *N_factor))),
          R(new float(0.0f))
    {
    }

    /// Recomputes the characteristic for slope estimate `_m`:
    /// R = _m * N_factor + delta_y^2 / (_m * N_factor) - 2 * (y1 + y2).
    __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
    {
        *R = fmaf(
            -(*y2 + *y1),
            2.0f,
            fmaf(_m, *N_factor, (*delta_y * *delta_y) * (1.0f / (_m * *N_factor))));
    }

    ~Interval() noexcept
    {
        delete x1;
        delete x2;
        delete y1;
        delete y2;
        delete delta_y;
        delete N_factor;
        delete M;
        delete R;
    }

    Interval(const Interval&) = delete;
    Interval& operator=(const Interval&) = delete;
};

/// Strict-weak ordering on intervals by their characteristic R.
/// The file passes it to std::pop_heap / std::push_heap / std::make_heap, so
/// the interval with the largest R ends up at the front of the heap.
/// (The `const` that used to follow `&` was dropped: a cv-qualifier applied
/// directly to a reference is ill-formed and had no effect.)
static __declspec(noalias) __forceinline bool ComparePtr(const std::unique_ptr<Interval>& __restrict a, const std::unique_ptr<Interval>& __restrict b) noexcept
{
    return *a->R < *b->R;
}

/// Descends the curve tree to the node that contains curve parameter `x`.
///
/// At level i the remaining offset is scaled by 4^i / (b - a) and truncated to
/// a quadrant index `num`; the offset consumed by that quadrant is subtracted
/// and the child is chosen from `num` and the current node's Type.
///
/// @param x  curve parameter (the code subtracts `a` and divides by `b - a`,
///           so it is presumably expected to lie in [a, b] — TODO confirm)
/// @return   node reached after `razvertka` levels (`this` when razvertka is 0)
const __declspec(noalias) __forceinline PeanoCurve_2D* PeanoCurve_2D::HitTest_2D(float x) const noexcept
{
    const int _razvertka = this->razvertka;
    const float this_a = this->a;
    x -= this_a;
    const float b_minus_a = this->b - this_a;
    const float inv_b_minus_a = 1.0f / b_minus_a;
    const PeanoCurve_2D* Curr = this;

    int i = 0;
    while (i != _razvertka) {
        // Increment first, then use the new level: the former `1u << ++i + i`
        // read and modified `i` without sequencing (undefined behaviour).
        ++i;
        const int shift = 1u << (i << 1);  // 4^i
        const int num = shift * x * inv_b_minus_a;
        x = fmaf(-ldexpf(1.0f, -(i << 1)) * num, b_minus_a, x);

        const List currType = Curr->Type;
        switch (num) {
        case 0u:
            Curr = (currType == Top || currType == Right) ? Curr->DownLeft.get() : Curr->TopRight.get();
            break;
        case 1u:
            Curr = (currType == Top || currType == Left) ? Curr->TopLeft.get() : Curr->DownRight.get();
            break;
        case 2u:
            Curr = (currType == Top || currType == Right) ? Curr->TopRight.get() : Curr->DownLeft.get();
            break;
        default:
            Curr = (currType == Top || currType == Left) ? Curr->DownRight.get() : Curr->TopLeft.get();
        }
    }
    return Curr;
}

/// Inverse lookup: accumulates the curve parameter of the cell that contains
/// the point (target_x1, target_x2).
///
/// Starting from `a`, each level compares the target with the current node's
/// (x1, x2), steps into the matching child and adds 0, 1/4, 1/2 or 3/4 of the
/// level's span (b - a) / 4^level, depending on the child and the node's Type.
///
/// @param target_x1  first coordinate of the point
/// @param target_x2  second coordinate of the point
/// @return           accumulated curve parameter after `razvertka` levels
const __declspec(noalias) __forceinline float PeanoCurve_2D::FindX_2D(const float target_x1, const float target_x2) const noexcept
{
    int _razvertka = this->razvertka;
    const int _razvertka1 = _razvertka;
    float x = this->a;
    const float b_minus_a = this->b - x;
    const PeanoCurve_2D* Curr = this;

    while (_razvertka != 0u) {
        // 2 * (levels already processed); parenthesised because `-` binds
        // tighter than `<<`.
        const int exponent = (_razvertka1 - _razvertka--) << 1u;
        const float level_scale = ldexpf(1.0f, -exponent);
        const float x1 = Curr->x1;
        const float x2 = Curr->x2;
        const List currType = Curr->Type;

        if (target_x1 > x1 && target_x2 > x2) {
            Curr = Curr->TopRight.get();
            if (currType == Top || currType == Right) {
                x = fmaf(level_scale * 0.5f, b_minus_a, x);
            }
        }
        else if (target_x1 < x1 && target_x2 > x2) {
            Curr = Curr->TopLeft.get();
            if (currType == Top || currType == Left) {
                x = fmaf(level_scale * 0.25f, b_minus_a, x);
            }
            else {
                x = fmaf(level_scale * 0.75f, b_minus_a, x);
            }
        }
        else if (target_x1 < x1 && target_x2 < x2) {
            Curr = Curr->DownLeft.get();
            if (currType == Down || currType == Left) {
                x = fmaf(level_scale * 0.5f, b_minus_a, x);
            }
        }
        else {
            Curr = Curr->DownRight.get();
            if (currType == Top || currType == Left) {
                x = fmaf(level_scale * 0.75f, b_minus_a, x);
            }
            else {
                x = fmaf(level_scale * 0.25f, b_minus_a, x);
            }
        }
    }
    return x;
}

static std::unique_ptrboost::mpi::environment g_env;
static std::unique_ptrboost::mpi::communicator g_world;
static std::unique_ptr<PeanoCurve_2D> Curve, Curve_Inverted; extern "C" __declspec(dllexport) int AgpInit(int peanoLevel, float a, float b, float c, float d) {
if (!g_env) g_env = std::make_uniqueboost::mpi::environment(); // MPI_Init()
if (!g_world) g_world = std::make_uniqueboost::mpi::communicator(); // MPI_COMM_WORLD
int rank = g_world->rank();
if (rank == 0) Curve = std::make_unique<PeanoCurve_2D>(List::Top, peanoLevel, a, b, c, d);
else if (rank == 1) Curve_Inverted = std::make_unique<PeanoCurve_2D>(List::Down, peanoLevel, a, b, c, d);
return rank;
}

/// Releases the curves and the MPI objects created by AgpInit.
/// The communicator is released before the environment.
/// NOTE(review): the pasted body was cut off after `g`; the final
/// `g_env.reset();` and the closing brace are reconstructed — confirm against
/// the real source file.
extern "C" __declspec(dllexport) void AgpFinalize() {
    Curve_Inverted.reset();
    Curve.reset();
    g_world.reset();
    g_env.reset();
}
/// Frees a result buffer with CoTaskMemFree; intended for buffers returned
/// through `out_data` of Base_LNA_1_2_Mer_AGP, which are allocated with
/// CoTaskMemAlloc.
/// @param p  buffer to release
extern "C" __declspec(dllexport) void Base_LNA_1_2_Mer_AGP_Free(float* p)
{
    CoTaskMemFree(p);
}

// In the DLL
/// Broadcasts the integer 1 from root 0 over MPI_COMM_WORLD; processes blocked
/// in AgpWaitStartAndRun receive it as their start signal.
extern "C" __declspec(dllexport) void AgpStartWorkers()
{
    int start_flag = 1;
    MPI_Bcast(&start_flag, 1, MPI_INT, 0, MPI_COMM_WORLD);
}

/// Worker-side entry point: blocks in MPI_Bcast until root 0 broadcasts a
/// value and, if that value is 1, runs Base_LNA_1_2_Mer_AGP with the fixed
/// parameter set below and frees any buffer it returns.
extern "C" __declspec(dllexport) void AgpWaitStartAndRun() {
    int go = 0;
    MPI_Bcast(&go, 1, MPI_INT, /*root=*/0, MPI_COMM_WORLD); // blocks until the master broadcasts
    if (go == 1) {
        float* dummy = nullptr;
        uint64_t dummy_len = 0;
        Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
            2.5f, false, 0.001f, static_cast<float>(GetTickCount()),
            &dummy, &dummy_len);
        if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy);
    }
}, код MyForm.cpp: [STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpFinalize = (PFin)GetProcAddress(h, "AgpFinalize");
if (!h || !AgpInit || !AgpFinalize) { MessageBox::Show(L"Не удалось инициализировать DLL/MPI"); return 1; }

text
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f); if (rank == 0) { Application::EnableVisualStyles(); Application::SetCompatibleTextRenderingDefault(false); Application::Run(gcnew TESTAGP::MyForm(h)); } else { // Воркер-процесс: ждёт команды "старт" и выполняет расчёт без UI auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun"); if (AgpWaitStartAndRun) AgpWaitStartAndRun(); // см. пункт D } AgpFinalize(); FreeLibrary(h); return 0;

}, MyForm.h: ref class MyForm : public System::Windows::Forms::Form {
public:
MyForm(HMODULE hLib) : hLib(hLib) { // Передаем дескриптор из main
InitializeComponent();
// Загружаем функции из DLL
f = (agp_c)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP");
ffree = (agp_free)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP_Free");
pStart = (start_workers)GetProcAddress(hLib, "AgpStartWorkers");
}

text
protected: ~MyForm() { delete components; } private: HINSTANCE hLib = nullptr; typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, uint64_t*); typedef void(__cdecl* agp_free)(float*); typedef void(__cdecl* start_workers)(); agp_c f = nullptr; agp_free ffree = nullptr; start_workers pStart = nullptr; System::Windows::Forms::Button^ button1; System::Windows::Forms::TextBox^ textBox2; System::Windows::Forms::DataVisualization::Charting::Chart^ chart2; System::Windows::Forms::Label^ label2; System::Windows::Forms::Label^ label3; System::Windows::Forms::TextBox^ textBox1; System::Windows::Forms::TextBox^ textBox3; System::Windows::Forms::TextBox^ textBox4; System::Windows::Forms::TextBox^ textBox5; System::Windows::Forms::TextBox^ textBox6; System::Windows::Forms::Label^ label6; System::Windows::Forms::Label^ label7; System::Windows::Forms::Label^ label8; System::Windows::Forms::Label^ label9; System::Windows::Forms::Label^ label10; System::Windows::Forms::Label^ label1; System::Windows::Forms::TextBox^ textBox7; System::Windows::Forms::TextBox^ textBox8; System::ComponentModel::Container^ components; System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) { chart2->Series[0]->Points->Clear(); chart2->Series[1]->Points->Clear(); chart2->Series[2]->Points->Clear(); chart2->Series[3]->Points->Clear(); static LARGE_INTEGER start, end; QueryPerformanceCounter(&start); const float seed = GetTickCount(); float* buf = nullptr; uint64_t len = 0u; if (pStart) pStart(); // разбудили rank != 0 f(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, seed, &buf, &len); if (buf != nullptr) { std::vector<float> Extr_2D(buf, buf + len); ffree(buf); // Для 2D ветки: извлекаем schetchick и interval_len textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick Extr_2D.pop_back(); textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len Extr_2D.pop_back(); // Извлекаем последнюю тройку (x2, 
x1, f) float x2 = Extr_2D.back(); Extr_2D.pop_back(); float x1 = Extr_2D.back(); Extr_2D.pop_back(); float f_val = Extr_2D.back(); Extr_2D.pop_back(); // Выводим координаты и значение функции textBox4->Text = Convert::ToString(x1); textBox3->Text = Convert::ToString(x2); textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции // Отображаем точку на графике chart2->Series[2]->Points->AddXY(x1, x2); // Обрабатываем остальные точки (в порядке: f, x1, x2) while (!Extr_2D.empty()) { float x2_point = Extr_2D.back(); Extr_2D.pop_back(); float x1_point = Extr_2D.back(); Extr_2D.pop_back(); float f_point = Extr_2D.back(); Extr_2D.pop_back(); // Добавляем точку на график (только координаты) chart2->Series[0]->Points->AddXY(x1_point, x2_point); } } QueryPerformanceCounter(&end); LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); const uint64_t multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL; textBox5->Text = Convert::ToString( ((end.QuadPart - start.QuadPart) * multiplier) >> 32 ) + " microseconds"; когда запускаю приложение через mpiexec : C:\Users\maxim>mpiexec -n 2 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.Runtime.InteropServices.SEHException: Внешний компонент создал исключение.
в main() в D:\TEST_AGP\MyForm.cpp:строка 28
в mainCRTStartup()

job aborted:
[ranks] message

[0] terminated

[1] process exited without calling finalize

---- error analysis -----

[1] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc000001d

---- error analysis ----- то есть у меня есть dll библиотека которую использует управляемое приложение, и библиотека и управляемое приложение используют mpi - подскажи в чём может быть проблема

Мысль

Diagnosing MPI app crash

I’m investigating an issue with an MPI app using Boost.MPI and MPI_Bcast, where a SEHException with exit code 0xc000001d occurred. This suggests a CPU instruction issue — possibly using AVX-512 intrinsics on a system that doesn’t support them. It seems there’s no runtime check for AVX-512 in the code, leading to crashes.

Understanding AVX-512 crash behavior

When the code is compiled with AVX-512 intrinsics, running it on a CPU without AVX-512 support can cause an illegal instruction error, but only if the code path with n>=16 is executed. For smaller values of n (like 8 or 14), the AVX-512 block is skipped, so the error is prevented.

Investigating calling convention mismatch

In some cases, the value of 'n' is small, but in functions like FABE13_SINCOS, it’s 14, which should be safe. Still, the crash (0xc000001d) might come from mismatches between calling conventions (__vectorcall vs. __cdecl) or function pointer issues, possibly in how GetProcAddress deals with them. I'm counting the arguments in the typedef, which lists 10 floats, but the code passes 11.

Checking function arguments

The function has 9 float arguments and 1 bool, matching the typedef that specifies 9 floats after the bool. I was unsure for a moment, but counting carefully, it all adds up. Now, the exception happens in main at line 28. I’ll need to investigate what’s happening on that line.

Investigating the crash

The crash with exit code 0xc000001d likely stems from using AVX-512 intrinsics on a CPU that doesn't support it. While AVX-512 might not be used directly in the function, the compiler could still include them. Adding CPU feature detection or removing AVX-512 intrinsics could resolve this, providing a fallback to AVX2 or scalar code.

Identifying possible crash causes

Several possibilities could be causing the crash: misuse of AVX-512 instructions, mismatched calling conventions, mixing MPI runtime versions, or mishandling memory allocation. The likely culprit is an illegal instruction due to AVX-512 usage, especially with mismatched CPU or OS support. I recommend avoiding AVX-512, adding runtime checking, and ensuring appropriate MPI handling for the OS.

Diagnosing illegal instruction error

It looks like the likely cause of the error is the illegal instruction related to AVX-512. The user should disable AVX-512 compilation flags, check their CPU capabilities using a CPUID check, and ensure the MPI setup is correct. A good test would be to try running without mpiexec or to isolate the crash by calling AgpInit alone.

Thought for 2m 23s

Ответ

Код падает с 0xC000001D — это STATUS_ILLEGAL_INSTRUCTION (“недопустимая инструкция”). В вашем проекте есть очень вероятная причина: вы используете AVX-512-интринсики (_mm512_*) в FABE13_SINCOS. Если процессор или ОС не поддерживают AVX-512 (а в клиентских Windows AVX-512 часто отключён даже на CPU, которые его умеют), любое выполнение такой инструкции приводит ровно к этому исключению.

Даже если у вас n < 16 и вы “логически” не заходите в AVX-512-ветку, при сборке всего TU с /arch:AVX512 компилятор может авто-векторизовать другие циклы под zmm-регистры. В итоге падение может случаться сразу при первом заходе в DLL (как у вас — на AgpInit(...) в обоих процессах под mpiexec).

Как быстро подтвердить гипотезу

  1. Пересоберите DLL (и EXE, если там те же TU) без AVX-512:
    MSVC: Project Properties → C/C++ → Code Generation → Enable Enhanced Instruction Set → «No Enhanced Instructions» или «AVX2» (/arch:AVX2).
    Либо просто удалите/закомментируйте AVX-512-интринсики и пересоберите.
  2. Запустите снова:
    bat
    mpiexec -n 2 D:\TEST_AGP\x64\Release\TEST_AGP.exe
    Если падение исчезло — причина именно в AVX-512.

Правильная фиксация

Сделайте рантайм-диспетчеризацию и оставьте по умолчанию скаляр/AVX2.

cpp
#include <immintrin.h> #include <intrin.h> static bool has_avx512() { int info[4]; __cpuidex(info, 0, 0); if (info[0] < 7) return false; int info7[4]; __cpuidex(info7, 7, 0); bool avx512f = (info7[1] & (1 << 16)) != 0; // AVX-512F unsigned long long xcr0 = _xgetbv(0); bool os_ymm = (xcr0 & 0x6) == 0x6; // XMM+YMM bool os_zmm = (xcr0 & (1ULL<<5 | 1ULL<<6 | 1ULL<<7)) == ( (1ULL<<5) | (1ULL<<6) | (1ULL<<7) ); // ZMM opmask/ZMM return avx512f && os_ymm && os_zmm; }

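И в FABE13_SINCOS включайте AVX-512-ветку только если has_avx512() && (n >= 16), иначе используйте уже имеющуюся скалярную часть. Идеально — вынести AVX-512-реализацию в отдельный .cpp, собрать только его с /arch:AVX512, а основной код — с /arch:AVX2. Если вы собираете проект компилятором Intel C++ или clang-cl, можно ещё и так (сам MSVC cl.exe атрибуты cpu_dispatch/cpu_specific не поддерживает; имена целей ниже условные — берите их из списка вашего компилятора, например skylake_avx512, core_4th_gen_avx, generic):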

cpp
__declspec(cpu_specific(avx512f)) void FABE13_SINCOS_impl_avx512(...); __declspec(cpu_specific(avx2)) void FABE13_SINCOS_impl_avx2(...); __declspec(cpu_specific(sse2)) void FABE13_SINCOS_impl_sse2(...); __declspec(cpu_dispatch(avx512f, avx2, sse2)) void FABE13_SINCOS(...);

Ещё что проверить (вдобавок к AVX-512)

  • Сигнатуры экспортов. Для Base_LNA_1_2_Mer_AGP у вас extern "C" __declspec(dllexport) void ..., значит имя без манглинга. Тип в форме должен соответствовать ровно:
    cpp
    typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, uint64_t*);
    (9 float + bool до двух указателей). Если случайно указали __stdcall или иное количество аргументов — будет краш.
  • AgpFinalize: убедитесь, что там действительно вызывается g_env.reset(); (чтобы сделать MPI_Finalize()), а не оборванный код — в вашем фрагменте строка с g_ обрезана.
  • Смешение Boost.MPI и «чистого» MPI — допустимо, но используйте один и тот же рантайм (MS-MPI) и одинаковый коммуникатор (MPI_COMM_WORLD у вас в порядке). Последовательность у вас правильная: AgpInit() → environment (MPI_Init), затем в воркере AgpWaitStartAndRun() блокируется на MPI_Bcast, мастер будит через AgpStartWorkers().

Мини-чек-лист

  1. Пересобрать без AVX-512 → проверить.
  2. Если всё ОК — вернуть AVX-512 только с рантайм-проверкой и/или отдельным TU.
  3. Дважды проверить соответствие typedef/GetProcAddress и extern "C".
  4. В AgpFinalize() гарантировать вызов MPI_Finalize через g_env.reset().

С высокой вероятностью корень проблемы — именно AVX-512. После отключения или грамотной диспетчеризации код под mpiexec -n 2 должен перестать падать.

Поделиться этим Q&A