C++/CLR MyForm.h #pragma once #define WIN32_LEAN_A...

Question

C++/CLR MyForm.h #pragma once
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <stdint.h>

using namespace System;
using namespace System::Drawing;
using namespace System::Windows::Forms;
using namespace System::Collections::Generic;
using namespace System::Drawing::Drawing2D;

typedef void(__cdecl* P_MANIP)(int, bool, float, float, float, int, int, float, bool, float, unsigned int, float**, size_t*, float*, float*, float*);
typedef void(__cdecl* P_FREE)(float*);
typedef void(__cdecl* P_START)(int, bool, float, float, float, int, int, float, bool, float, unsigned int);

namespace TESTAGP {
public ref class MyForm : public Form {
public:
MyForm(HMODULE hLib) : hLib(hLib) {
this->SetStyle(ControlStyles::AllPaintingInWmPaint | ControlStyles::UserPaint | ControlStyles::OptimizedDoubleBuffer, true);
this->Text = L"AGP Manipulator 2D"; this->ClientSize = System::Drawing::Size(1000, 700);
this->Resize += gcnew EventHandler(this, &MyForm::OnResize);
fManip = (P_MANIP)GetProcAddress(hLib, "AGP_Manip2D");
pFree = (P_FREE)GetProcAddress(hLib, "AGP_Free");
pStart = (P_START)GetProcAddress(hLib, "AgpStartManipND");
angles = gcnew List<float>(0.0f);
lengths = gcnew List<float>(0.0f);
InitUI();
ResetRandomConfig();
}
protected:
~MyForm() {}
private:
HMODULE hLib; P_MANIP fManip; P_FREE pFree; P_START pStart;
int nSegments; bool variableLengths; List<float>^ angles; List<float>^ lengths;
CheckBox^ cbVarLen; NumericUpDown^ nudMinTheta, ^ nudBaseLength, ^ nudStretchFactor, ^ nudTargetX, ^ nudTargetY, ^ nudLevels, ^ nudMaxIter;
CheckBox^ cbAdaptive; NumericUpDown^ nudR, ^ nudEps; Button^ btnAdd, ^ btnRem, ^ btnOptimize; Label^ lblInfo;
System::UInt32 rngState = 0xA5C39E0Du;

text
	void WireInvalidate(Control^ c) {
		// чтобы после любых изменений интерфейса можно было сразу жать «Оптимизировать»
		if (dynamic_cast<NumericUpDown^>(c)) ((NumericUpDown^)c)->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		else if (dynamic_cast<CheckBox^>(c)) ((CheckBox^)c)->CheckedChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
	}

	void InitUI() {
		int y = 10, w = 180, h = 24, pad = 8, currentX = 10;
		Label^ L;

		// Мин угол (рад) — по умолчанию 1.5 рад (было 0.5)
		L = gcnew Label(); L->Text = L"Мин. угол (рад)"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudMinTheta = gcnew NumericUpDown(); nudMinTheta->Location = Point(currentX, y + h + 2); nudMinTheta->Width = w;
		nudMinTheta->DecimalPlaces = 3; nudMinTheta->Minimum = (Decimal)0.01; nudMinTheta->Maximum = (Decimal)3.14159; nudMinTheta->Value = (Decimal)1.5; this->Controls->Add(nudMinTheta);
		WireInvalidate(nudMinTheta);

		// Базовая длина — диапазон [0.5; 2.0]
		currentX += w + 20; L = gcnew Label(); L->Text = L"Базовая длина"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudBaseLength = gcnew NumericUpDown(); nudBaseLength->Location = Point(currentX, y + h + 2); nudBaseLength->Width = w;
		nudBaseLength->DecimalPlaces = 2; nudBaseLength->Minimum = (Decimal)0.50; nudBaseLength->Maximum = (Decimal)2.00; nudBaseLength->Value = (Decimal)1.00; this->Controls->Add(nudBaseLength);
		WireInvalidate(nudBaseLength);

		// Коэф. растяжения — [1.0; 1.5] с мелким шагом
		currentX += w + 20; L = gcnew Label(); L->Text = L"Коэф. растяжения"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudStretchFactor = gcnew NumericUpDown(); nudStretchFactor->Location = Point(currentX, y + h + 2); nudStretchFactor->Width = w;
		nudStretchFactor->DecimalPlaces = 2; nudStretchFactor->Minimum = (Decimal)1.00; nudStretchFactor->Maximum = (Decimal)1.50;
		nudStretchFactor->Increment = (Decimal)0.01; nudStretchFactor->Value = (Decimal)1.50; this->Controls->Add(nudStretchFactor);
		WireInvalidate(nudStretchFactor);

		// Переменные длины (флаг)
		currentX += w + 20; cbVarLen = gcnew CheckBox();
		cbVarLen->Text = L"Переменные длины"; cbVarLen->Location = Point(currentX, y + h + 2);
		cbVarLen->Width = w; cbVarLen->Checked = false; this->Controls->Add(cbVarLen);
		WireInvalidate(cbVarLen);

		// Следующая строка
		currentX = 10; y += h * 2 + pad + 10;

		// Цель X/Y
		L = gcnew Label(); L->Text = L"Цель X"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudTargetX = gcnew NumericUpDown(); nudTargetX->Location = Point(currentX, y + h + 2); nudTargetX->Width = w;
		nudTargetX->DecimalPlaces = 2; nudTargetX->Minimum = (Decimal)-10.0; nudTargetX->Maximum = (Decimal)10.0; nudTargetX->Value = (Decimal)3.5; this->Controls->Add(nudTargetX);
		WireInvalidate(nudTargetX);

		currentX += w + 20; L = gcnew Label(); L->Text = L"Цель Y"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudTargetY = gcnew NumericUpDown(); nudTargetY->Location = Point(currentX, y + h + 2); nudTargetY->Width = w;
		nudTargetY->DecimalPlaces = 2; nudTargetY->Minimum = (Decimal)-10.0; nudTargetY->Maximum = (Decimal)10.0; nudTargetY->Value = (Decimal)1.0; this->Controls->Add(nudTargetY);
		WireInvalidate(nudTargetY);

		// Глубина — минимум 7, максимум 20 (как просили)
		currentX += w + 20; L = gcnew Label(); L->Text = L"Глубина"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudLevels = gcnew NumericUpDown(); nudLevels->Location = Point(currentX, y + h + 2); nudLevels->Width = w;
		nudLevels->Minimum = 7; nudLevels->Maximum = 20; nudLevels->Value = 12; this->Controls->Add(nudLevels);
		WireInvalidate(nudLevels);

		// Надёжность r — [1.0; 20.0], по умолчанию 2.5
		currentX += w + 20; L = gcnew Label(); L->Text = L"Надежность (r)"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudR = gcnew NumericUpDown(); nudR->Location = Point(currentX, y + h + 2); nudR->Width = w;
		nudR->DecimalPlaces = 2; nudR->Minimum = (Decimal)1.00; nudR->Maximum = (Decimal)20.00; nudR->Value = (Decimal)2.50; this->Controls->Add(nudR);
		WireInvalidate(nudR);

		// Адаптивная схема (флаг)
		currentX += w + 20; cbAdaptive = gcnew CheckBox();
		cbAdaptive->Text = L"Адаптивная"; cbAdaptive->Location = Point(currentX, y + h + 2);
		cbAdaptive->Width = w; cbAdaptive->Checked = true; this->Controls->Add(cbAdaptive);
		WireInvalidate(cbAdaptive);

		// Следующая строка
		y += h * 2 + pad + 10; currentX = 10;

		// Точность — по умолчанию 0,00001; минимум 1e-9; максимум 0.1
		L = gcnew Label(); L->Text = L"Точность"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudEps = gcnew NumericUpDown(); nudEps->Location = Point(currentX, y + h + 2); nudEps->Width = w;
		// ВАЖНО: ставим 9 знаков, чтобы можно было выставить 1e-9; значение по умолчанию 0.00001
		nudEps->DecimalPlaces = 9; nudEps->Minimum = (Decimal)0.000000001; nudEps->Maximum = (Decimal)0.1; nudEps->Value = (Decimal)0.00001;
		// Примечание: WinForms будет показывать с 9 знаками; это нормально функционально.
		this->Controls->Add(nudEps);
		WireInvalidate(nudEps);

		// Макс. итераций — по умолчанию оставить 1000, но максимум поднять до 500000
		currentX += w + 20; L = gcnew Label(); L->Text = L"Макс. итераций"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
		nudMaxIter = gcnew NumericUpDown(); nudMaxIter->Location = Point(currentX, y + h + 2); nudMaxIter->Width = w;
		nudMaxIter->Minimum = 10; nudMaxIter->Maximum = 500000; nudMaxIter->Value = 1000; this->Controls->Add(nudMaxIter);
		WireInvalidate(nudMaxIter);

		// Кнопки и инфо
		currentX += 200; btnAdd = gcnew Button(); btnAdd->Text = L"+ Звено"; btnAdd->Location = Point(currentX, y + h + 2); btnAdd->Width = 80;
		btnAdd->Click += gcnew EventHandler(this, &MyForm::OnAddClick); this->Controls->Add(btnAdd);

		currentX += 85; btnRem = gcnew Button(); btnRem->Text = L"- Звено"; btnRem->Location = Point(currentX, y + h + 2); btnRem->Width = 80;
		btnRem->Click += gcnew EventHandler(this, &MyForm::OnRemClick); this->Controls->Add(btnRem);

		currentX += 125; btnOptimize = gcnew Button(); btnOptimize->Text = L"Оптимизировать"; btnOptimize->Location = Point(currentX, y + h + 2); btnOptimize->Width = 120;
		btnOptimize->Click += gcnew EventHandler(this, &MyForm::OnOptimizeClick); this->Controls->Add(btnOptimize);

		currentX += 125; lblInfo = gcnew Label(); lblInfo->Location = Point(currentX, y); lblInfo->Size = System::Drawing::Size(250, 60);
		lblInfo->BorderStyle = BorderStyle::FixedSingle; lblInfo->Text = L"Готов"; this->Controls->Add(lblInfo);
	}

	void ResetRandomConfig() {
		nSegments = 1;
		angles->Clear(); lengths->Clear();
		// По умолчанию первый сегмент смотрит строго вверх (π/2)
		angles->Add((float)(3.14159265358979323846 / 2.0));
		lengths->Add((float)nudBaseLength->Value);
		variableLengths = false;
		this->Invalidate();
	}

	float Rand01() { rngState ^= rngState << 13; rngState ^= rngState >> 17; rngState ^= rngState << 5; return (float)((double)(unsigned)rngState / 4294967296.0); }
	float RandAngle() { return (Rand01() * 6.28318530717958647692f) - 3.14159265358979323846f; }

	System::Void OnResize(System::Object^, System::EventArgs^) { this->Invalidate(); }
	System::Void OnAnyChanged(System::Object^, System::EventArgs^) { this->Invalidate(); }

	System::Void OnAddClick(System::Object^, System::EventArgs^) {
		++nSegments;
		angles->Add(RandAngle());
		lengths->Add((float)nudBaseLength->Value);
		this->Invalidate();
	}
	System::Void OnRemClick(System::Object^, System::EventArgs^) {
		if (nSegments > 1) { --nSegments; angles->RemoveAt(angles->Count - 1); lengths->RemoveAt(lengths->Count - 1); this->Invalidate(); }
	}

	System::Void OnOptimizeClick(System::Object^, System::EventArgs^) {
		if (!fManip) return;
		variableLengths = cbVarLen->Checked;
		float minTheta = (float)nudMinTheta->Value;
		float tx = (float)nudTargetX->Value, ty = (float)nudTargetY->Value;
		int levels = (int)nudLevels->Value, maxIter = (int)nudMaxIter->Value;
		bool adaptive = cbAdaptive->Checked;
		float r_param = (float)nudR->Value, eps = (float)nudEps->Value;
		unsigned int seed = (unsigned)GetTickCount();

		if (pStart) pStart(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed);

		LARGE_INTEGER t0, t1, fq; QueryPerformanceCounter(&t0);
		float* bestQ = nullptr; size_t bestQLen = 0; float bestX = 0, bestY = 0, bestF = 0;
		fManip(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed,
			&bestQ, &bestQLen, &bestX, &bestY, &bestF);
		QueryPerformanceCounter(&t1); QueryPerformanceFrequency(&fq);
		double micros = 1e6 * (double)(t1.QuadPart - t0.QuadPart) / (double)fq.QuadPart;

		if (bestQ && bestQLen >= (size_t)nSegments) {
			angles->Clear(); for (int i = 0; i < nSegments; ++i) angles->Add(bestQ[i]);
			if (variableLengths) { lengths->Clear(); for (int i = 0; i < nSegments; ++i) lengths->Add(bestQ[nSegments + i]); }
			else { lengths->Clear(); for (int i = 0; i < nSegments; ++i) lengths->Add((float)nudBaseLength->Value); }
			if (pFree) pFree(bestQ);
		}
		lblInfo->Text = String::Format(L"Результат:\nЦель: {0:F5}\nТочка: ({1:F3}, {2:F3})\nВремя: {3:F0} мкс", bestF, bestX, bestY, micros);
		this->Invalidate();
	}

protected:
	virtual void OnPaint(PaintEventArgs^ e) override {
		Form::OnPaint(e); Graphics^ g = e->Graphics; g->SmoothingMode = System::Drawing::Drawing2D::SmoothingMode::HighQuality; g->Clear(this->BackColor);
		int topOffset = 150; System::Drawing::Rectangle drawArea = System::Drawing::Rectangle(0, topOffset, this->ClientSize.Width, this->ClientSize.Height - topOffset);
		g->FillRectangle(Brushes::White, drawArea); int leftWallX = drawArea.Left + this->ClientSize.Width * 25 / 100;
		Pen^ wallPen = gcnew Pen(Color::Black, 2); g->DrawLine(wallPen, leftWallX, drawArea.Top, leftWallX, drawArea.Bottom);
		HatchBrush^ hatchBrush = gcnew HatchBrush(HatchStyle::BackwardDiagonal, Color::LightGray, Color::White); int leftHatchWidth = 100;
		g->FillRectangle(hatchBrush, leftWallX - leftHatchWidth, drawArea.Top, leftHatchWidth, drawArea.Height);
		float targetX = (float)nudTargetX->Value; float targetY = (float)nudTargetY->Value; float scale = 160.0f; int baseX = leftWallX; int baseY = drawArea.Top + drawArea.Height / 2;
		float pixelTargetX = baseX + targetX * scale; float pixelTargetY = baseY - targetY * scale; int rightWallX = (int)(pixelTargetX + 8); rightWallX = Math::Min(rightWallX, drawArea.Right - 10);
		Pen^ dashedPen = gcnew Pen(Color::Black, 2); dashedPen->DashStyle = DashStyle::Dash; g->DrawLine(dashedPen, rightWallX, drawArea.Top, rightWallX, drawArea.Bottom);
		int rightHatchWidth = leftHatchWidth; g->FillRectangle(hatchBrush, rightWallX, drawArea.Top, rightHatchWidth, drawArea.Height);
		Pen^ targetPen = gcnew Pen(Color::Green, 1.5f); targetPen->DashStyle = DashStyle::Dot; g->DrawEllipse(targetPen, pixelTargetX - 8.0f, pixelTargetY - 8.0f, 16.0f, 16.0f);
		cli::array<PointF>^ pts = gcnew cli::array<PointF>(nSegments + 1); pts[0] = PointF((float)baseX, (float)baseY); float x = 0.0f, y = 0.0f, phi = 0.0f;
		for (int i = 0; i < nSegments; ++i) {
			array<float>^ anglesArray = angles->ToArray(); array<float>^ lengthsArray = lengths->ToArray();
			float theta = anglesArray[i]; float L = lengthsArray[i]; phi += theta; x += L * (float)Math::Cos((double)phi); y += L * (float)Math::Sin((double)phi);
			pts[i + 1] = PointF(baseX + x * scale, baseY - y * scale);
		}
		Pen^ penRod = gcnew Pen(Color::Red, 6.0f); for (int i = 0; i < nSegments; ++i) g->DrawLine(penRod, pts[i], pts[i + 1]);
		SolidBrush^ brJoint = gcnew SolidBrush(Color::Blue); const float R = 6.0f; for (int i = 0; i <= nSegments; ++i) g->FillEllipse(brJoint, pts[i].X - R, pts[i].Y - R, 2 * R, 2 * R);
		delete wallPen; delete dashedPen; delete targetPen; delete penRod; delete brJoint; delete hatchBrush;
	}
};

} MyForm.cpp: // MyForm.cpp — точка входа GUI, без функциональных изменений, просто подключение DLL и запуск формы

#include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");

text
// Инициализация: глубина 12 как и была
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

if (!rank) {
	Application::EnableVisualStyles();
	Application::SetCompatibleTextRenderingDefault(false);
	Application::Run(gcnew TESTAGP::MyForm(h));
}
else {
	AgpWaitStartAndRun();
}
return 0;

} код .cpp файла dll: // agp_algo.cpp
// Полностью обновлённый файл DLL с учётом всех требований.
// ВАЖНО: собирается как C++17/20, /arch:AVX2, /Oi, /O2, /fp:fast желательны.

#include "pch.h"

// ==================== ВАШИ МАКРО И ФУНКЦИИ (без изменений интерфейсов), мелкие доработки дальше ====================

#define XOR_RAND(state, result_var)
do {
uint32_t s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
(state) = s;
float tmp = (float)((double)(s) * (1.0/4294967296.0));
result_var = tmp;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
uint32_t s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
(state) = s;
result_var = fmaf((float)(int32_t)s, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f)
r = 6.28318530718f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl)
r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl)
r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip)
r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip)
r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

// ---------------- Peano / Morton таблички и MPI-глобали (без изменения интерфейса) ----------------

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next, dx, dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q, next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ { Right,0u,0u }, { Top,0u,1u }, { Top,1u,1u }, { Left,1u,0u } },
{ { Left,1u,1u }, { Down,1u,0u }, { Down,0u,0u }, { Right,0u,1u } },
{ { Down,1u,1u }, { Left,0u,1u }, { Left,0u,0u }, { Top,1u,0u } },
{ { Top,0u,0u }, { Right,1u,0u }, { Right,1u,1u }, { Down,0u,1u } }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ { 0u,Right }, { 1u,Top }, { 3u,Left }, { 2u,Top } },
{ { 2u,Down }, { 3u,Right }, { 1u,Down }, { 0u,Left } },
{ { 2u,Left }, { 1u,Left }, { 3u,Top }, { 0u,Down } },
{ { 0u,Top }, { 3u,Down }, { 1u,Right }, { 2u,Right } }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

// ---------------- Служебные структуры сообщений и slab-аллокатор ----------------

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2, e_x1, e_x2, Rtop;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};
__declspec(align(16)) struct CtrlMsg final {
bool kind; CrossMsg xchg;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }
};

__declspec(align(16)) struct Slab final {
char* const base; char* current; char* const end;
__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};
static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 64u);
Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) { *p = 0; p += 4096u; }
return slab;
});

// ---------------- Параметры активной 2D-карты (как было) ----------------

__declspec(align(16)) struct Peano2DMap final {
const int levels; const float a, b, c, d; const float lenx, leny; const float inv_lenx; const uint32_t scale; const uint8_t start;
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
: levels(L), a(_a), b(_b), c(_c), d(_d),
lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)),
scale((uint32_t)1u << (L << 1)), start(st) {
}
};
static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

// ---------------- Интервалы и сравнения (как было) ----------------

__declspec(align(64)) struct Interval1D final {
const float x1, x2, y1, y2, delta_y, ordinate_factor, N_factor, quadratic_term, M; float R;
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
__forceinline Interval1D(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
N_factor(_N == 1.0f ? _x2 - _x1 : sqrtf(_x2 - _x1)),
quadratic_term((1.0f / N_factor)* delta_y* delta_y),
M((1.0f / N_factor)* fabsf(delta_y)) {
}
__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__declspec(align(64)) struct IntervalND final {
const float x1, x2, y1, y2, delta_y, ordinate_factor;
float N_factor, quadratic_term, M, R;
uint64_t i1, i2; float diam; int span_level;
__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
__forceinline IntervalND(float _x1, float _x2, float _y1, float _y2) noexcept
: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
N_factor(0), quadratic_term(0), M(0), R(0), i1(0), i2(0), diam(0), span_level(0) {
}
__forceinline void compute_span_level(const struct MortonND& map) noexcept; // объявим ниже
__forceinline void set_metric(float d_alpha) noexcept { N_factor = d_alpha; quadratic_term = (1.0f / N_factor) * delta_y * delta_y; M = (1.0f / N_factor) * fabsf(delta_y); }
__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr1D(const Interval1D* a, const Interval1D* b) noexcept { return a->R < b->R; }
__forceinline bool ComparePtrND(const IntervalND* a, const IntervalND* b) noexcept { return a->R < b->R; }

// ---------------- Массовые пересчёты R (как было) ----------------

__forceinline void RecomputeR_ConstM_AVX2_1D(Interval1D* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0, limit = n & ~7ull;
alignas(32) float q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const Interval1D* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2_1D(Interval1D* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0, limit = n & ~7ull;
alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) {
const Interval1D* p = arr[i + k];
ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor;
}
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) {
const Interval1D* p = arr[i];
const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
}
}
__forceinline void RecomputeR_ConstM_AVX2_ND(IntervalND* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0, limit = n & ~7ull;
alignas(32) float q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) { const IntervalND* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2_ND(IntervalND* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0, limit = n & ~7ull;
alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
for (; i < limit; i += 8) {
for (int k = 0; k < 8; ++k) {
const IntervalND* p = arr[i + k];
ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor;
}
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
}
for (; i < n; ++i) {
const IntervalND* p = arr[i];
const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
}
}

// ---------------- Быстрые степени (как было) ----------------

__forceinline float fast_pow_int(float v, int n) {
float r;
switch (n) {
case 3: { float v2 = v * v; r = v2 * v; } break;
case 4: { float v2 = v * v; r = v2 * v2; } break;
case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
case 10: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
case 11: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
case 12: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
case 13: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
case 14: { float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
case 15: { float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
default: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v8; }
}
return r;
}

// ---------------- Функция шага (как было) ----------------

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));
if (_N == 1)
return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
if (_N == 2)
return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
const float invmN = 1.0f / fast_pow_int(_m, _N);
const float dN = fast_pow_int(fabsf(diff), _N);
return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

// ---------------- Morton-кэш ----------------

struct MortonCachePerRank {
std::vector<int> permCache;
std::vector<uint64_t> invMaskCache;
uint32_t baseSeed;
};

static MortonCachePerRank g_mc;

static __forceinline uint64_t gray_encode(uint64_t x) noexcept { return x ^ (x >> 1); }
static __forceinline uint64_t gray_decode(uint64_t g) noexcept { g ^= g >> 32; g ^= g >> 16; g ^= g >> 8; g ^= g >> 4; g ^= g >> 2; g ^= g >> 1; return g; }

// ---------------- MortonND: ВАЖНО — поддержка levels*dim > 64 через «чанки» ----------------

struct MortonND final {
int dim, levels; // полные уровни
int eff_levels; // уровни в одном 64-битном чанке (<= floor(63/dim))
int extra_levels; // остаток уровней за пределами первого чанка (>=0)
int chunks; // количество чанков
std::vector<int> chunk_bits; // кол-во уровней в каждом чанке
std::vector<uint64_t> chunk_bases; // 2^(dim*chunk_bits[c]) на каждый чанк
uint64_t scale; // БАЗА для индекса старшего чанка (совместима со старыми местами использования)
std::vector<float> low, high, step, invStep, baseOff;
std::vector<int> perm;
std::vector<uint64_t> invMask; // длиной levels (используем кусочно)
std::vector<uint64_t> pextMask; // маски для СТАРШЕГО чанка (для эвристик/диаметра)
std::vector<uint64_t> pextMaskChunks; // маски (chunk,dim) для map01ToPoint
float invScaleLevel;
bool use_gray;

text
static inline uint64_t make_mask(int dim, int Lc, int d) {
	uint64_t m = 0ull, bitpos = (uint64_t)d;
	for (int b = 0; b < Lc; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; }
	return m;
}

__forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
	: dim(D), levels(L),
	eff_levels((std::max)(1, (int)(63 / (D ? D : 1)))),
	extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
	chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
	low(lows, lows + D), high(highs, highs + D),
	step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f),
	perm(mc.permCache.begin(), mc.permCache.begin() + D),
	invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
	invScaleLevel(1.0f / (float)((uint64_t)1 << L)), use_gray(true) {

	// Размер шага и базовое смещение — по ПОЛНОЙ глубине:
	for (int d = 0; d < dim; ++d) {
		float rng = high[d] - low[d];
		float st = rng * invScaleLevel;
		step[d] = st;
		invStep[d] = 1.0f / st;
		baseOff[d] = fmaf(0.5f, st, low[d]); // центр ячейки на полном уровне
	}

	// Инициализация чанков
	chunk_bits.resize(chunks);
	pextMaskChunks.resize((size_t)chunks * (size_t)dim);
	chunk_bases.resize(chunks);
	int remaining = levels;
	for (int c = 0; c < chunks; ++c) {
		int Lc = (c == 0) ? (std::min)(eff_levels, remaining) : (std::min)(eff_levels, remaining);
		chunk_bits[c] = Lc;
		remaining -= Lc;
		uint64_t baseC = (uint64_t)1 << (dim * Lc);
		chunk_bases[c] = baseC;
		for (int d = 0; d < dim; ++d)
			pextMaskChunks[(size_t)c * (size_t)dim + (size_t)d] = make_mask(dim, Lc, d);
	}

	// Маски для старшего чанка (для диаметра/эвристик)
	pextMask.resize(dim);
	for (int d = 0; d < dim; ++d)
		pextMask[d] = make_mask(dim, chunk_bits[0], d);

	// Маски инверсий — используем только нужные биты для каждого чанка
	// (оставляем как есть; сдвиги/маски применим динамически)

	// БАЗОВАЯ «scale» — база старшего чанка: 2^(dim*eff_levels_actual_for_chunk0)
	scale = (uint64_t)1 << (dim * chunk_bits[0]);
}

// Диаметр между двумя индексами старшего чанка, с учётом «скрытых» уровней: добавим extra_levels в nfree как константу
__forceinline float block_diameter(uint64_t i1, uint64_t i2) const noexcept {
	if (i1 > i2) std::swap(i1, i2);
	float s2 = 0.0f;
	for (int d = 0; d < dim; ++d) {
		const int pd = perm[d];
		const uint64_t varying = (i1 ^ i2) & pextMask[d];
		const int nfree_hi = _mm_popcnt_u64(varying);
		const int nfree_total = nfree_hi + (levels - chunk_bits[0]);
		const float range = step[pd] * (ldexpf(1.0f, nfree_total) - 1.0f);
		s2 = fmaf(range, range, s2);
	}
	return sqrtf(s2);
}

// Преобразование t∈[0,1) → точка, используя все чанки (точность сохраняется)
__forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
	if (t <= 0.0f) t = 0.0f;
	else if (t >= 1.0f) t = 0x1.fffffep-1f;

	// накапливаем биты для каждой размерности
	uint64_t accBits[32] = { 0ull };
	int accShifted[32] = { 0 };

	double u = (double)t; // используем double для дробной части
	for (int c = 0; c < chunks; ++c) {
		const int Lc = chunk_bits[c];
		const uint64_t baseC = chunk_bases[c];
		double scaled = u * (double)baseC;
		uint64_t idxc = (scaled >= (double)baseC) ? (baseC - 1ull) : (uint64_t)scaled;
		u = scaled - (double)idxc; // дробная часть для следующего чанка
		if (use_gray) idxc = gray_encode(idxc);

		// какой набор бит инверсии используется для этого чанка:
		int shift_from_top = 0;
		for (int k = 0; k <= c; ++k) shift_from_top += chunk_bits[k];
		int inv_shift = levels - shift_from_top; // смещение начала подпоследовательности в общем инверс-маске

		for (int d = 0; d < dim; ++d) {
			int pd = perm[d];
			uint64_t mask = pextMaskChunks[(size_t)c * (size_t)dim + (size_t)d];
			uint64_t bits = _pext_u64(idxc, mask);
			if (inv_shift >= 0) {
				uint64_t invMaskSegment = 0ull;
				if (chunk_bits[c] < 63) {
					uint64_t take = ((uint64_t)1 << chunk_bits[c]) - 1ull;
					invMaskSegment = (invMask[pd] >> inv_shift) & take;
				}
				bits ^= invMaskSegment;
			}
			accBits[pd] = (accBits[pd] << Lc) | bits;
			accShifted[pd] += Lc;
		}
	}
	for (int d = 0; d < dim; ++d)
		out[d] = fmaf(step[d], (float)accBits[d], baseOff[d]);
}

// t для лучшего-приблизительного позиционирования (используем только старший чанк — быстро ради эвристик)
__forceinline float pointToT(const float* __restrict q) const noexcept {
	const int bitsFull = levels;
	const int bitsCoarse = chunk_bits[0];
	uint64_t idx0 = 0ull;
	for (int d = 0; d < dim; ++d) {
		int pd = perm[d];
		float v = (q[pd] - baseOff[pd]) * invStep[pd];
		int64_t cell = (int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
		// отбрасываем младшие (bitsFull - bitsCoarse) разрядов:
		if (cell < 0) cell = 0;
		int64_t maxv = ((int64_t)1 << bitsFull) - 1;
		if (cell > maxv) cell = maxv;
		uint64_t b = (uint64_t)cell >> (bitsFull - bitsCoarse);
		// инверсия для старшего чанка:
		uint64_t invMask0 = 0ull;
		if (bitsCoarse < 63) { uint64_t take = ((uint64_t)1 << bitsCoarse) - 1ull; invMask0 = (invMask[pd] >> (bitsFull - bitsCoarse)) & take; }
		b ^= invMask0;
		idx0 |= _pdep_u64(b, pextMask[d]);
	}
	if (use_gray) idx0 = gray_decode(idx0);
	return ((float)idx0 + 0.5f) / (float)scale;
}

};

__forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
span_level = 0;
for (int d = 0; d < map.dim; ++d) {
uint64_t varying = (i1 ^ i2) & map.pextMask[d];
span_level += _mm_popcnt_u64(varying);
}
// учтём, что ниже старшего чанка есть ещё уровни:
span_level += (map.levels - map.chunk_bits[0]) * map.dim;
span_level = (std::min)(span_level, 11);
}

// ---------------- Стоимость для манипулятора (добавлены приоритет положительных углов и экспоненциальный штраф) ----------------

struct ManipCost final {
int n; bool variableLen; float targetX, targetY; float minTheta;
// коэффициенты для «арки» и экспоненциального штрафа
float archBiasW, archBiasK; // слабый приоритет положительных углов
float sharpW; // вес штрафа за |θ| < minTheta
ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta),
archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f) {
}
__forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
const float* th = q;
const float* L = variableLen ? (q + n) : nullptr;
__declspec(align(64)) float phi[32], s_arr[32], c_arr[32];
float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f, archPen = 0.0f;

text
	// накопление фаз, затем sincos
	for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
	FABE13_SINCOS(phi, s_arr, c_arr, n);

	const float Lc = 1.0f;
	if (variableLen) { for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); } }
	else { for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); } }

	// штрафы
	for (int i = 0; i < n; ++i) {
		float ai = fabsf(th[i]);
		float v = minTheta - ai;               // «слишком острый» (малый) угол
		if (v > 0.0f) {
			// экспоненциально нарастающий штраф (через exp2 для скорости)
			float scale = 2.0f / (minTheta + 1e-6f);
			penC += sharpW * (exp2f(scale * v) - 1.0f);
		}
		// лёгкий приоритет положительных углов (арка): softplus(k * (-theta))
		// = ln(1 + exp(k*(-θ))) — минимальный на θ>=0, растёт на θ<0
		float t = -th[i] * archBiasK;
		// быстрая и стабильная softplus:
		float sp = (t > 10.f) ? t : log1pf(expf(t));
		archPen += archBiasW * sp;
	}

	float dx = x - targetX, dy = y - targetY;
	float dist = sqrtf(fmaf(dx, dx, dy * dy));
	out_x = x; out_y = y;
	return dist + penC + archPen;
}

};

// ---------------- Попадание по карте (как было) ----------------

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a, inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale, scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx, leny = gActiveMap.leny, c = gActiveMap.c;
const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
uint32_t idx = (uint32_t)(norm * (float)scale);
idx = idx > scale_minus_1 ? scale_minus_1 : idx;

float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1;
while (l >= 0) {
	const uint32_t q = (idx >> (l * 2)) & 3u;
	const Step s = g_step_tbl[type][q];
	type = s.next; sx *= 0.5f; sy *= 0.5f;
	x1 += s.dx ? sx : 0.0f; x2 += s.dy ? sy : 0.0f; --l;
}
out_x1 = x1 + sx * 0.5f; out_x2 = x2 + sy * 0.5f;

}
__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a, b = gActiveMap.b, c = gActiveMap.c, d = gActiveMap.d;
const float lenx = gActiveMap.lenx, leny = gActiveMap.leny; const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b), clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0;
while (l < levels) {
sx *= 0.5f; sy *= 0.5f; const float mx = x0 + sx, my = y0 + sy;
const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next; idx = (idx << 2) | inv.q;
const uint32_t dx = dd >> 1, dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l;
}
const float scale_recip = 1.0f / (float)scale;
return fmaf((float)idx * scale_recip, lenx, a);
}

// ---------------- Сообщения ND (как было) ----------------

__declspec(align(16)) struct MultiCrossMsg final {
float intervals[15]; uint8_t count;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& intervals& count; }
};
__declspec(align(16)) struct BestSolutionMsg final {
float bestF, bestX, bestY, bestQ[32]; uint8_t dim;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim; }
};
__declspec(align(16)) struct CtrlMsgND final {
uint8_t kind; CrossMsg xchg; MultiCrossMsg multiXchg; BestSolutionMsg bestSol;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
ar& kind; if (kind == 1) ar& xchg; else if (kind == 2) ar& multiXchg; else if (kind == 3) ar& bestSol;
}
};

// ---------------- Генерация сидов (как было) ----------------

static __forceinline int generate_lhs_seeds_lite(const MortonND& map, const int dim, float* __restrict S, int stride, uint32_t seed) {
int temp_dim = dim; const int ns = --temp_dim * temp_dim; uint32_t st = seed;
alignas(32) int permutations[32][16];
for (int d = 0; d < dim; ++d) {
for (int s = 0; s < ns; ++s) permutations[d][s] = s;
for (int s = ns - 1; s > 0; --s) { st ^= st << 13; st ^= st >> 17; st ^= st << 5; int j = st % (s + 1); std::swap(permutations[d][s], permutations[d][j]); }
}
for (int s = 0; s < ns; ++s) {
for (int d = 0; d < dim; ++d) {
st ^= st << 13; st ^= st >> 17; st ^= st << 5;
float u = (st & 0xFFFFFF) * 5.9604645e-8f;
int stratum = permutations[d][s];
float pos = ((float)stratum + u) / (float)ns;
int pd = map.perm[d];
float lo = map.low[pd], hi = map.high[pd];
S[s * stride + d] = fmaf(pos, (hi - lo), lo);
}
}
return ns;
}

static __forceinline int generate_heuristic_seeds(const ManipCost& cost, const MortonND& map, int dim, float* __restrict S, int stride, uint32_t seed) {
const int n = cost.n; const bool VL = cost.variableLen;
const float tx = cost.targetX, ty = cost.targetY;
int total_seeds = 0;
// 1) направленное в сторону цели
{
float* s0 = S + total_seeds * stride;
float phi = atan2f(ty, tx);
float rho = sqrtf(fmaf(tx, tx, ty * ty));
float len = fminf(fmaxf(rho / (float)n, 0.5f), 2.0f); // новый минимум 0.5
for (int i = 0; i < n; ++i) s0[i] = phi / (float)n;
if (VL) for (int i = 0; i < n; ++i) s0[n + i] = len;
total_seeds++;
}
// 2) попеременно
{
float* s1 = S + total_seeds * stride;
float phi = atan2f(ty, tx);
for (int i = 0; i < n; ++i) s1[i] = 0.5f * phi * ((i & 1) ? -1.0f : 1.0f);
if (VL) for (int i = 0; i < n; ++i) s1[n + i] = 1.0f * (0.8f + 0.4f * (float)i / (float)n);
total_seeds++;
}
// 3) затухающая линейка
{
float* s2 = S + total_seeds * stride;
const float inv = (n > 1) ? 1.0f / (float)(n - 1) : 0.0f;
float phi = atan2f(ty, tx);
for (int i = 0; i < n; ++i) { float pr = (float)i * inv; s2[i] = phi * (1.0f - 0.3f * pr); }
if (VL) { for (int i = 0; i < n; ++i) { float si = sinf(1.5f * (float)i); s2[n + i] = (1.0f + 0.2f * si); } }
total_seeds++;
}
// 4) LHS лайт
int lhs_count = generate_lhs_seeds_lite(map, dim, S + total_seeds * stride, stride, seed);
total_seeds += lhs_count;
return total_seeds;
}

// ---------------- Основной ND-ветвитель с улучшениями ----------------

static __forceinline void agp_run_branch_mpi(
const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed,
std::vector<IntervalND*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY, float M_prior = 1e-3f)
{
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);

text
alignas(32) float M_by_span[12]; for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior;
float Mmax = M_prior;

alignas(64) float q_local[32], phi[32], s_arr[32], c_arr[32], sum_s[32], sum_c[32], q_try[32];
bestQ.reserve(dim);
float x = 0.0f, y = 0.0f;
int no_improve = 0;

auto t_to_idx = [&](float t) -> uint64_t {
	float tt = (t <= 0.0f) ? 0.0f : (t >= 1.0f ? std::nextafterf(1.0f, 0.0f) : t);
	uint64_t idx = (uint64_t)((double)tt * (double)map.scale); // индекс старшего чанка
	if (idx >= map.scale) idx = map.scale - 1ull;
	return idx;
	};
auto update_pockets_and_Mmax = [&](IntervalND* I) {
	const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
	};

float a = 0.0f, b = 1.0f;

// --- локальная оценка + небольшой градиентный шаг + ДОТОЧКА ПО ПОСЛЕДНЕМУ ЗВЕНУ ---
auto evalAt = [&](float t) -> float {
	map.map01ToPoint(t, q_local);
	float f = cost(q_local, x, y);

	if (f < bestF * 1.25f) {
		// подготовим суммы для градиента
		float acc = 0.0f; for (int i = 0; i < n; ++i) { acc += q_local[i]; phi[i] = acc; }
		FABE13_SINCOS(phi, s_arr, c_arr, n);
		float as = 0.0f, ac = 0.0f;
		for (int k = n - 1; k >= 0; --k) {
			const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
			as += Lk * s_arr[k]; ac += Lk * c_arr[k]; sum_s[k] = as; sum_c[k] = ac;
		}
		const float dx = x - cost.targetX, dy = y - cost.targetY;
		float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1e-8f;

		// градиент с учётом новых штрафов:
		float eta = 0.125f;
		for (int stepI = 0; stepI < 3; ++stepI) {
			for (int i = 0; i < n; ++i) {
				float gpen = 0.0f;
				// экспоненциальный штраф за малые |θ|
				{
					float ai = fabsf(q_local[i]);
					float v = cost.minTheta - ai;
					if (v > 0.0f) {
						float scale = 2.0f / (cost.minTheta + 1e-6f);
						float e = exp2f(scale * v);
						float dpen_dtheta = cost.sharpW * (e * 0.69314718055994530941723212145818f * scale) * (-copysignf(1.0f, q_local[i]));
						gpen += dpen_dtheta;
					}
				}
				// «арка»: производная softplus(k*(-θ)) = -k * sigmoid(k*(-θ))
				{
					float tsg = -q_local[i] * cost.archBiasK;
					float sig = 1.0f / (1.0f + expf(-tsg));
					gpen += -(cost.archBiasW * cost.archBiasK) * sig;
				}

				float g = (dx * (-sum_s[i]) + dy * (sum_c[i])) / dist + gpen;
				q_try[i] = q_local[i] - eta * g;

				// ГРАНИЦЫ УГЛОВ (рад): для i==0 [-60°, +150°], для остальных [-150°, +150°]
				const float deg2rad = 3.14159265358979323846f / 180.0f;
				const float lo0 = -60.0f * deg2rad, hi0 = 150.0f * deg2rad;
				const float lo = -150.0f * deg2rad, hi = 150.0f * deg2rad;
				const float Lb = (i == 0) ? lo0 : lo;
				const float Hb = (i == 0) ? hi0 : hi;
				if (q_try[i] < Lb) q_try[i] = Lb; else if (q_try[i] > Hb) q_try[i] = Hb;
			}
			if (cost.variableLen) {
				for (int i = 0; i < n; ++i) {
					float g = (dx * c_arr[i] + dy * s_arr[i]) / dist;
					float v = q_local[n + i] - eta * g;
					// НОВЫЕ ГРАНИЦЫ ДЛИН: [0.5; 2.0]
					if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f;
					q_try[n + i] = v;
				}
			}
			float x2, y2; float f2 = cost(q_try, x2, y2);
			if (f2 < f) { memcpy(q_local, q_try, dim * sizeof(float)); f = f2; x = x2; y = y2; break; }
			eta *= 0.5f;
		}

		// ДОТОЧКА ПО ПОСЛЕДНЕМУ УГЛУ — чтобы «довернуть» последний сегмент
		const int last = n - 1;
		const float deg2rad = 3.14159265358979323846f / 180.0f;
		const float lo = (last == 0) ? (-60.0f * deg2rad) : (-150.0f * deg2rad);
		const float hi = 150.0f * deg2rad;
		float bestLocF = f;
		float saved = q_local[last];
		for (float delta = 0.05f; delta >= 0.00625f; delta *= 0.5f) {
			for (int sgn = -1; sgn <= 1; sgn += 2) {
				float cand = saved + sgn * delta;
				if (cand < lo) cand = lo; else if (cand > hi) cand = hi;
				float backup = q_local[last]; q_local[last] = cand;
				float x2, y2; float f2 = cost(q_local, x2, y2);
				if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; }
				q_local[last] = backup;
			}
		}
		if (bestLocF < f) { q_local[last] = saved; f = bestLocF; }
	}

	if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; }
	else { ++no_improve; }
	return f;
	};

float f_a = evalAt(a), f_b = evalAt(b);
const int K = (std::min)((std::max)(2 * dim, 8), 128);

H.reserve((size_t)maxIter + K + 16);
const int rank = g_world->rank();
const int world = g_world->size();

alignas(64) float seeds[64 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, seed + rank * 7919u);

for (int i = 0; i < seedCnt && i < 64; ++i) {
	const float* s = seeds + i * 32;
	float t_seed = map.pointToT(s);
	float interval_size = (i < 3) ? (0.0004f * (float)dim)
		: (0.00031f * (float)dim) * exp2f((1.0f / (float)(seedCnt - 4)) * log2f(0.00025f / 0.00031f) * (float)(i - 3));
	float t1 = fmaxf(a, t_seed - interval_size), t2 = fminf(b, t_seed + interval_size);
	if (t2 <= t1) continue;
	alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
	map.map01ToPoint(t1, q1); float f1 = cost(q1, x1, y1);
	map.map01ToPoint(t2, q2); float f2 = cost(q2, x2, y2);
	IntervalND* I = new IntervalND(t1, t2, f1, f2);
	I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2);
	I->diam = map.block_diameter(I->i1, I->i2);
	I->compute_span_level(map);
	I->set_metric(I->diam);
	update_pockets_and_Mmax(I);
	I->ChangeCharacteristic(r * Mmax);
	if (i < 3) I->R *= fmaf(0.01f, (float)dim, 0.85f);
	else {
		float start_mult = 0.214f * (float)dim;
		float end_mult = 0.174f * (float)dim;
		float mult = start_mult * exp2f((1.0f / (float)(seedCnt - 4)) * log2f(end_mult / start_mult) * (float)(i - 3));
		I->R *= mult;
	}
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; }
	if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; }
}

float prev_t = a, prev_f = f_a;
for (int k = 1; k <= K; ++k) {
	float t = a + (b - a) * ((float)k / (K + 1)) + (float)rank / (float)(world * (K + 1));
	float f = evalAt(t);
	IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
	I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t);
	I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam);
	update_pockets_and_Mmax(I);
	I->ChangeCharacteristic(r * Mmax);
	H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
	prev_t = t; prev_f = f;
}
IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(r * Mmax);
H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND);

float dmax = b - a, initial_len = dmax, thr03 = 0.3f * initial_len, inv_thr03 = 1.0f / thr03;
int it = 0;

// новые «мягко-зависящие-от-размерности» константы
auto kickEveryByDim = [&](int dim) -> int {
	// 60..120, убывает экспоненциально с размерностью
	float z = 120.0f * exp2f(-0.05f * (float)dim);
	if (z < 60.0f) z = 60.0f;
	return (int)z;
	};
auto noImproveThrByDim = [&](int dim) -> int {
	float z = 80.0f * exp2f(-0.08f * (float)dim);
	if (z < 30.0f) z = 30.0f;
	return (int)z;
	};

while (it < maxIter) {
	// подталкивание около лучшего — зависящее от размерности
	if ((it % kickEveryByDim(dim)) == 0 && no_improve > noImproveThrByDim(dim) && !bestQ.empty()) {
		float t_best = map.pointToT(bestQ.data());
		for (int i = 0; i < 2; ++i) {
			float off = (i == 0) ? 0.01f : -0.01f;
			float t_seed = fminf(b, fmaxf(a, t_best + off));
			float f_seed = evalAt(t_seed);
			IntervalND* J = new IntervalND(t_seed - 0.005f, t_seed + 0.005f, f_seed, f_seed);
			J->i1 = t_to_idx(t_seed - 0.005f); J->i2 = t_to_idx(t_seed + 0.005f);
			J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam);
			update_pockets_and_Mmax(J);
			J->ChangeCharacteristic(r * Mmax); J->R *= 0.9f;
			H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND);
		}
		no_improve = 0;
	}

	const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
	bool   stagnation = (no_improve > 100) && (it > 270);

	// Плавная адаптация «T» к размерности (вместо констант 264/277)
	float A = 200.0f + 64.0f * exp2f(-0.06f * (float)dim);
	float B = 210.0f + 67.0f * exp2f(-0.06f * (float)dim);
	const int T = (int)fmaf(-expm1f(p), A, B);

	float  r_eff = fmaxf(1.0f, r * (0.7f + 0.3f * (1.0f - p)));

	std::pop_heap(H.begin(), H.end(), ComparePtrND);
	IntervalND* cur = H.back(); H.pop_back();

	const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
	float m = r_eff * Mmax;
	float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r);
	tNew = fminf(fmaxf(tNew, a), b);
	float fNew = evalAt(tNew);

	IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
	IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);

	L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew);
	Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2);
	L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
	L->compute_span_level(map); Rv->compute_span_level(map);
	L->set_metric(L->diam); Rv->set_metric(Rv->diam);

	float Mloc = (std::max)(L->M, Rv->M);
	update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);

	const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc;
	m = r_eff * Mmax;

	if (adaptive) {
		float len1 = tNew - x1, len2 = x2 - tNew;
		if (len1 + len2 == dmax) {
			dmax = (std::max)(len1, len2);
			for (auto pI : H) { float Ls = pI->x2 - pI->x1; if (Ls > dmax) dmax = Ls; }
		}
		// НЕ ТРОГАЕМ усл. с thr03 — строго как просили
		if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
			const float progress = fmaf(-dmax, inv_thr03, 1.0f);
			const float alpha = progress * progress;
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r_eff, -MULT);
			const float GF = fmaf(beta, global_coeff, 0.0f);
			L->ChangeCharacteristic(fmaf(GF, len1, L->M * alpha));
			Rv->ChangeCharacteristic(fmaf(GF, len2, Rv->M * alpha));
			size_t sz = H.size();
			RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtrND);
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
				if (Mloc > 1.15f * prevMmax) {
					size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
		}
	}
	else {
		if (Mloc > prevMmax) {
			L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
			if (Mloc > 1.15f * prevMmax) { size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); }
		}
		else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
	}

	H.push_back(L);  std::push_heap(H.begin(), H.end(), ComparePtrND);
	H.push_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
	if (!H.empty()) {
		_mm_prefetch((const char*)H[0], _MM_HINT_T0);
		if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0);
	}

	IntervalND* top = H.front();
	float interval_len = top->x2 - top->x1;

	// обмен/терминация по расписанию:
	bool want_term = (exp2f(log2f(interval_len) / (float)dim) < eps) || (it == maxIter - 1);
	if (!(it % T) || want_term) {
		CtrlMsgND out; out.kind = want_term ? 0 : 2;
		if (!want_term) {
			uint8_t cnt = (uint8_t)((H.size() >= 3) ? 3 : H.size());
			out.multiXchg.count = cnt;
			float* dest = out.multiXchg.intervals;
			IntervalND* t1 = H[0];
			IntervalND* t2 = (H.size() > 1 ? H[1] : H[0]);
			IntervalND* t3 = (H.size() > 2 ? H[2] : H[H.size() - 1]);
			IntervalND* tops[3] = { t1, t2, t3 };
			for (uint8_t i2 = 0; i2 < cnt; ++i2) {
				IntervalND* Tt = tops[i2];
				dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R;
				dest += 5;
			}
		}
		for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
		if (want_term) break;
	}

	if (!(it % 500) && !bestQ.empty()) {
		CtrlMsgND out; out.kind = 3; out.bestSol.bestF = bestF; out.bestSol.bestX = bestX; out.bestSol.bestY = bestY;
		out.bestSol.dim = (uint8_t)bestQ.size(); memcpy(out.bestSol.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
		for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
	}

	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		CtrlMsgND in; g_world->recv(boost::mpi::any_source, 0, in);
		if (in.kind == 0) { if (!rank) break; else return; }
		else if (in.kind == 1) {
			float sx = in.xchg.s_x1, ex = in.xchg.e_x1;
			if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f;
			if (ex > sx) {
				alignas(64) float tmp[32]; float tx, ty;
				map.map01ToPoint(sx, tmp); float y1i = cost(tmp, tx, ty);
				map.map01ToPoint(ex, tmp); float y2i = cost(tmp, tx, ty);
				IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
				inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex);
				inj->diam = map.block_diameter(inj->i1, inj->i2);
				inj->compute_span_level(map); inj->set_metric(inj->diam);
				update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(r * Mmax);
				if (!H.empty()) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0); }
				IntervalND* topH = H.front();
				if (inj->R > 1.15f * topH->R) {
					float p2 = fmaf(-1.0f / initial_len, dmax, 1.0f);
					float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p2), 0.3f)
						: fmaf(0.3491860241215959f, expm1f(p2), 0.6f);
					inj->R = in.xchg.Rtop * k;
					H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
		}
		else if (in.kind == 2) {
			const MultiCrossMsg& mX = in.multiXchg;
			for (uint8_t ii = 0; ii < mX.count; ++ii) {
				const float* d = &mX.intervals[ii * 5];
				float sx = d[0], ex = d[2];
				if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f;
				if (ex > sx) {
					alignas(64) float tmp[32]; float tx, ty;
					map.map01ToPoint(sx, tmp); float y1i = cost(tmp, tx, ty);
					map.map01ToPoint(ex, tmp); float y2i = cost(tmp, tx, ty);
					IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
					inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex);
					inj->diam = map.block_diameter(inj->i1, inj->i2);
					inj->compute_span_level(map); inj->set_metric(inj->diam);
					update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(r * Mmax);
					if (!H.empty()) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0); }
					IntervalND* topH = H.front();
					if (inj->R > 1.15f * topH->R) {
						float p2 = fmaf(-1.0f / initial_len, dmax, 1.0f);
						float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p2), 0.3f)
							: fmaf(0.3491860241215959f, expm1f(p2), 0.6f);
						inj->R = d[4] * k;
						H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
			}
		}
		else if (in.kind == 3) {
			const BestSolutionMsg& bm = in.bestSol;
			if (bm.bestF < bestF * 1.15f) {
				alignas(64) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
				float t_best = map.pointToT(tmp_q);
				float t1 = fmaxf(a, t_best - 0.001f), t2 = fminf(b, t_best + 0.001f);
				if (t2 > t1) {
					alignas(64) float tq1[32], tq2[32]; float xx1, yy1, xx2, yy2;
					map.map01ToPoint(t1, tq1); float f1 = cost(tq1, xx1, yy1);
					map.map01ToPoint(t2, tq2); float f2 = cost(tq2, xx2, yy2);
					IntervalND* I = new IntervalND(t1, t2, f1, f2);
					I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2);
					I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map);
					I->set_metric(I->diam); update_pockets_and_Mmax(I);
					I->ChangeCharacteristic(r * Mmax); I->R *= 0.90f;
					H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
				}
				if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); }
			}
		}
	}
	++it;
}

}

struct BestPacket {
float bestF; int dim; float bestX; float bestY;
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; }
};

// ---------------- ВНЕШНИЕ ФУНКЦИИ DLL ----------------

// Главная ND оптимизация с новыми границами углов/длин
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY,
int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon,
unsigned int seed, float** out_bestQ, size_t* out_bestQLen, float* out_bestX,
float* out_bestY, float* out_bestF)
{
const int dim = nSegments + (variableLengths ? nSegments : 0);

text
// Перестановка/инверсии (как было)
g_mc.permCache.resize(dim); for (int i = 0; i < dim; ++i) g_mc.permCache[i] = i;
uint32_t s = g_mc.baseSeed;
for (int i = dim - 1; i > 0; --i) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; uint32_t j = s % (uint32_t)(i + 1); std::swap(g_mc.permCache[i], g_mc.permCache[j]); }
g_mc.invMaskCache.resize(dim); for (int k = 0; k < dim; ++k) { s ^= s << 13; s ^= s >> 17; s ^= s << 5; g_mc.invMaskCache[k] = (uint64_t)s; }

// НОВЫЕ ГРАНИЦЫ:
// углы: θ0 ∈ [-60°, +150°], θi>0 ∈ [-150°, +150°]; длины ∈ [0.5, 2.0]
const float deg2rad = 3.14159265358979323846f / 180.0f;
const float theta0Min = -60.0f * deg2rad, theta0Max = 150.0f * deg2rad;
const float thetaMin = -150.0f * deg2rad, thetaMax = 150.0f * deg2rad;
const float lenMin = 0.5f, lenMax = 2.0f;

std::vector<float> low;  low.reserve(dim);
std::vector<float> high; high.reserve(dim);
for (int i = 0; i < nSegments; ++i) { low.push_back(i == 0 ? theta0Min : thetaMin); high.push_back(i == 0 ? theta0Max : thetaMax); }
if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }

ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);

const int rank = g_world->rank(), world = g_world->size();
std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;

const int levels0 = (std::min)(peanoLevels, 8);
const int maxIter0 = (int)(maxIterPerBranch * 0.2f);
MortonND map0(dim, levels0, low.data(), high.data(), g_mc);

std::vector<IntervalND*> H_coarse; std::vector<float> bestQ_coarse;
float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;

float M_prior = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << levels0));
if (variableLengths) M_prior += 1.41421356237f * (1.0f / (float)(1u << levels0));

agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed,
	H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, M_prior);

if (bestF_coarse < bestF) { bestF = bestF_coarse; bestQ = bestQ_coarse; bx = bx_coarse; by = by_coarse; }

if (levels0 < peanoLevels) {
	MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
	std::vector<IntervalND*> H_fine;
	std::vector<float> bestQ_fine = bestQ;
	float bestF_fine = bestF, bx_fine = bx, by_fine = by;

	float M_prior_fine = (variableLengths ? 2.0f * nSegments : 2.0f * nSegments) * (1.0f / (float)(1u << peanoLevels));
	if (variableLengths) M_prior_fine += 1.41421356237f * (1.0f / (float)(1u << peanoLevels));

	if (!H_coarse.empty()) {
		std::sort(H_coarse.begin(), H_coarse.end(), [](const IntervalND* a, const IntervalND* b) { return a->R < b->R; });
		const size_t topCount = (size_t)(H_coarse.size() * 0.3f);
		auto t_to_idx_fine = [&](float t) -> uint64_t {
			float tt = (t <= 0.0f) ? 0.0f : (t >= 1.0f ? std::nextafterf(1.0f, 0.0f) : t);
			uint64_t idx = (uint64_t)((double)tt * (double)map1.scale);
			if (idx >= map1.scale) idx = map1.scale - 1ull;
			return idx;
			};
		for (size_t i = 0; i < topCount && i < H_coarse.size(); ++i) {
			const IntervalND* C = H_coarse[i];
			alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
			map1.map01ToPoint(C->x1, q1); float f1 = cost(q1, x1, y1);
			map1.map01ToPoint(C->x2, q2); float f2 = cost(q2, x2, y2);
			IntervalND* I = new IntervalND(C->x1, C->x2, f1, f2);
			I->i1 = t_to_idx_fine(C->x1); I->i2 = t_to_idx_fine(C->x2);
			I->diam = map1.block_diameter(I->i1, I->i2);
			I->set_metric(I->diam);
			H_fine.push_back(I);
			if (f1 < bestF_fine) { bestF_fine = f1; bestQ_fine.assign(q1, q1 + dim); bx_fine = x1; by_fine = y1; }
			if (f2 < bestF_fine) { bestF_fine = f2; bestQ_fine.assign(q2, q2 + dim); bx_fine = x2; by_fine = y2; }
		}
		std::make_heap(H_fine.begin(), H_fine.end(), ComparePtrND);
	}

	agp_run_branch_mpi(map1, cost, maxIterPerBranch - maxIter0, r, adaptiveMode, epsilon, seed,
		H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, M_prior_fine);

	if (bestF_fine < bestF) { bestF = bestF_fine; bestQ = bestQ_fine; bx = bx_fine; by = by_fine; }
}

BestPacket me{ bestF, dim, bx, by };
if (!rank) {
	std::vector<float> winnerQ = bestQ;
	float winF = bestF, wx = bx, wy = by;
	for (int i = 1; i < world; ++i) {
		BestPacket bp; g_world->recv(i, 2, bp);
		std::vector<float> qin; g_world->recv(i, 3, qin);
		if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; }
	}
	*out_bestQLen = winnerQ.size();
	*out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
	memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen));
	*out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
}
else {
	g_world->send(0, 2, me);
	g_world->send(0, 3, bestQ);
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
const int rank = g_world->rank();
const int world_size = g_world->size();
if (world_size == 4) { new (&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3); }
g_mc.baseSeed = fmaf(0x9E3779B9u, rank, 0x9E3779B9u);
return rank;
}

// 1D/2D тестовые функции/обёртки — без функциональных изменений (оставлены как были)
__forceinline float ShekelFunc(float x, float seed) {
int i = 0; float st = seed, r1, r2, res = 0.0f;
while (i < 10) {
XOR_RAND(st, r1); float xp = fmaf(-r1, 10.0f, x); XOR_RAND(st, r1); XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f)); d = copysignf(fmaxf(fabsf(d), FLT_MIN), d); res -= 1.0f / d; ++i;
}
return res;
}
__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2); float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1); FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}
__forceinline float HillFunc(float x, float seed) {
int j = 0; __declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x; while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
__declspec(align(32)) float sv[14u], cv[14u]; FABE13_SINCOS(ang, sv, cv, 14u);
float state = seed, r1, r2; XOR_RAND(state, r1); float res = fmaf(r1, 2.0f, -1.1f); --j;
while (j >= 0) { XOR_RAND(state, r1); XOR_RAND(state, r2); res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]); --j; }
return res;
}
__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0; __declspec(align(32)) float aj[8u], ak[8u];
while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u]; FABE13_SINCOS(aj, sj, cj, 8u); FABE13_SINCOS(ak, sk, ck, 8u);
--j; float p1 = 0.0f, p2 = 0.0f; float st = seed, r1, r2;
while (j >= 0) {
size_t k = 0u; while (k < 8u) {
float s = sj[j] * sj[j]; float c = ck[k] * ck[k];
XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p2 = fmaf(-r1, c, fmaf(r2, s, p2)); ++k;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

// Тестовые 1D/2D API оставлены без поведенческих изменений
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed,
float** out_data, size_t* out_len) {
Slab* slab = tls.local(); slab->current = slab->base; int schetchick = 0;
const float initial_length = b - a; float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a, x_Rmax_2 = b; float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D*, boost::alignment::aligned_allocator<Interval1D*, 64u>> R;
Extr.reserve((size_t)global_iterations << 2u); R.reserve((size_t)global_iterations << 1u);
R.emplace_back(new Interval1D(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax;

text
while (true) {
	const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
	const float new_value = ShekelFunc(new_point, seed);
	if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
	std::pop_heap(R.begin(), R.end(), ComparePtr1D);
	const Interval1D* pro = R.back();
	const float new_x1 = pro->x1, new_x2 = pro->x2;
	const float len2 = new_x2 - new_point, len1 = new_point - new_x1;
	const float interval_len = (len1 < len2 ? len1 : len2);
	if (++schetchick == (int)global_iterations || interval_len < epsilon) {
		Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len);
		*out_len = Extr.size(); *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len));
		memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return;
	}
	Interval1D* curr = new Interval1D(new_x1, new_point, pro->y1, new_value, 1.0f);
	Interval1D* curr1 = new Interval1D(new_point, new_x2, new_value, pro->y2, 1.0f);
	const float currM = curr->M > curr1->M ? curr->M : curr1->M;
	const size_t r_size = R.size();
	if (mode) {
		if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
		if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
			if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
			const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
			const float alpha = progress * progress; const float betta = 2.0f - alpha;
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r, -MULT);
			const float GF = betta * global_coeff;
			curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
			curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
			RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GF, alpha);
			std::make_heap(R.begin(), R.end(), ComparePtr1D);
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
				else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D); }
			}
			else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
		}
	}
	else {
		if (currM > Mmax) {
			if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
			else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D); }
		}
		else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
	}
	R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr1D);
	R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr1D);
	const Interval1D* top = R.front();
	x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
}

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(const float N, const float global_iterations, const float a, const float b, const float c,
const float d, const float r, const bool mode, const float epsilon, const float seed,
float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local(); slab->current = slab->base;
int schetchick = 0, no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider, y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition, true_end = b - x_addition;
float x_Rmax_1 = true_start, x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D* __restrict, boost::alignment::aligned_allocator<Interval1D* __restrict, 64u>> R;
Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval1D(true_start, true_end, start_val, best_f, 2.0f));
const Interval1D* __restrict top_ptr;
float Mmax = R.front()->M, m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && schetchick > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
if (in.kind) {
if (!rank) {
Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (out_len));
}
return;
}
const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
Interval1D const __restrict injected = new Interval1D(sx, ex,
RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
injected->R = in.xchg.Rtop * k;
R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr1D);
}
}
// исходная формула T оставлена (это тестовая 2D), менять здесь не обязательно
const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
if (!(++schetchick % T) || stagnation || want_term) {
CtrlMsg out; out.kind = want_term;
if (!out.kind) {
float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
}
for (int i = 0; i < world_size; ++i) if (i != rank) g_world->isend(i, 0, out);
if (out.kind) {
if (!rank) {
Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len)));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
}
return;
}
}
const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; }
else { ++no_improve; }
std::pop_heap(R.begin(), R.end(), ComparePtr1D);
Interval1D const __restrict promej = R.back();
const float segment_x1 = promej->x1, segment_x2 = promej->x2;
const float len2 = segment_x2 - new_point, len1 = new_point - segment_x1;
Interval1D const __restrict curr = new Interval1D(segment_x1, new_point, promej->y1, new_value, 2.0f);
Interval1D const __restrict curr1 = new Interval1D(new_point, segment_x2, new_value, promej->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M); const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress; const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else {
Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
else {
Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
}
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr1D);
R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr1D);
top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
}
}

struct RunParams {
int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed;
template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; }
};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY,
int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY;
p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
const int rank = g_world->rank(); const int world = g_world->size();
for (int i = 1; i < world; ++i) if (i != rank) g_world->isend(i, 1, p);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p; g_world->recv(0, 1, p);
float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
if (q) CoTaskMemFree(q);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias) void AgpStartWorkers() {
int i = 1; const int world = g_world->size();
while (i < world) { g_world->isend(i, 1, 0); ++i; }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); } dllmain.cpp: #include "pch.h"

BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) {
if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
DisableThreadLibraryCalls(hModule);
}
return TRUE;
} - обрати внимание dll используется в управляемом коде, внимательно проанализируй весь код на предмет утечек памяти и опасного доступа к памяти когда память повреждена, сейчас при запуске приложения я иногда наблюдаю картину что я нажимаю на кнопку а картинка ещё долгое время после даже иногда многих нажатий не обновляется - ощущение что в этот момент зависания и происходит освобождение памяти или подготовка памяти - в общем что-то с памятью - так как потом после нескольких нажатий на кнопку картинка всё же обновляется, ощущение что где-то происходит неаккуратное обращение с памятью, иногда после многих а иногда после нескольких нажатий я получаю вывод: C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object sender, EventArgs e) в D:\TEST_AGP\MyForm.h:строка 229
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 18
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object A_0, EventArgs A_1) в D:\TEST_AGP\MyForm.h:строка 179
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 24
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

Необработанное исключение: System.AccessViolationException: Попытка чтения или записи в защищенную память. Это часто свидетельствует о том, что другая память повреждена.
в TESTAGP.MyForm.OnOptimizeClick(Object A_0, EventArgs A_1) в D:\TEST_AGP\MyForm.h:строка 179
в System.Windows.Forms.Control.OnClick(EventArgs e)
в System.Windows.Forms.Button.OnMouseUp(MouseEventArgs mevent)
в System.Windows.Forms.Control.WmMouseUp(Message& m, MouseButtons button, Int32 clicks)
в System.Windows.Forms.Control.WndProc(Message& m)
в System.Windows.Forms.ButtonBase.WndProc(Message& m)
в System.Windows.Forms.Button.WndProc(Message& m)
в System.Windows.Forms.NativeWindow.Callback(IntPtr hWnd, Int32 msg, IntPtr wparam, IntPtr lparam)
в System.Windows.Forms.UnsafeNativeMethods.DispatchMessageW(MSG& msg)
в System.Windows.Forms.Application.ComponentManager.System.Windows.Forms.UnsafeNativeMethods.IMsoComponentManager.FPushMessageLoop(IntPtr dwComponentID, Int32 reason, Int32 pvLoopData)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoopInner(Int32 reason, ApplicationContext context)
в System.Windows.Forms.Application.ThreadContext.RunMessageLoop(Int32 reason, ApplicationContext context)
в main() в D:\TEST_AGP\MyForm.cpp:строка 24
в mainCRTStartup()

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis -----

C:\Users\maxim>mpiexec -n 6 D:\TEST_AGP\x64\Release\TEST_AGP.exe

job aborted:
[ranks] message

[0] process exited without calling finalize

[1-5] terminated

---- error analysis -----

[0] on DESKTOP-C1VT6JF
D:\TEST_AGP\x64\Release\TEST_AGP.exe ended prematurely and may have crashed. exit code 0xc0000005

---- error analysis ----- будь очень внимателен к деталям и всем аллокациям - трать много времени на анализ - мне верни полностью исправленный код без пропусков

Answer 1

Ниже — полностью исправленные файлы без пропусков. Я устранил реальные источники повреждения памяти и зависаний:

Критический буфер-оверран в LHS-сидировании (generate_lhs_seeds_lite/generate_heuristic_seeds) — раньше писали за пределы фиксированного массива seeds[64*32] и даже за границы permutations[32][16]. Переделал на безопасное формирование латинского гиперкуба с ограничением по вместимости.
Переполнение всех фиксированных буферов на 32 float при nSegments > 16 (при variableLengths=true размерность dim = 2*nSegments). Ввёл жёсткий лимит MAX_DIM=32, а в UI ограничил MAX_SEGMENTS=16 и блокирую кнопку «+ Звено» при достижении лимита.
Незаполнение/неочистка TLS-slab для ND-ветки: в AGP_Manip2D не сбрасывался tls.slab->current — при каждом запуске копился мусор и в итоге «уезжали» указатели. Теперь слэб сбрасывается перед работой.
Неинициализированное использование top_ptr в AGP_2D до первого присваивания — исправлено.
Сид генератора — убрал сомнительный fmaf над целыми, сделал детерминированное целочисленное хэш-смешивание.
Упростил небезопасные трюки с битами в Shag — без алиасинга.
Корректное завершение MPI-процессов: добавил экспорт AgpShutdownWorkers() и AgpFinalizeAll(), UI шлёт «яды» воркерам и аккуратно финализирует MPI — исчезают сообщения job aborted и «didn't call finalize».
Мелкие доработки UI: правильное создание List<float> без странного конструктора, защита от попытки читать длины когда bestQLen не хватает, косметика.

MyForm.h

cpp
#pragma once
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <stdint.h>

using namespace System;
using namespace System::Drawing;
using namespace System::Windows::Forms;
using namespace System::Collections::Generic;
using namespace System::Drawing::Drawing2D;

typedef void(__cdecl* P_MANIP)(int, bool, float, float, float, int, int, float, bool, float, unsigned int, float**, size_t*, float*, float*, float*);
typedef void(__cdecl* P_FREE)(float*);
typedef void(__cdecl* P_START)(int, bool, float, float, float, int, int, float, bool, float, unsigned int);

namespace TESTAGP {

	public ref class MyForm : public Form {
	public:
		MyForm(HMODULE hLib) : hLib(hLib) {
			this->SetStyle(ControlStyles::AllPaintingInWmPaint | ControlStyles::UserPaint | ControlStyles::OptimizedDoubleBuffer, true);
			this->Text = L"AGP Manipulator 2D"; this->ClientSize = System::Drawing::Size(1000, 700);
			this->Resize += gcnew EventHandler(this, &MyForm::OnResize);

			fManip = (P_MANIP)GetProcAddress(hLib, "AGP_Manip2D");
			pFree  = (P_FREE )GetProcAddress(hLib, "AGP_Free");
			pStart = (P_START)GetProcAddress(hLib, "AgpStartManipND");

			angles  = gcnew List<float>();
			lengths = gcnew List<float>();

			InitUI();
			ResetRandomConfig();
		}

	protected:
		~MyForm() {}

	private:
		// Жёсткий лимит, чтобы не переполнить внутренние буферы (dim<=32)
		literal int MAX_SEGMENTS = 16;

		HMODULE hLib; P_MANIP fManip; P_FREE pFree; P_START pStart;
		int nSegments; bool variableLengths; List<float>^ angles; List<float>^ lengths;
		CheckBox^ cbVarLen; NumericUpDown^ nudMinTheta, ^ nudBaseLength, ^ nudStretchFactor, ^ nudTargetX, ^ nudTargetY, ^ nudLevels, ^ nudMaxIter;
		CheckBox^ cbAdaptive; NumericUpDown^ nudR, ^ nudEps; Button^ btnAdd, ^ btnRem, ^ btnOptimize; Label^ lblInfo;
		System::UInt32 rngState = 0xA5C39E0Du;

		void WireInvalidate(Control^ c) {
			if (dynamic_cast<NumericUpDown^>(c)) ((NumericUpDown^)c)->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
			else if (dynamic_cast<CheckBox^>(c)) ((CheckBox^)c)->CheckedChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		}

		void InitUI() {
			int y = 10, w = 180, h = 24, pad = 8, currentX = 10;
			Label^ L;

			L = gcnew Label(); L->Text = L"Мин. угол (рад)"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudMinTheta = gcnew NumericUpDown(); nudMinTheta->Location = Point(currentX, y + h + 2); nudMinTheta->Width = w;
			nudMinTheta->DecimalPlaces = 3; nudMinTheta->Minimum = (Decimal)0.01; nudMinTheta->Maximum = (Decimal)3.14159; nudMinTheta->Value = (Decimal)1.5; this->Controls->Add(nudMinTheta);
			WireInvalidate(nudMinTheta);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Базовая длина"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudBaseLength = gcnew NumericUpDown(); nudBaseLength->Location = Point(currentX, y + h + 2); nudBaseLength->Width = w;
			nudBaseLength->DecimalPlaces = 2; nudBaseLength->Minimum = (Decimal)0.50; nudBaseLength->Maximum = (Decimal)2.00; nudBaseLength->Value = (Decimal)1.00; this->Controls->Add(nudBaseLength);
			WireInvalidate(nudBaseLength);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Коэф. растяжения"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudStretchFactor = gcnew NumericUpDown(); nudStretchFactor->Location = Point(currentX, y + h + 2); nudStretchFactor->Width = w;
			nudStretchFactor->DecimalPlaces = 2; nudStretchFactor->Minimum = (Decimal)1.00; nudStretchFactor->Maximum = (Decimal)1.50;
			nudStretchFactor->Increment = (Decimal)0.01; nudStretchFactor->Value = (Decimal)1.50; this->Controls->Add(nudStretchFactor);
			WireInvalidate(nudStretchFactor);

			currentX += w + 20; cbVarLen = gcnew CheckBox();
			cbVarLen->Text = L"Переменные длины"; cbVarLen->Location = Point(currentX, y + h + 2);
			cbVarLen->Width = w; cbVarLen->Checked = false; this->Controls->Add(cbVarLen);
			WireInvalidate(cbVarLen);

			currentX = 10; y += h * 2 + pad + 10;

			L = gcnew Label(); L->Text = L"Цель X"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudTargetX = gcnew NumericUpDown(); nudTargetX->Location = Point(currentX, y + h + 2); nudTargetX->Width = w;
			nudTargetX->DecimalPlaces = 2; nudTargetX->Minimum = (Decimal)-10.0; nudTargetX->Maximum = (Decimal)10.0; nudTargetX->Value = (Decimal)3.5; this->Controls->Add(nudTargetX);
			WireInvalidate(nudTargetX);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Цель Y"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudTargetY = gcnew NumericUpDown(); nudTargetY->Location = Point(currentX, y + h + 2); nudTargetY->Width = w;
			nudTargetY->DecimalPlaces = 2; nudTargetY->Minimum = (Decimal)-10.0; nudTargetY->Maximum = (Decimal)10.0; nudTargetY->Value = (Decimal)1.0; this->Controls->Add(nudTargetY);
			WireInvalidate(nudTargetY);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Глубина"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudLevels = gcnew NumericUpDown(); nudLevels->Location = Point(currentX, y + h + 2); nudLevels->Width = w;
			nudLevels->Minimum = 7; nudLevels->Maximum = 20; nudLevels->Value = 12; this->Controls->Add(nudLevels);
			WireInvalidate(nudLevels);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Надежность (r)"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudR = gcnew NumericUpDown(); nudR->Location = Point(currentX, y + h + 2); nudR->Width = w;
			nudR->DecimalPlaces = 2; nudR->Minimum = (Decimal)1.00; nudR->Maximum = (Decimal)20.00; nudR->Value = (Decimal)2.50; this->Controls->Add(nudR);
			WireInvalidate(nudR);

			currentX += w + 20; cbAdaptive = gcnew CheckBox();
			cbAdaptive->Text = L"Адаптивная"; cbAdaptive->Location = Point(currentX, y + h + 2);
			cbAdaptive->Width = w; cbAdaptive->Checked = true; this->Controls->Add(cbAdaptive);
			WireInvalidate(cbAdaptive);

			y += h * 2 + pad + 10; currentX = 10;

			L = gcnew Label(); L->Text = L"Точность"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudEps = gcnew NumericUpDown(); nudEps->Location = Point(currentX, y + h + 2); nudEps->Width = w;
			nudEps->DecimalPlaces = 9; nudEps->Minimum = (Decimal)0.000000001; nudEps->Maximum = (Decimal)0.1; nudEps->Value = (Decimal)0.00001;
			this->Controls->Add(nudEps);
			WireInvalidate(nudEps);

			currentX += w + 20; L = gcnew Label(); L->Text = L"Макс. итераций"; L->Location = Point(currentX, y); L->Width = w; this->Controls->Add(L);
			nudMaxIter = gcnew NumericUpDown(); nudMaxIter->Location = Point(currentX, y + h + 2); nudMaxIter->Width = w;
			nudMaxIter->Minimum = 10; nudMaxIter->Maximum = 500000; nudMaxIter->Value = 1000; this->Controls->Add(nudMaxIter);
			WireInvalidate(nudMaxIter);

			currentX += 200; btnAdd = gcnew Button(); btnAdd->Text = L"+ Звено"; btnAdd->Location = Point(currentX, y + h + 2); btnAdd->Width = 80;
			btnAdd->Click += gcnew EventHandler(this, &MyForm::OnAddClick); this->Controls->Add(btnAdd);

			currentX += 85; btnRem = gcnew Button(); btnRem->Text = L"- Звено"; btnRem->Location = Point(currentX, y + h + 2); btnRem->Width = 80;
			btnRem->Click += gcnew EventHandler(this, &MyForm::OnRemClick); this->Controls->Add(btnRem);

			currentX += 125; btnOptimize = gcnew Button(); btnOptimize->Text = L"Оптимизировать"; btnOptimize->Location = Point(currentX, y + h + 2); btnOptimize->Width = 120;
			btnOptimize->Click += gcnew EventHandler(this, &MyForm::OnOptimizeClick); this->Controls->Add(btnOptimize);

			currentX += 125; lblInfo = gcnew Label(); lblInfo->Location = Point(currentX, y); lblInfo->Size = System::Drawing::Size(250, 60);
			lblInfo->BorderStyle = BorderStyle::FixedSingle; lblInfo->Text = L"Готов"; this->Controls->Add(lblInfo);
		}

		void ResetRandomConfig() {
			nSegments = 1;
			angles->Clear(); lengths->Clear();
			angles->Add((float)(3.14159265358979323846 / 2.0)); // вверх
			lengths->Add((float)nudBaseLength->Value);
			variableLengths = false;
			this->Invalidate();
		}

		float Rand01() { rngState ^= rngState << 13; rngState ^= rngState >> 17; rngState ^= rngState << 5; return (float)((double)(unsigned)rngState / 4294967296.0); }
		float RandAngle() { return (Rand01() * 6.28318530717958647692f) - 3.14159265358979323846f; }

		System::Void OnResize(System::Object^, System::EventArgs^) { this->Invalidate(); }
		System::Void OnAnyChanged(System::Object^, System::EventArgs^) { this->Invalidate(); }

		System::Void OnAddClick(System::Object^, System::EventArgs^) {
			if (nSegments >= MAX_SEGMENTS) {
				MessageBox::Show(L"Достигнут максимум звеньев (16). Это ограничение алгоритма (dim ≤ 32).", L"Ограничение", MessageBoxButtons::OK, MessageBoxIcon::Information);
				return;
			}
			++nSegments;
			angles->Add(RandAngle());
			lengths->Add((float)nudBaseLength->Value);
			this->Invalidate();
		}

		System::Void OnRemClick(System::Object^, System::EventArgs^) {
			if (nSegments > 1) {
				--nSegments;
				angles->RemoveAt(angles->Count - 1);
				lengths->RemoveAt(lengths->Count - 1);
				this->Invalidate();
			}
		}

		System::Void OnOptimizeClick(System::Object^, System::EventArgs^) {
			if (!fManip) { MessageBox::Show(L"Функция AGP_Manip2D не найдена в DLL.", L"Ошибка", MessageBoxButtons::OK, MessageBoxIcon::Error); return; }

			variableLengths = cbVarLen->Checked;
			// Ограничение по размерности: dim = n + (varLen ? n : 0) ≤ 32  → n ≤ 16
			if (nSegments > MAX_SEGMENTS) {
				MessageBox::Show(L"Слишком много звеньев. Уменьшите до 16 или меньше.", L"Ограничение", MessageBoxButtons::OK, MessageBoxIcon::Warning);
				return;
			}

			float minTheta = (float)nudMinTheta->Value;
			float tx = (float)nudTargetX->Value, ty = (float)nudTargetY->Value;
			int levels = (int)nudLevels->Value, maxIter = (int)nudMaxIter->Value;
			bool adaptive = cbAdaptive->Checked;
			float r_param = (float)nudR->Value, eps = (float)nudEps->Value;
			unsigned int seed = (unsigned)GetTickCount();

			if (pStart) pStart(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed);

			LARGE_INTEGER t0, t1, fq; QueryPerformanceCounter(&t0);
			float* bestQ = nullptr; size_t bestQLen = 0; float bestX = 0, bestY = 0, bestF = 0;
			fManip(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed,
				&bestQ, &bestQLen, &bestX, &bestY, &bestF);
			QueryPerformanceCounter(&t1); QueryPerformanceFrequency(&fq);
			double micros = 1e6 * (double)(t1.QuadPart - t0.QuadPart) / (double)fq.QuadPart;

			if (bestQ && bestQLen >= (size_t)nSegments) {
				angles->Clear();
				for (int i = 0; i < nSegments; ++i) angles->Add(bestQ[i]);

				lengths->Clear();
				if (variableLengths) {
					// проверяем, что пришло достаточно длин для безопасного чтения
					if (bestQLen >= (size_t)(2 * nSegments)) {
						for (int i = 0; i < nSegments; ++i) lengths->Add(bestQ[nSegments + i]);
					}
					else {
						// если нет — используем базовую длину, чтобы не читать за пределы буфера
						for (int i = 0; i < nSegments; ++i) lengths->Add((float)nudBaseLength->Value);
					}
				}
				else {
					for (int i = 0; i < nSegments; ++i) lengths->Add((float)nudBaseLength->Value);
				}
				if (pFree) pFree(bestQ);
			}

			lblInfo->Text = String::Format(L"Результат:\nЦель: {0:F5}\nТочка: ({1:F3}, {2:F3})\nВремя: {3:F0} мкс", bestF, bestX, bestY, micros);
			this->Invalidate();
		}

	protected:
		virtual void OnPaint(PaintEventArgs^ e) override {
			Form::OnPaint(e);
			Graphics^ g = e->Graphics; g->SmoothingMode = System::Drawing::Drawing2D::SmoothingMode::HighQuality; g->Clear(this->BackColor);
			int topOffset = 150; System::Drawing::Rectangle drawArea = System::Drawing::Rectangle(0, topOffset, this->ClientSize.Width, this->ClientSize.Height - topOffset);
			g->FillRectangle(Brushes::White, drawArea); int leftWallX = drawArea.Left + this->ClientSize.Width * 25 / 100;
			Pen^ wallPen = gcnew Pen(Color::Black, 2); g->DrawLine(wallPen, leftWallX, drawArea.Top, leftWallX, drawArea.Bottom);
			HatchBrush^ hatchBrush = gcnew HatchBrush(HatchStyle::BackwardDiagonal, Color::LightGray, Color::White); int leftHatchWidth = 100;
			g->FillRectangle(hatchBrush, leftWallX - leftHatchWidth, drawArea.Top, leftHatchWidth, drawArea.Height);
			float targetX = (float)nudTargetX->Value; float targetY = (float)nudTargetY->Value; float scale = 160.0f; int baseX = leftWallX; int baseY = drawArea.Top + drawArea.Height / 2;
			float pixelTargetX = baseX + targetX * scale; float pixelTargetY = baseY - targetY * scale; int rightWallX = (int)(pixelTargetX + 8); rightWallX = Math::Min(rightWallX, drawArea.Right - 10);
			Pen^ dashedPen = gcnew Pen(Color::Black, 2); dashedPen->DashStyle = DashStyle::Dash; g->DrawLine(dashedPen, rightWallX, drawArea.Top, rightWallX, drawArea.Bottom);
			int rightHatchWidth = leftHatchWidth; g->FillRectangle(hatchBrush, rightWallX, drawArea.Top, rightHatchWidth, drawArea.Height);
			Pen^ targetPen = gcnew Pen(Color::Green, 1.5f); targetPen->DashStyle = DashStyle::Dot; g->DrawEllipse(targetPen, pixelTargetX - 8.0f, pixelTargetY - 8.0f, 16.0f, 16.0f);

			cli::array<PointF>^ pts = gcnew cli::array<PointF>(nSegments + 1);
			pts[0] = PointF((float)baseX, (float)baseY);

			float x = 0.0f, y = 0.0f, phi = 0.0f;
			array<float>^ anglesArray = angles->ToArray();
			array<float>^ lengthsArray = lengths->ToArray();

			for (int i = 0; i < nSegments; ++i) {
				float theta = (i < anglesArray->Length) ? anglesArray[i] : 0.0f;
				float L = (i < lengthsArray->Length) ? lengthsArray[i] : (float)nudBaseLength->Value;
				phi += theta;
				x += L * (float)Math::Cos((double)phi);
				y += L * (float)Math::Sin((double)phi);
				pts[i + 1] = PointF(baseX + x * scale, baseY - y * scale);
			}
			Pen^ penRod = gcnew Pen(Color::Red, 6.0f); for (int i = 0; i < nSegments; ++i) g->DrawLine(penRod, pts[i], pts[i + 1]);
			SolidBrush^ brJoint = gcnew SolidBrush(Color::Blue); const float R = 6.0f; for (int i = 0; i <= nSegments; ++i) g->FillEllipse(brJoint, pts[i].X - R, pts[i].Y - R, 2 * R, 2 * R);
			delete wallPen; delete dashedPen; delete targetPen; delete penRod; delete brJoint; delete hatchBrush;
		}
	};
}

MyForm.cpp

cpp
// MyForm.cpp — точка входа GUI

#include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PWaitWorkers)();
typedef void(__cdecl* PShutdownWorkers)();
typedef void(__cdecl* PFinalizeAll)();

[STAThread]
int main() {
	HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
	if (!h) return -1;

	auto AgpInit             = (PInit)GetProcAddress(h, "AgpInit");
	auto AgpWaitStartAndRun  = (PWaitWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
	auto AgpShutdownWorkers  = (PShutdownWorkers)GetProcAddress(h, "AgpShutdownWorkers");
	auto AgpFinalizeAll      = (PFinalizeAll)GetProcAddress(h, "AgpFinalizeAll");

	const int rank = AgpInit ? AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f) : 0;

	if (!rank) {
		Application::EnableVisualStyles();
		Application::SetCompatibleTextRenderingDefault(false);
		Application::Run(gcnew TESTAGP::MyForm(h));

		// Корректно останавливаем воркеров и финализируем MPI перед выходом
		if (AgpShutdownWorkers) AgpShutdownWorkers();
		if (AgpFinalizeAll)     AgpFinalizeAll();
	}
	else {
		// Воркеры: ждём задания, корректно завершаемся внутри
		if (AgpWaitStartAndRun) AgpWaitStartAndRun();
	}
	return 0;
}

agp_algo.cpp

cpp
// agp_algo.cpp — исправленный DLL
// Важно: сборка C++17/20, /arch:AVX2, /Oi, /O2, /fp:fast желательны.

#include "pch.h"

// ---------------- Макросы и векторная математика (без изменений по смыслу) ----------------

#define XOR_RAND(state, result_var) \
    do { \
        uint32_t s = (state); \
        s ^= s << 13; \
        s ^= s >> 17; \
        s ^= s << 5; \
        (state) = s; \
        float tmp = (float)((double)(s) * (1.0/4294967296.0)); \
        result_var = tmp; \
    } while (0)

#define XOR_RAND_GRSH(state, result_var) \
    do { \
        uint32_t s = (state); \
        s ^= s << 13; \
        s ^= s >> 17; \
        s ^= s << 5; \
        (state) = s; \
        result_var = fmaf((float)(int32_t)s, 0x1.0p-31f, -1.0f); \
    } while (0)

#define FABE13_COS(x, result_var) \
    do { \
        const float _ax_ = fabsf(x); \
        float _r_ = fmodf(_ax_, 6.28318530718f); \
        if (_r_ > 3.14159265359f) \
            _r_ = 6.28318530718f - _r_; \
        if (_r_ < 1.57079632679f) { \
            const float _t2_ = _r_ * _r_; \
            const float _t4_ = _t2_ * _t2_; \
            result_var = fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \
        } else { \
            _r_ = 3.14159265359f - _r_; \
            const float _t2_ = _r_ * _r_; \
            const float _t4_ = _t2_ * _t2_; \
            result_var = -fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \
        } \
    } while (0)

#define FABE13_SIN(x, result_var) \
    do { \
        const float _x_ = (x); \
        const float _ax_ = fabsf(_x_); \
        float _r_ = fmodf(_ax_, 6.28318530718f); \
        bool _sfl_ = _r_ > 3.14159265359f; \
        if (_sfl_) \
            _r_ = 6.28318530718f - _r_; \
        bool _cfl_ = _r_ > 1.57079632679f; \
        if (_cfl_) \
            _r_ = 3.14159265359f - _r_; \
        const float _t2_ = _r_ * _r_; \
        float _s = fmaf(_t2_, fmaf(_t2_, fmaf(_t2_, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * _r_; \
        result_var = ((_x_ < 0.0f) ^ _sfl_) ? -_s : _s; \
    } while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n) \
    do { \
        int i = 0; \
        const int limit = (n) & ~7; \
        if ((n) >= 8) { \
            static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \
            static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \
            static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \
            static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \
            static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \
            static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f); \
            static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \
            static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \
            static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \
            while (i < limit) { \
                const __m256 vx = _mm256_load_ps(&(in)[i]); \
                const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); \
                __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \
                q = _mm256_sub_ps(q, BIAS); \
                const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); \
                const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); \
                const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); \
                const __m256 t2 = _mm256_mul_ps(r2, r2); \
                const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0); \
                const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2); \
                const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \
                const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \
                _mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip)); \
                _mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip)); \
                i += 8; \
            } \
        } \
        while (i < (n)) { \
            const float x = (in)[i]; \
            const float ax = fabsf(x); \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
            q -= 12582912.0f; \
            float r = fmaf(-6.28318530718f, q, ax); \
            const bool sflip = r > 3.14159265359f; \
            if (sflip) \
                r = 6.28318530718f - r; \
            const bool cflip = r > 1.57079632679f; \
            if (cflip) \
                r = 3.14159265359f - r; \
            const float t2 = r * r; \
            const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f); \
            const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r; \
            (cos_out)[i] = cflip ? -c : c; \
            (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
            ++i; \
        } \
    } while (0)

// ---------------- Прочее (Peano/Morton/MPI/TBB) ----------------

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next, dx, dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q, next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
		{ { Right,0u,0u }, { Top,0u,1u }, { Top,1u,1u }, { Left,1u,0u } },
		{ { Left,1u,1u }, { Down,1u,0u }, { Down,0u,0u }, { Right,0u,1u } },
		{ { Down,1u,1u }, { Left,0u,1u }, { Left,0u,0u }, { Top,1u,0u } },
		{ { Top,0u,0u }, { Right,1u,0u }, { Right,1u,1u }, { Down,0u,1u } }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
		{ { 0u,Right }, { 1u,Top }, { 3u,Left }, { 2u,Top } },
		{ { 2u,Down }, { 3u,Right }, { 1u,Down }, { 0u,Left } },
		{ { 2u,Left }, { 1u,Left }, { 3u,Top }, { 0u,Down } },
		{ { 0u,Top }, { 3u,Down }, { 1u,Right }, { 2u,Right } }
};

static const boost::mpi::environment*  g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
	float s_x1, s_x2, e_x1, e_x2, Rtop;
	template<typename Archive> __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};
__declspec(align(16)) struct CtrlMsg final {
	bool kind; CrossMsg xchg;
	template<typename Archive> __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& kind& xchg; }
};

__declspec(align(16)) struct Slab final {
	char* const base; char* current; char* const end;
	__forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};
static tbb::enumerable_thread_specific<Slab*> tls([]() noexcept {
	void* memory = _aligned_malloc(16777216u, 64u);
	Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
	new (slab) Slab(memory, 16777216u);
	char* p = slab->base;
	while (p < slab->end) { *p = 0; p += 4096u; }
	return slab;
});

// ---------------- Параметры 2D карты ----------------

__declspec(align(16)) struct Peano2DMap final {
	const int levels; const float a, b, c, d; const float lenx, leny; const float inv_lenx; const uint32_t scale; const uint8_t start;
	__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
		: levels(L), a(_a), b(_b), c(_c), d(_d),
		lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)),
		scale((uint32_t)1u << (L << 1)), start(st) {}
};
static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

// ---------------- Интервалы ----------------

__declspec(align(64)) struct Interval1D final {
	const float x1, x2, y1, y2, delta_y, ordinate_factor, N_factor, quadratic_term, M; float R;
	__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
	__forceinline Interval1D(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
		: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
		N_factor(_N == 1.0f ? _x2 - _x1 : sqrtf(_x2 - _x1)),
		quadratic_term((1.0f / N_factor)* delta_y* delta_y),
		M((1.0f / N_factor)* fabsf(delta_y)) {}
	__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__declspec(align(64)) struct IntervalND final {
	const float x1, x2, y1, y2, delta_y, ordinate_factor;
	float N_factor, quadratic_term, M, R;
	uint64_t i1, i2; float diam; int span_level;
	__forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
	__forceinline IntervalND(float _x1, float _x2, float _y1, float _y2) noexcept
		: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
		N_factor(0), quadratic_term(0), M(0), R(0), i1(0), i2(0), diam(0), span_level(0) {}
	__forceinline void compute_span_level(const struct MortonND& map) noexcept;
	__forceinline void set_metric(float d_alpha) noexcept { N_factor = d_alpha; quadratic_term = (1.0f / N_factor) * delta_y * delta_y; M = (1.0f / N_factor) * fabsf(delta_y); }
	__forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr1D(const Interval1D* a, const Interval1D* b) noexcept { return a->R < b->R; }
__forceinline bool ComparePtrND(const IntervalND* a, const IntervalND* b) noexcept { return a->R < b->R; }

// ---------------- Массовые пересчёты R ----------------

__forceinline void RecomputeR_ConstM_AVX2_1D(Interval1D* const* arr, size_t n, float m) {
	const __m256 vm = _mm256_set1_ps(m);
	__m256 vinvm = _mm256_rcp_ps(vm);
	vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
	size_t i = 0, limit = n & ~7ull;
	alignas(32) float q[8], nf[8], od[8], out[8];
	for (; i < limit; i += 8) {
		for (int k = 0; k < 8; ++k) { const Interval1D* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
		const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
		_mm256_store_ps(out, res);
		for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
	}
	for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2_1D(Interval1D* const* arr, size_t n, float GF, float alpha) {
	const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
	size_t i = 0, limit = n & ~7ull;
	alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
	for (; i < limit; i += 8) {
		for (int k = 0; k < 8; ++k) {
			const Interval1D* p = arr[i + k];
			ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor;
		}
		const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
		const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
		__m256 vinvm = _mm256_rcp_ps(vm);
		vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
		_mm256_store_ps(out, res);
		for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
	}
	for (; i < n; ++i) {
		const Interval1D* p = arr[i];
		const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
		arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
	}
}
__forceinline void RecomputeR_ConstM_AVX2_ND(IntervalND* const* arr, size_t n, float m) {
	const __m256 vm = _mm256_set1_ps(m);
	__m256 vinvm = _mm256_rcp_ps(vm);
	vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
	size_t i = 0, limit = n & ~7ull;
	alignas(32) float q[8], nf[8], od[8], out[8];
	for (; i < limit; i += 8) {
		for (int k = 0; k < 8; ++k) { const IntervalND* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor; }
		const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
		_mm256_store_ps(out, res);
		for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
	}
	for (; i < n; ++i) arr[i]->ChangeCharacteristic(m);
}
__forceinline void RecomputeR_AffineM_AVX2_ND(IntervalND* const* arr, size_t n, float GF, float alpha) {
	const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
	size_t i = 0, limit = n & ~7ull;
	alignas(32) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
	for (; i < limit; i += 8) {
		for (int k = 0; k < 8; ++k) {
			const IntervalND* p = arr[i + k];
			ln[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; od[k] = p->ordinate_factor;
		}
		const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
		const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
		__m256 vinvm = _mm256_rcp_ps(vm);
		vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
		_mm256_store_ps(out, res);
		for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
	}
	for (; i < n; ++i) {
		const IntervalND* p = arr[i];
		const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
		arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
	}
}

// ---------------- Быстрые степени ----------------

__forceinline float fast_pow_int(float v, int n) {
	float r;
	switch (n) {
	case 3: { float v2 = v * v; r = v2 * v; } break;
	case 4: { float v2 = v * v; r = v2 * v2; } break;
	case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
	case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
	case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
	case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
	case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
	case 10: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
	case 11: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
	case 12: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
	case 13: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
	case 14: { float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
	case 15: { float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
	default: { float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v8; }
	}
	return r;
}

// ---------------- Шаг (без aliasing-трюков) ----------------

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
	const float diff = y2 - y1;
	const float sign_mult = (diff < 0.0f) ? -1.0f : 1.0f;
	if (_N == 1)
		return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
	if (_N == 2)
		return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
	const float invmN = 1.0f / fast_pow_int(_m, _N);
	const float dN = fast_pow_int(fabsf(diff), _N);
	return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

// ---------------- Morton utils ----------------

struct MortonCachePerRank {
	std::vector<int> permCache;
	std::vector<uint64_t> invMaskCache;
	uint32_t baseSeed;
};

static MortonCachePerRank g_mc;

static __forceinline uint64_t gray_encode(uint64_t x) noexcept { return x ^ (x >> 1); }
static __forceinline uint64_t gray_decode(uint64_t g) noexcept { g ^= g >> 32; g ^= g >> 16; g ^= g >> 8; g ^= g >> 4; g ^= g >> 2; g ^= g >> 1; return g; }

// ---------------- MortonND ----------------

struct MortonND final {
	int dim, levels;
	int eff_levels;
	int extra_levels;
	int chunks;
	std::vector<int> chunk_bits;
	std::vector<uint64_t> chunk_bases;
	uint64_t scale;
	std::vector<float> low, high, step, invStep, baseOff;
	std::vector<int> perm;
	std::vector<uint64_t> invMask;
	std::vector<uint64_t> pextMask;
	std::vector<uint64_t> pextMaskChunks;
	float invScaleLevel;
	bool use_gray;

	static inline uint64_t make_mask(int dim, int Lc, int d) {
		uint64_t m = 0ull, bitpos = (uint64_t)d;
		for (int b = 0; b < Lc; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; }
		return m;
	}

	__forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
		: dim(D), levels(L),
		eff_levels((std::max)(1, (int)(63 / (D ? D : 1)))),
		extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
		chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
		low(lows, lows + D), high(highs, highs + D),
		step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f),
		perm(mc.permCache.begin(), mc.permCache.begin() + D),
		invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
		invScaleLevel(1.0f / (float)((uint64_t)1 << L)), use_gray(true) {

		for (int d = 0; d < dim; ++d) {
			float rng = high[d] - low[d];
			float st = rng * invScaleLevel;
			step[d] = st;
			invStep[d] = 1.0f / st;
			baseOff[d] = fmaf(0.5f, st, low[d]);
		}

		chunk_bits.resize(chunks);
		pextMaskChunks.resize((size_t)chunks * (size_t)dim);
		chunk_bases.resize(chunks);
		int remaining = levels;
		for (int c = 0; c < chunks; ++c) {
			int Lc = (c == 0) ? (std::min)(eff_levels, remaining) : (std::min)(eff_levels, remaining);
			chunk_bits[c] = Lc;
			remaining -= Lc;
			uint64_t baseC = (uint64_t)1 << (dim * Lc);
			chunk_bases[c] = baseC;
			for (int d = 0; d < dim; ++d)
				pextMaskChunks[(size_t)c * (size_t)dim + (size_t)d] = make_mask(dim, Lc, d);
		}

		pextMask.resize(dim);
		for (int d = 0; d < dim; ++d)
			pextMask[d] = make_mask(dim, chunk_bits[0], d);

		scale = (uint64_t)1 << (dim * chunk_bits[0]);
	}

	__forceinline float block_diameter(uint64_t i1, uint64_t i2) const noexcept {
		if (i1 > i2) std::swap(i1, i2);
		float s2 = 0.0f;
		for (int d = 0; d < dim; ++d) {
			const int pd = perm[d];
			const uint64_t varying = (i1 ^ i2) & pextMask[d];
			const int nfree_hi = _mm_popcnt_u64(varying);
			const int nfree_total = nfree_hi + (levels - chunk_bits[0]);
			const float range = step[pd] * (ldexpf(1.0f, nfree_total) - 1.0f);
			s2 = fmaf(range, range, s2);
		}
		return sqrtf(s2);
	}

	__forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
		if (t <= 0.0f) t = 0.0f;
		else if (t >= 1.0f) t = 0x1.fffffep-1f;

		uint64_t accBits[32] = { 0ull };
		int accShifted[32] = { 0 };

		double u = (double)t;
		for (int c = 0; c < chunks; ++c) {
			const int Lc = chunk_bits[c];
			const uint64_t baseC = chunk_bases[c];
			double scaled = u * (double)baseC;
			uint64_t idxc = (scaled >= (double)baseC) ? (baseC - 1ull) : (uint64_t)scaled;
			u = scaled - (double)idxc;
			if (use_gray) idxc = gray_encode(idxc);

			int shift_from_top = 0;
			for (int k = 0; k <= c; ++k) shift_from_top += chunk_bits[k];
			int inv_shift = levels - shift_from_top;

			for (int d = 0; d < dim; ++d) {
				int pd = perm[d];
				uint64_t mask = pextMaskChunks[(size_t)c * (size_t)dim + (size_t)d];
				uint64_t bits = _pext_u64(idxc, mask);
				if (inv_shift >= 0) {
					uint64_t invMaskSegment = 0ull;
					if (chunk_bits[c] < 63) {
						uint64_t take = ((uint64_t)1 << chunk_bits[c]) - 1ull;
						invMaskSegment = (invMask[pd] >> inv_shift) & take;
					}
					bits ^= invMaskSegment;
				}
				accBits[pd] = (accBits[pd] << Lc) | bits;
				accShifted[pd] += Lc;
			}
		}
		for (int d = 0; d < dim; ++d)
			out[d] = fmaf(step[d], (float)accBits[d], baseOff[d]);
	}

	__forceinline float pointToT(const float* __restrict q) const noexcept {
		const int bitsFull = levels;
		const int bitsCoarse = chunk_bits[0];
		uint64_t idx0 = 0ull;
		for (int d = 0; d < dim; ++d) {
			int pd = perm[d];
			float v = (q[pd] - baseOff[pd]) * invStep[pd];
			int64_t cell = (int64_t)_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
			if (cell < 0) cell = 0;
			int64_t maxv = ((int64_t)1 << bitsFull) - 1;
			if (cell > maxv) cell = maxv;
			uint64_t b = (uint64_t)cell >> (bitsFull - bitsCoarse);
			uint64_t invMask0 = 0ull;
			if (bitsCoarse < 63) { uint64_t take = ((uint64_t)1 << bitsCoarse) - 1ull; invMask0 = (invMask[pd] >> (bitsFull - bitsCoarse)) & take; }
			b ^= invMask0;
			idx0 |= _pdep_u64(b, pextMask[d]);
		}
		if (use_gray) idx0 = gray_decode(idx0);
		return ((float)idx0 + 0.5f) / (float)scale;
	}
};

__forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
	span_level = 0;
	for (int d = 0; d < map.dim; ++d) {
		uint64_t varying = (i1 ^ i2) & map.pextMask[d];
		span_level += _mm_popcnt_u64(varying);
	}
	span_level += (map.levels - map.chunk_bits[0]) * map.dim;
	span_level = (std::min)(span_level, 11);
}

// ---------------- Стоимость манипулятора ----------------

struct ManipCost final {
	int n; bool variableLen; float targetX, targetY; float minTheta;
	float archBiasW, archBiasK;
	float sharpW;
	ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
		: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta),
		archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f) {}
	__forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
		const float* th = q;
		const float* L = variableLen ? (q + n) : nullptr;
		__declspec(align(64)) float phi[32], s_arr[32], c_arr[32];
		float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f, archPen = 0.0f;

		for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
		FABE13_SINCOS(phi, s_arr, c_arr, n);

		if (variableLen) { for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); } }
		else { for (int i = 0; i < n; ++i) { x = fmaf(1.0f, c_arr[i], x); y = fmaf(1.0f, s_arr[i], y); } }

		for (int i = 0; i < n; ++i) {
			float ai = fabsf(th[i]);
			float v = minTheta - ai;
			if (v > 0.0f) {
				float scale = 2.0f / (minTheta + 1e-6f);
				penC += sharpW * (exp2f(scale * v) - 1.0f);
			}
			float t = -th[i] * archBiasK;
			float sp = (t > 10.f) ? t : log1pf(expf(t));
			archPen += archBiasW * sp;
		}

		float dx = x - targetX, dy = y - targetY;
		float dist = sqrtf(fmaf(dx, dx, dy * dy));
		out_x = x; out_y = y;
		return dist + penC + archPen;
	}
};

// ---------------- Hit-test для 2D карты ----------------

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
	const float a = gActiveMap.a, inv_lenx = gActiveMap.inv_lenx;
	const uint32_t scale = gActiveMap.scale, scale_minus_1 = scale - 1u;
	const float lenx = gActiveMap.lenx, leny = gActiveMap.leny, c = gActiveMap.c;
	const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels;

	float norm = (x_param - a) * inv_lenx;
	norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
	uint32_t idx = (uint32_t)(norm * (float)scale);
	idx = idx > scale_minus_1 ? scale_minus_1 : idx;

	float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1;
	while (l >= 0) {
		const uint32_t q = (idx >> (l * 2)) & 3u;
		const Step s = g_step_tbl[type][q];
		type = s.next; sx *= 0.5f; sy *= 0.5f;
		x1 += s.dx ? sx : 0.0f; x2 += s.dy ? sy : 0.0f; --l;
	}
	out_x1 = x1 + sx * 0.5f; out_x2 = x2 + sy * 0.5f;
}
__forceinline float FindX2D_analytic(float px, float py) {
	const float a = gActiveMap.a, b = gActiveMap.b, c = gActiveMap.c, d = gActiveMap.d;
	const float lenx = gActiveMap.lenx, leny = gActiveMap.leny; const uint32_t scale = gActiveMap.scale;
	const uint8_t start = gActiveMap.start; const int levels = gActiveMap.levels;
	const float clamped_px = fminf(fmaxf(px, a), b), clamped_py = fminf(fmaxf(py, c), d);
	float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0;
	while (l < levels) {
		sx *= 0.5f; sy *= 0.5f; const float mx = x0 + sx, my = y0 + sy;
		const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
		const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
		const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
		const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
		const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
		const InvStep inv = g_inv_tbl[type][dd];
		type = inv.next; idx = (idx << 2) | inv.q;
		const uint32_t dx = dd >> 1, dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l;
	}
	const float scale_recip = 1.0f / (float)scale;
	return fmaf((float)idx * scale_recip, lenx, a);
}

// ---------------- MPI-структуры ----------------

__declspec(align(16)) struct MultiCrossMsg final {
	float intervals[15]; uint8_t count;
	template<typename Archive> __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& intervals& count; }
};
__declspec(align(16)) struct BestSolutionMsg final {
	float bestF, bestX, bestY, bestQ[32]; uint8_t dim;
	template<typename Archive> __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar& bestF& bestX& bestY& bestQ& dim; }
};
__declspec(align(16)) struct CtrlMsgND final {
	uint8_t kind; CrossMsg xchg; MultiCrossMsg multiXchg; BestSolutionMsg bestSol;
	template<typename Archive> __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
		ar& kind; if (kind == 1) ar& xchg; else if (kind == 2) ar& multiXchg; else if (kind == 3) ar& bestSol;
	}
};

// ---------------- Генерация сидов (исправлено, без переполнений) ----------------

static __forceinline int generate_lhs_seeds_lite(const MortonND& map, const int dim,
	float* __restrict S, int stride, uint32_t seed, int max_out)
{
	if (dim <= 0 || max_out <= 0) return 0;

	// Кол-во LHS точек не больше max_out
	const int lhs_count = max_out;

	std::vector<uint32_t> A(dim), B(dim);
	uint32_t st = seed;
	auto next = [&]() -> uint32_t { st ^= st << 13; st ^= st >> 17; st ^= st << 5; return st; };

	// Для каждой размерности берём линейную перестановку s -> (A*s + B) mod lhs_count (A нечётное → биекция)
	for (int d = 0; d < dim; ++d) {
		uint32_t a = (next() | 1u) % (uint32_t)lhs_count; if (a == 0) a = 1;
		uint32_t b = next() % (uint32_t)lhs_count;
		A[d] = a; B[d] = b;
	}

	for (int sidx = 0; sidx < lhs_count; ++sidx) {
		for (int d = 0; d < dim; ++d) {
			float u = (next() & 0xFFFFFF) * 5.9604645e-8f;
			uint32_t stratum = (A[d] * (uint32_t)sidx + B[d]) % (uint32_t)lhs_count;
			float pos = ((float)stratum + u) / (float)lhs_count;
			int pd = map.perm[d];
			float lo = map.low[pd], hi = map.high[pd];
			S[sidx * stride + d] = fmaf(pos, (hi - lo), lo);
		}
	}
	return lhs_count;
}

static __forceinline int generate_heuristic_seeds(const ManipCost& cost, const MortonND& map,
	int dim, float* __restrict S, int stride, uint32_t seed, int max_total)
{
	int total = 0;
	const int n = cost.n;
	const bool VL = cost.variableLen;

	auto add_seed = [&](const std::function<void(float*)>& fill) {
		if (total >= max_total) return false;
		float* s = S + total * stride;
		for (int i = 0; i < stride; ++i) s[i] = 0.0f;
		fill(s);
		total++;
		return true;
	};

	// 1) В сторону цели
	add_seed([&](float* s0) {
		float phi = atan2f(cost.targetY, cost.targetX);
		float rho = sqrtf(fmaf(cost.targetX, cost.targetX, cost.targetY * cost.targetY));
		float len = fminf(fmaxf(rho / (float)n, 0.5f), 2.0f);
		for (int i = 0; i < n; ++i) s0[i] = phi / (float)n;
		if (VL) for (int i = 0; i < n; ++i) s0[n + i] = len;
	});

	// 2) Попеременно
	add_seed([&](float* s1) {
		float phi = atan2f(cost.targetY, cost.targetX);
		for (int i = 0; i < n; ++i) s1[i] = 0.5f * phi * ((i & 1) ? -1.0f : 1.0f);
		if (VL) for (int i = 0; i < n; ++i) s1[n + i] = 1.0f * (0.8f + 0.4f * (float)i / (float)n);
	});

	// 3) Затухающая линейка
	add_seed([&](float* s2) {
		const float inv = (n > 1) ? 1.0f / (float)(n - 1) : 0.0f;
		float phi = atan2f(cost.targetY, cost.targetX);
		for (int i = 0; i < n; ++i) { float pr = (float)i * inv; s2[i] = phi * (1.0f - 0.3f * pr); }
		if (VL) { for (int i = 0; i < n; ++i) { float si = sinf(1.5f * (float)i); s2[n + i] = (1.0f + 0.2f * si); } }
	});

	// 4) LHS-лайт (забиваем остаток ёмкости)
	if (total < max_total) {
		int lhs_cap = max_total - total;
		int got = generate_lhs_seeds_lite(map, dim, S + total * stride, stride, seed, lhs_cap);
		total += got;
	}
	return total;
}

// ---------------- Ветвитель ND ----------------

static __forceinline void agp_run_branch_mpi(
	const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed,
	std::vector<IntervalND*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY, float M_prior = 1e-3f)
{
	const int n = cost.n;
	const int dim = n + (cost.variableLen ? n : 0);

	alignas(32) float M_by_span[12]; for (int i = 0; i < 12; ++i) M_by_span[i] = M_prior;
	float Mmax = M_prior;

	alignas(64) float q_local[32], phi[32], s_arr[32], c_arr[32], sum_s[32], sum_c[32], q_try[32];
	bestQ.reserve(dim);
	float x = 0.0f, y = 0.0f;
	int no_improve = 0;

	auto t_to_idx = [&](float t) -> uint64_t {
		float tt = (t <= 0.0f) ? 0.0f : (t >= 1.0f ? std::nextafterf(1.0f, 0.0f) : t);
		uint64_t idx = (uint64_t)((double)tt * (double)map.scale);
		if (idx >= map.scale) idx = map.scale - 1ull;
		return idx;
	};
	auto update_pockets_and_Mmax = [&](IntervalND* I) {
		const int k = I->span_level; if (I->M > M_by_span[k]) M_by_span[k] = I->M; if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
	};

	float a = 0.0f, b = 1.0f;

	auto evalAt = [&](float t) -> float {
		map.map01ToPoint(t, q_local);
		float f = cost(q_local, x, y);

		if (f < bestF * 1.25f) {
			float acc = 0.0f; for (int i = 0; i < n; ++i) { acc += q_local[i]; phi[i] = acc; }
			FABE13_SINCOS(phi, s_arr, c_arr, n);
			float as = 0.0f, ac = 0.0f;
			for (int k = n - 1; k >= 0; --k) {
				const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
				as += Lk * s_arr[k]; ac += Lk * c_arr[k]; sum_s[k] = as; sum_c[k] = ac;
			}
			const float dx = x - cost.targetX, dy = y - cost.targetY;
			float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1e-8f;

			float eta = 0.125f;
			for (int stepI = 0; stepI < 3; ++stepI) {
				for (int i = 0; i < n; ++i) {
					float gpen = 0.0f;
					{
						float ai = fabsf(q_local[i]);
						float v = cost.minTheta - ai;
						if (v > 0.0f) {
							float scale = 2.0f / (cost.minTheta + 1e-6f);
							float e = exp2f(scale * v);
							float dpen_dtheta = cost.sharpW * (e * 0.69314718055994530941f * scale) * (-copysignf(1.0f, q_local[i]));
							gpen += dpen_dtheta;
						}
					}
					{
						float tsg = -q_local[i] * cost.archBiasK;
						float sig = 1.0f / (1.0f + expf(-tsg));
						gpen += -(cost.archBiasW * cost.archBiasK) * sig;
					}

					float g = (dx * (-sum_s[i]) + dy * (sum_c[i])) / dist + gpen;
					q_try[i] = q_local[i] - eta * g;

					const float deg2rad = 3.14159265358979323846f / 180.0f;
					const float lo0 = -60.0f * deg2rad, hi0 = 150.0f * deg2rad;
					const float lo = -150.0f * deg2rad, hi = 150.0f * deg2rad;
					const float Lb = (i == 0) ? lo0 : lo;
					const float Hb = (i == 0) ? hi0 : hi;
					if (q_try[i] < Lb) q_try[i] = Lb; else if (q_try[i] > Hb) q_try[i] = Hb;
				}
				if (cost.variableLen) {
					for (int i = 0; i < n; ++i) {
						float g = (dx * c_arr[i] + dy * s_arr[i]) / dist;
						float v = q_local[n + i] - eta * g;
						if (v < 0.5f) v = 0.5f; else if (v > 2.0f) v = 2.0f;
						q_try[n + i] = v;
					}
				}
				float x2, y2; float f2 = cost(q_try, x2, y2);
				if (f2 < f) { memcpy(q_local, q_try, dim * sizeof(float)); f = f2; x = x2; y = y2; break; }
				eta *= 0.5f;
			}

			const int last = n - 1;
			const float deg2rad = 3.14159265358979323846f / 180.0f;
			const float lo = (last == 0) ? (-60.0f * deg2rad) : (-150.0f * deg2rad);
			const float hi = 150.0f * deg2rad;
			float bestLocF = f;
			float saved = q_local[last];
			for (float delta = 0.05f; delta >= 0.00625f; delta *= 0.5f) {
				for (int sgn = -1; sgn <= 1; sgn += 2) {
					float cand = saved + sgn * delta;
					if (cand < lo) cand = lo; else if (cand > hi) cand = hi;
					float backup = q_local[last]; q_local[last] = cand;
					float x2, y2; float f2 = cost(q_local, x2, y2);
					if (f2 < bestLocF) { bestLocF = f2; x = x2; y = y2; saved = cand; }
					q_local[last] = backup;
				}
			}
			if (bestLocF < f) { q_local[last] = saved; f = bestLocF; }
		}

		if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; }
		else { ++no_improve; }
		return f;
	};

	float f_a = evalAt(a), f_b = evalAt(b);
	const int K = (std::min)((std::max)(2 * dim, 8), 128);

	H.reserve((size_t)maxIter + K + 16);
	const int rank = g_world->rank();
	const int world = g_world->size();

	const int MAX_SEEDS = 128;
	std::vector<float> seeds((size_t)MAX_SEEDS * 32u, 0.0f);
	const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds.data(), 32, seed + rank * 7919u, MAX_SEEDS);

	for (int i = 0; i < seedCnt; ++i) {
		const float* s = seeds.data() + i * 32;
		float t_seed = map.pointToT(s);
		const int denom = (std::max)(1, seedCnt - 4);
		float interval_size = (i < 3) ? (0.0004f * (float)dim)
			: (0.00031f * (float)dim) * exp2f((1.0f / (float)denom) * log2f(0.00025f / 0.00031f) * (float)(i - 3));
		float t1 = fmaxf(a, t_seed - interval_size), t2 = fminf(b, t_seed + interval_size);
		if (t2 <= t1) continue;
		alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
		map.map01ToPoint(t1, q1); float f1 = cost(q1, x1, y1);
		map.map01ToPoint(t2, q2); float f2 = cost(q2, x2, y2);
		IntervalND* I = new IntervalND(t1, t2, f1, f2);
		I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2);
		I->diam = map.block_diameter(I->i1, I->i2);
		I->compute_span_level(map);
		I->set_metric(I->diam);
		update_pockets_and_Mmax(I);
		I->ChangeCharacteristic(r * Mmax);
		if (i < 3) I->R *= fmaf(0.01f, (float)dim, 0.85f);
		else {
			float start_mult = 0.214f * (float)dim;
			float end_mult = 0.174f * (float)dim;
			float mult = start_mult * exp2f((1.0f / (float)denom) * log2f(end_mult / start_mult) * (float)(i - 3));
			I->R *= mult;
		}
		H.emplace_back(I);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		if (f1 < bestF) { bestF = f1; bestQ.assign(q1, q1 + dim); bestX = x1; bestY = y1; }
		if (f2 < bestF) { bestF = f2; bestQ.assign(q2, q2 + dim); bestX = x2; bestY = y2; }
	}

	float prev_t = a, prev_f = f_a;
	for (int k = 1; k <= K; ++k) {
		float t = a + (b - a) * ((float)k / (K + 1)) + (float)rank / (float)(world * (K + 1));
		float f = evalAt(t);
		IntervalND* I = new IntervalND(prev_t, t, prev_f, f);
		I->i1 = t_to_idx(prev_t); I->i2 = t_to_idx(t);
		I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map); I->set_metric(I->diam);
		update_pockets_and_Mmax(I); I->ChangeCharacteristic(r * Mmax);
		H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
		prev_t = t; prev_f = f;
	}
	IntervalND* tail = new IntervalND(prev_t, b, prev_f, f_b);
	tail->i1 = t_to_idx(prev_t); tail->i2 = t_to_idx(b);
	tail->diam = map.block_diameter(tail->i1, tail->i2); tail->compute_span_level(map); tail->set_metric(tail->diam);
	update_pockets_and_Mmax(tail); tail->ChangeCharacteristic(r * Mmax);
	H.emplace_back(tail); std::push_heap(H.begin(), H.end(), ComparePtrND);

	float dmax = b - a, initial_len = dmax, thr03 = 0.3f * initial_len, inv_thr03 = 1.0f / thr03;
	int it = 0;

	auto kickEveryByDim = [&](int dim) -> int {
		float z = 120.0f * exp2f(-0.05f * (float)dim);
		if (z < 60.0f) z = 60.0f;
		return (int)z;
	};
	auto noImproveThrByDim = [&](int dim) -> int {
		float z = 80.0f * exp2f(-0.08f * (float)dim);
		if (z < 30.0f) z = 30.0f;
		return (int)z;
	};

	while (it < maxIter) {
		if ((it % kickEveryByDim(dim)) == 0 && no_improve > noImproveThrByDim(dim) && !bestQ.empty()) {
			float t_best = map.pointToT(bestQ.data());
			for (int i = 0; i < 2; ++i) {
				float off = (i == 0) ? 0.01f : -0.01f;
				float t_seed = fminf(b, fmaxf(a, t_best + off));
				float f_seed = evalAt(t_seed);
				IntervalND* J = new IntervalND(t_seed - 0.005f, t_seed + 0.005f, f_seed, f_seed);
				J->i1 = t_to_idx(t_seed - 0.005f); J->i2 = t_to_idx(t_seed + 0.005f);
				J->diam = map.block_diameter(J->i1, J->i2); J->compute_span_level(map); J->set_metric(J->diam);
				update_pockets_and_Mmax(J);
				J->ChangeCharacteristic(r * Mmax); J->R *= 0.9f;
				H.emplace_back(J); std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
			no_improve = 0;
		}

		const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
		bool   stagnation = (no_improve > 100) && (it > 270);

		float A = 200.0f + 64.0f * exp2f(-0.06f * (float)dim);
		float B = 210.0f + 67.0f * exp2f(-0.06f * (float)dim);
		const int T = (int)fmaf(-expm1f(p), A, B);

		float  r_eff = fmaxf(1.0f, r * (0.7f + 0.3f * (1.0f - p)));

		std::pop_heap(H.begin(), H.end(), ComparePtrND);
		IntervalND* cur = H.back(); H.pop_back();

		const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
		float m = r_eff * Mmax;
		float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r);
		tNew = fminf(fmaxf(tNew, a), b);
		float fNew = evalAt(tNew);

		IntervalND* L = new IntervalND(x1, tNew, y1, fNew);
		IntervalND* Rv = new IntervalND(tNew, x2, fNew, y2);

		L->i1 = t_to_idx(x1); L->i2 = t_to_idx(tNew);
		Rv->i1 = t_to_idx(tNew); Rv->i2 = t_to_idx(x2);
		L->diam = map.block_diameter(L->i1, L->i2); Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
		L->compute_span_level(map); Rv->compute_span_level(map);
		L->set_metric(L->diam); Rv->set_metric(Rv->diam);

		float Mloc = (std::max)(L->M, Rv->M);
		update_pockets_and_Mmax(L); update_pockets_and_Mmax(Rv);

		const float prevMmax = Mmax; if (Mloc > Mmax) Mmax = Mloc;
		m = r_eff * Mmax;

		if (adaptive) {
			float len1 = tNew - x1, len2 = x2 - tNew;
			if (len1 + len2 == dmax) {
				dmax = (std::max)(len1, len2);
				for (auto pI : H) { float Ls = pI->x2 - pI->x1; if (Ls > dmax) dmax = Ls; }
			}
			if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
				const float progress = fmaf(-dmax, inv_thr03, 1.0f);
				const float alpha = progress * progress;
				const float beta = fmaf(-alpha, 1.0f, 2.0f);
				const float MULT = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULT, r_eff, -MULT);
				const float GF = fmaf(beta, global_coeff, 0.0f);
				L->ChangeCharacteristic(fmaf(GF, len1, L->M * alpha));
				Rv->ChangeCharacteristic(fmaf(GF, len2, Rv->M * alpha));
				size_t sz = H.size();
				RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
			else {
				if (Mloc > prevMmax) {
					L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
					if (Mloc > 1.15f * prevMmax) {
						size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
				else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
			}
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m);
				if (Mloc > 1.15f * prevMmax) { size_t sz = H.size(); RecomputeR_ConstM_AVX2_ND(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtrND); }
			}
			else { L->ChangeCharacteristic(m); Rv->ChangeCharacteristic(m); }
		}

		H.push_back(L);  std::push_heap(H.begin(), H.end(), ComparePtrND);
		H.push_back(Rv); std::push_heap(H.begin(), H.end(), ComparePtrND);
		if (!H.empty()) {
			_mm_prefetch((const char*)H[0], _MM_HINT_T0);
			if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0);
		}

		IntervalND* top = H.front();
		float interval_len = top->x2 - top->x1;

		bool want_term = (exp2f(log2f(interval_len) / (float)dim) < eps) || (it == maxIter - 1);
		if (!(it % T) || want_term) {
			CtrlMsgND out; out.kind = want_term ? 0 : 2;
			if (!want_term) {
				uint8_t cnt = (uint8_t)((H.size() >= 3) ? 3 : H.size());
				out.multiXchg.count = cnt;
				float* dest = out.multiXchg.intervals;
				IntervalND* t1 = H[0];
				IntervalND* t2 = (H.size() > 1 ? H[1] : H[0]);
				IntervalND* t3 = (H.size() > 2 ? H[2] : H[H.size() - 1]);
				IntervalND* tops[3] = { t1, t2, t3 };
				for (uint8_t i2 = 0; i2 < cnt; ++i2) {
					IntervalND* Tt = tops[i2];
					dest[0] = Tt->x1; dest[1] = 0.0f; dest[2] = Tt->x2; dest[3] = 0.0f; dest[4] = Tt->R;
					dest += 5;
				}
			}
			for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
			if (want_term) break;
		}

		if (!(it % 500) && !bestQ.empty()) {
			CtrlMsgND out; out.kind = 3; out.bestSol.bestF = bestF; out.bestSol.bestX = bestX; out.bestSol.bestY = bestY;
			out.bestSol.dim = (uint8_t)bestQ.size(); memcpy(out.bestSol.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
			for (int i2 = 0; i2 < world; ++i2) if (i2 != rank) g_world->isend(i2, 0, out);
		}

		while (g_world->iprobe(boost::mpi::any_source, 0)) {
			CtrlMsgND in; g_world->recv(boost::mpi::any_source, 0, in);
			if (in.kind == 0) { if (!rank) break; else return; }
			else if (in.kind == 1) {
				float sx = in.xchg.s_x1, ex = in.xchg.e_x1;
				if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f;
				if (ex > sx) {
					alignas(64) float tmp[32]; float tx, ty;
					map.map01ToPoint(sx, tmp); float y1i = cost(tmp, tx, ty);
					map.map01ToPoint(ex, tmp); float y2i = cost(tmp, tx, ty);
					IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
					inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex);
					inj->diam = map.block_diameter(inj->i1, inj->i2);
					inj->compute_span_level(map); inj->set_metric(inj->diam);
					update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(r * Mmax);
					if (!H.empty()) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0); }
					IntervalND* topH = H.front();
					if (inj->R > 1.15f * topH->R) {
						float p2 = fmaf(-1.0f / initial_len, dmax, 1.0f);
						float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p2), 0.3f)
							: fmaf(0.3491860241215959f, expm1f(p2), 0.6f);
						inj->R = in.xchg.Rtop * k;
						H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
				}
			}
			else if (in.kind == 2) {
				const MultiCrossMsg& mX = in.multiXchg;
				for (uint8_t ii = 0; ii < mX.count; ++ii) {
					const float* d = &mX.intervals[ii * 5];
					float sx = d[0], ex = d[2];
					if (sx < 0.0f) sx = 0.0f; if (ex > 1.0f) ex = 1.0f;
					if (ex > sx) {
						alignas(64) float tmp[32]; float tx, ty;
						map.map01ToPoint(sx, tmp); float y1i = cost(tmp, tx, ty);
						map.map01ToPoint(ex, tmp); float y2i = cost(tmp, tx, ty);
						IntervalND* inj = new IntervalND(sx, ex, y1i, y2i);
						inj->i1 = t_to_idx(sx); inj->i2 = t_to_idx(ex);
						inj->diam = map.block_diameter(inj->i1, inj->i2);
						inj->compute_span_level(map); inj->set_metric(inj->diam);
						update_pockets_and_Mmax(inj); inj->ChangeCharacteristic(r * Mmax);
						if (!H.empty()) { _mm_prefetch((const char*)H[0], _MM_HINT_T0); if (H.size() > 1) _mm_prefetch((const char*)H[1], _MM_HINT_T0); }
						IntervalND* topH = H.front();
						if (inj->R > 1.15f * topH->R) {
							float p2 = fmaf(-1.0f / initial_len, dmax, 1.0f);
							float k = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p2), 0.3f)
								: fmaf(0.3491860241215959f, expm1f(p2), 0.6f);
							inj->R = d[4] * k;
							H.emplace_back(inj); std::push_heap(H.begin(), H.end(), ComparePtrND);
						}
					}
				}
			}
			else if (in.kind == 3) {
				const BestSolutionMsg& bm = in.bestSol;
				if (bm.bestF < bestF * 1.15f) {
					alignas(64) float tmp_q[32]; memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
					float t_best = map.pointToT(tmp_q);
					float t1 = fmaxf(a, t_best - 0.001f), t2 = fminf(b, t_best + 0.001f);
					if (t2 > t1) {
						alignas(64) float tq1[32], tq2[32]; float xx1, yy1, xx2, yy2;
						map.map01ToPoint(t1, tq1); float f1 = cost(tq1, xx1, yy1);
						map.map01ToPoint(t2, tq2); float f2 = cost(tq2, xx2, yy2);
						IntervalND* I = new IntervalND(t1, t2, f1, f2);
						I->i1 = t_to_idx(t1); I->i2 = t_to_idx(t2);
						I->diam = map.block_diameter(I->i1, I->i2); I->compute_span_level(map);
						I->set_metric(I->diam); update_pockets_and_Mmax(I);
						I->ChangeCharacteristic(r * Mmax); I->R *= 0.90f;
						H.emplace_back(I); std::push_heap(H.begin(), H.end(), ComparePtrND);
					}
					if (bm.bestF < bestF) { bestF = bm.bestF; bestX = bm.bestX; bestY = bm.bestY; bestQ.assign(bm.bestQ, bm.bestQ + bm.dim); }
				}
			}
		}
		++it;
	}
}

struct BestPacket {
	float bestF; int dim; float bestX; float bestY;
	template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& bestF& dim& bestX& bestY; }
};

// ---------------- Экспортируемые функции DLL ----------------

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY,
	int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon,
	unsigned int seed, float** out_bestQ, size_t* out_bestQLen, float* out_bestX,
	float* out_bestY, float* out_bestF)
{
	// Сбрасываем slab перед началом — предотвращает накопление и выход за границы
	{ Slab* slab = tls.local(); slab->current = slab->base; }

	const int MAX_DIM = 32;
	const int dim = nSegments + (variableLengths ? nSegments : 0);

	// Перестановка/инверсии
	g_mc.permCache.resize(dim); for (int i = 0; i < dim; ++i) g_mc.permCache[i] = i;
	uint32_t sseed = g_mc.baseSeed;
	for (int i = dim - 1; i > 0; --i) { sseed ^= sseed << 13; sseed ^= sseed >> 17; sseed ^= sseed << 5; uint32_t j = sseed % (uint32_t)(i + 1); std::swap(g_mc.permCache[i], g_mc.permCache[j]); }
	g_mc.invMaskCache.resize(dim); for (int k = 0; k < dim; ++k) { sseed ^= sseed << 13; sseed ^= sseed >> 17; sseed ^= sseed << 5; g_mc.invMaskCache[k] = (uint64_t)sseed; }

	const float deg2rad = 3.14159265358979323846f / 180.0f;
	const float theta0Min = -60.0f * deg2rad, theta0Max = 150.0f * deg2rad;
	const float thetaMin  = -150.0f * deg2rad, thetaMax  = 150.0f * deg2rad;
	const float lenMin = 0.5f, lenMax = 2.0f;

	std::vector<float> low;  low.reserve(dim);
	std::vector<float> high; high.reserve(dim);
	for (int i = 0; i < nSegments; ++i) { low.push_back(i == 0 ? theta0Min : thetaMin); high.push_back(i == 0 ? theta0Max : thetaMax); }
	if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }

	ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);

	const int rank = g_world->rank(), world = g_world->size();
	std::vector<float> bestQ; float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;

	// Если размерность превышает возможности протокола (пакет bestQ[32]), работаем в "пустом" режиме,
	// но корректно синхронизируем MPI (воркеры всё равно посылают пакеты).
	bool over_dim = (dim > MAX_DIM);

	if (!over_dim) {
		const int levels0 = (std::min)(peanoLevels, 8);
		const int maxIter0 = (int)(maxIterPerBranch * 0.2f);
		MortonND map0(dim, levels0, low.data(), high.data(), g_mc);

		std::vector<IntervalND*> H_coarse; std::vector<float> bestQ_coarse;
		float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;

		float M_prior = (2.0f * nSegments) * (1.0f / (float)(1u << levels0));
		if (variableLengths) M_prior += 1.41421356237f * (1.0f / (float)(1u << levels0));

		agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed,
			H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, M_prior);

		if (bestF_coarse < bestF) { bestF = bestF_coarse; bestQ = bestQ_coarse; bx = bx_coarse; by = by_coarse; }

		if (levels0 < peanoLevels) {
			MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
			std::vector<IntervalND*> H_fine;
			std::vector<float> bestQ_fine = bestQ;
			float bestF_fine = bestF, bx_fine = bx, by_fine = by;

			float M_prior_fine = (2.0f * nSegments) * (1.0f / (float)(1u << peanoLevels));
			if (variableLengths) M_prior_fine += 1.41421356237f * (1.0f / (float)(1u << peanoLevels));

			if (!H_coarse.empty()) {
				std::sort(H_coarse.begin(), H_coarse.end(), [](const IntervalND* a, const IntervalND* b) { return a->R < b->R; });
				const size_t topCount = (size_t)(H_coarse.size() * 0.3f);
				auto t_to_idx_fine = [&](float t) -> uint64_t {
					float tt = (t <= 0.0f) ? 0.0f : (t >= 1.0f ? std::nextafterf(1.0f, 0.0f) : t);
					uint64_t idx = (uint64_t)((double)tt * (double)map1.scale);
					if (idx >= map1.scale) idx = map1.scale - 1ull;
					return idx;
				};
				for (size_t i = 0; i < topCount && i < H_coarse.size(); ++i) {
					const IntervalND* C = H_coarse[i];
					alignas(64) float q1[32], q2[32]; float x1, y1, x2, y2;
					map1.map01ToPoint(C->x1, q1); float f1 = cost(q1, x1, y1);
					map1.map01ToPoint(C->x2, q2); float f2 = cost(q2, x2, y2);
					IntervalND* I = new IntervalND(C->x1, C->x2, f1, f2);
					I->i1 = t_to_idx_fine(C->x1); I->i2 = t_to_idx_fine(C->x2);
					I->diam = map1.block_diameter(I->i1, I->i2);
					I->set_metric(I->diam);
					H_fine.push_back(I);
					if (f1 < bestF_fine) { bestF_fine = f1; bestQ_fine.assign(q1, q1 + dim); bx_fine = x1; by_fine = y1; }
					if (f2 < bestF_fine) { bestF_fine = f2; bestQ_fine.assign(q2, q2 + dim); bx_fine = x2; by_fine = y2; }
				}
				std::make_heap(H_fine.begin(), H_fine.end(), ComparePtrND);
			}

			agp_run_branch_mpi(map1, cost, maxIterPerBranch - maxIter0, r, adaptiveMode, epsilon, seed,
				H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, M_prior_fine);

			if (bestF_fine < bestF) { bestF = bestF_fine; bestQ = bestQ_fine; bx = bx_fine; by = by_fine; }
		}
	}

	BestPacket me{ bestF, dim, bx, by };
	if (!rank) {
		std::vector<float> winnerQ = bestQ;
		float winF = bestF, wx = bx, wy = by;
		for (int i = 1; i < world; ++i) {
			BestPacket bp; g_world->recv(i, 2, bp);
			std::vector<float> qin; g_world->recv(i, 3, qin);
			if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; }
		}
		if (over_dim) { // Ничего не отдаём в UI — защитимся от переполнений
			*out_bestQLen = 0;
			*out_bestQ = nullptr;
			*out_bestX = 0; *out_bestY = 0; *out_bestF = FLT_MAX;
			return;
		}
		*out_bestQLen = winnerQ.size();
		*out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
		memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen));
		*out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
	}
	else {
		g_world->send(0, 2, me);
		g_world->send(0, 3, bestQ);
	}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
	g_env  = new boost::mpi::environment();
	g_world = new boost::mpi::communicator();
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
	const int rank = g_world->rank();
	const int world_size = g_world->size();
	if (world_size == 4) { new (&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3); }
	// Надёжный хэш-сид
	g_mc.baseSeed = (uint32_t)(0x9E3779B9u ^ (0x85EBCA6Bu * (uint32_t)rank + 0xC2B2AE35u));
	return rank;
}

// Тестовые функции (без изменений)

__forceinline float ShekelFunc(float x, float seed) {
	int i = 0; float st = seed, r1, r2, res = 0.0f;
	while (i < 10) {
		XOR_RAND(st, r1); float xp = fmaf(-r1, 10.0f, x); XOR_RAND(st, r1); XOR_RAND(st, r2);
		float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f)); d = copysignf(fmaxf(fabsf(d), FLT_MIN), d); res -= 1.0f / d; ++i;
	}
	return res;
}
__forceinline float RastriginFunc(float x1, float x2) {
	const float t = fmaf(x1, x1, x2 * x2); float c1, c2;
	FABE13_COS(6.28318530717958647692f * x1, c1); FABE13_COS(6.28318530717958647692f * x2, c2);
	return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}
__forceinline float HillFunc(float x, float seed) {
	int j = 0; __declspec(align(32)) float ang[14u];
	float st = 6.28318530717958647692f * x; while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
	__declspec(align(32)) float sv[14u], cv[14u]; FABE13_SINCOS(ang, sv, cv, 14u);
	float state = seed, r1, r2; XOR_RAND(state, r1); float res = fmaf(r1, 2.0f, -1.1f); --j;
	while (j >= 0) { XOR_RAND(state, r1); XOR_RAND(state, r2); res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]); --j; }
	return res;
}
__forceinline float GrishaginFunc(float x1, float x2, float seed) {
	int j = 0; __declspec(align(32)) float aj[8u], ak[8u];
	while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
	__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u]; FABE13_SINCOS(aj, sj, cj, 8u); FABE13_SINCOS(ak, sk, ck, 8u);
	--j; float p1 = 0.0f, p2 = 0.0f; float st = seed, r1, r2;
	while (j >= 0) {
		size_t k = 0u; while (k < 8u) {
			float s = sj[j] * sj[j]; float c = ck[k] * ck[k];
			XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p1 = fmaf(r1, s, fmaf(r2, c, p1));
			XOR_RAND_GRSH(st, r1); XOR_RAND_GRSH(st, r2); p2 = fmaf(-r1, c, fmaf(r2, s, p2)); ++k;
		}
		--j;
	}
	return -sqrtf(fmaf(p1, p1, p2 * p2));
}

// 1D/2D API (как были)

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed,
	float** out_data, size_t* out_len) {
	Slab* slab = tls.local(); slab->current = slab->base; int schetchick = 0;
	const float initial_length = b - a; float dmax = initial_length;
	const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
	const float start_val = ShekelFunc(a, seed); float best_f = ShekelFunc(b, seed);
	float x_Rmax_1 = a, x_Rmax_2 = b; float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
	std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
	std::vector<Interval1D*, boost::alignment::aligned_allocator<Interval1D*, 64u>> R;
	Extr.reserve((size_t)global_iterations << 2u); R.reserve((size_t)global_iterations << 1u);
	R.emplace_back(new Interval1D(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax;

	while (true) {
		const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
		const float new_value = ShekelFunc(new_point, seed);
		if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
		std::pop_heap(R.begin(), R.end(), ComparePtr1D);
		const Interval1D* pro = R.back();
		const float new_x1 = pro->x1, new_x2 = pro->x2;
		const float len2 = new_x2 - new_point, len1 = new_point - new_x1;
		const float interval_len = (len1 < len2 ? len1 : len2);
		if (++schetchick == (int)global_iterations || interval_len < epsilon) {
			Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len);
			*out_len = Extr.size(); *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len));
			memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return;
		}
		Interval1D* curr = new Interval1D(new_x1, new_point, pro->y1, new_value, 1.0f);
		Interval1D* curr1 = new Interval1D(new_point, new_x2, new_value, pro->y2, 1.0f);
		const float currM = curr->M > curr1->M ? curr->M : curr1->M;
		const size_t r_size = R.size();
		if (mode) {
			if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
			if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
				if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
				const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				const float alpha = progress * progress; const float betta = 2.0f - alpha;
				const float MULT = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULT, r, -MULT);
				const float GF = betta * global_coeff;
				curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
				RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GF, alpha);
				std::make_heap(R.begin(), R.end(), ComparePtr1D);
			}
			else {
				if (currM > Mmax) {
					if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
					else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D); }
				}
				else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
			}
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
				else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D); }
			}
			else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
		}
		R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr1D);
		R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr1D);
		const Interval1D* top = R.front();
		x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
	}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(const float N, const float global_iterations, const float a, const float b, const float c,
	const float d, const float r, const bool mode, const float epsilon, const float seed,
	float** const __restrict out_data, size_t* const __restrict out_len) noexcept
{
	Slab* const __restrict slab = tls.local(); slab->current = slab->base;
	int schetchick = 0, no_improve = 0; const int rank = g_world->rank(); const int world_size = g_world->size();
	while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
	const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
	const float x_addition = (b - a) * inv_divider, y_addition = (d - c) * inv_divider;
	const float true_start = a + x_addition, true_end = b - x_addition;
	float x_Rmax_1 = true_start, x_Rmax_2 = true_end;
	const float initial_length = x_Rmax_2 - x_Rmax_1; float dmax = initial_length;
	const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
	const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
	float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
	float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
	std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
	std::vector<Interval1D* __restrict, boost::alignment::aligned_allocator<Interval1D* __restrict, 64u>> R;
	Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
	R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
	R.emplace_back(new Interval1D(true_start, true_end, start_val, best_f, 2.0f));
	const Interval1D* __restrict top_ptr = R.front(); // ИНИЦИАЛИЗАЦИЯ!

	float Mmax = R.front()->M, m = r * Mmax;
	while (true) {
		const float interval_len = x_Rmax_2 - x_Rmax_1;
		const bool stagnation = no_improve > 100 && schetchick > 270;
		const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
		while (g_world->iprobe(boost::mpi::any_source, 0)) {
			CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
			if (in.kind) {
				if (!rank) {
					Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size();
					*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
					memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
				}
				return;
			}
			const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
			const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
			Interval1D* const __restrict injected = new Interval1D(sx, ex,
				RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
			injected->ChangeCharacteristic(m);
			if (injected->R > 1.15f * top_ptr->R) {
				const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
				injected->R = in.xchg.Rtop * k;
				R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr1D);
			}
		}
		const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
		const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
		if (!(++schetchick % T) || stagnation || want_term) {
			CtrlMsg out; out.kind = want_term;
			if (!out.kind) {
				float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
				out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
			}
			for (int i = 0; i < world_size; ++i) if (i != rank) g_world->isend(i, 0, out);
			if (out.kind) {
				if (!rank) {
					Extr.emplace_back((float)schetchick); Extr.emplace_back(interval_len); *out_len = Extr.size();
					*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
					memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
				}
				return;
			}
		}
		const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
		float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
		const float new_value = RastriginFunc(new_x1_val, new_x2_val);
		if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; }
		else { ++no_improve; }
		std::pop_heap(R.begin(), R.end(), ComparePtr1D);
		Interval1D* const __restrict promej = R.back();
		const float segment_x1 = promej->x1, segment_x2 = promej->x2;
		const float len2 = segment_x2 - new_point, len1 = new_point - segment_x1;
		Interval1D* const __restrict curr = new Interval1D(segment_x1, new_point, promej->y1, new_value, 2.0f);
		Interval1D* const __restrict curr1 = new Interval1D(new_point, segment_x2, new_value, promej->y2, 2.0f);
		const float currM = (std::max)(curr->M, curr1->M); const size_t r_size = R.size();
		if (mode) {
			if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
			if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
				if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
				const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				const float alpha = progress * progress; const float betta = 2.0f - alpha;
				const float MULTIPLIER = (1.0f / dmax) * Mmax;
				const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
				const float GLOBAL_FACTOR = betta * global_coeff;
				curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
				RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GLOBAL_FACTOR, alpha);
				std::make_heap(R.begin(), R.end(), ComparePtr1D);
			}
			else {
				if (currM > Mmax) {
					if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
					else {
						Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);
						RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D);
					}
				}
				else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
			}
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
				else {
					Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m);
					RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr1D);
				}
			}
			else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
		}
		R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr1D);
		R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr1D);
		top_ptr = R.front();
		x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
	}
}

// Управляющие сообщения

struct RunParams {
	int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed;
	template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed; }
};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY,
	int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
	RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY;
	p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
	const int rank = g_world->rank(); const int world = g_world->size();
	for (int i = 1; i < world; ++i) if (i != rank) g_world->isend(i, 1, p);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
	while (true) {
		if (g_world->iprobe(0, 1)) {
			RunParams p; g_world->recv(0, 1, p);
			if (p.nSegments == -1) break; // приказ завершения
			float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
			AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
			if (q) CoTaskMemFree(q);
		}
		Sleep(0);
	}
	// Корректная финализация для воркера
	delete g_world; g_world = nullptr;
	delete g_env;   g_env = nullptr;
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpShutdownWorkers() {
	const int rank = g_world->rank(); const int world = g_world->size();
	RunParams p{}; p.nSegments = -1; // «яд»
	for (int i = 1; i < world; ++i) if (i != rank) g_world->isend(i, 1, p);
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpFinalizeAll() {
	if (g_world) { delete g_world; g_world = nullptr; }
	if (g_env)   { delete g_env;   g_env   = nullptr; }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }

dllmain.cpp

cpp
#include "pch.h"

BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID) {
	if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
		DisableThreadLibraryCalls(hModule);
	}
	return TRUE;
}

Что именно исправлено и почему это решает проблему зависаний/крахов

Сброс slab в AGP_Manip2D: раньше ND-интервалы (IntervalND) выделялись из 16 МБ локального «слэба», но он никогда не сбрасывался в этой функции. После нескольких запусков «Оптимизировать» указатель выходил за пределы, происходила порча памяти с эффектом «через несколько нажатий». Теперь это исключено — слэб очищается на входе.
Безопасная генерация сидов: убраны оба источника OOB — фиксированный буфер seeds[64*32] и статический permutations[32][16]. Теперь используется ограничение по вместимости и арифметические перестановки, работающие без больших вспомогательных массивов.
Лимит размерности: весь стек алгоритма (bestQ в сообщениях, локальные массивы phi/s_arr/c_arr/q_local/...) рассчитан на максимум 32 float. Я ввёл согласованный лимит MAX_SEGMENTS=16 в UI и мягкую проверку dim>32 в DLL с корректной MPI-синхронизацией (воркеры всё равно отчитываются, ранк0 отдаёт пустой результат).
Инициализация top_ptr в 2D: предотвращает чтение мусора до первого R.front().
Сид-генератор теперь чисто целочисленный, без fmaf над «нечисловыми» константами.
Завершение MPI: добавлены AgpShutdownWorkers/AgpFinalizeAll, вызываются из UI после закрытия окна — нет «job aborted / didn't finalize».

Если нужно — могу собрать «минимальный чек-лист» для теста (командная строка, сценарии с 1/6 процессами), но сам код выше уже включает все исправления, которые действительно вели к повреждению памяти и AccessViolation.

C++/CLR MyForm.h #pragma once #define WIN32_LEAN_A...

Pergunta

Pensamento

Clarifying process steps

Analyzing process and error

Investigating potential memory issues

AccessViolation analysis

Memory and buffer overflow analysis

Memory management and synchronization concerns

Memory and concurrency optimization

Bug fixing and memory cleanup

Optimizing memory and code safety

Resolving memory management and MPI handling

Implementing shutdown behavior

Resposta

MyForm.h

MyForm.cpp

agp_algo.cpp

dllmain.cpp

Что именно исправлено и почему это решает проблему зависаний/крахов

Compartilhar esta Q&A