я пробую вызывать пайтоновский скрипт из управляем...

Question

я пробую вызывать пайтоновский скрипт из управляемого приложения c++/clr winforms (не C#), код MyForm.h: "#pragma once

#include <pybind11/embed.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>

#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

using namespace System;
using namespace System::Drawing;
using namespace System::Windows::Forms;
using namespace System::Collections::Generic;
using namespace System::Drawing::Drawing2D;
using namespace System::Diagnostics;
using namespace System::Globalization;
using namespace System::Runtime::CompilerServices;

typedef void(__cdecl* P_RUN_PYOPTIMIZATION)(
const char*, int, bool, float, float, float,
int, int, float, float, bool,
float*, float*, float*, float*, float*,
int*, float*, float*);

typedef void(__cdecl* P_MANIP)(int, bool, float, float, float, int, int, float, bool, float, unsigned int, float**, size_t*, float*, float*, float*, size_t*, float*);
typedef void(__cdecl* P_FREE)(float*);
typedef void(__cdecl* P_START)(int, bool, float, float, float, int, int, float, bool, float, unsigned int);

namespace TESTAGP {

text
public enum class OptimizerBackend {
	AgpCpp = 0,
	Optuna = 1,
	IOpt = 2
};

public ref class MyForm sealed : public Form {
public:
	MyForm(HMODULE hLib)
		: hLib(hLib) {
		this->SetStyle(ControlStyles::AllPaintingInWmPaint |
			ControlStyles::UserPaint |
			ControlStyles::OptimizedDoubleBuffer, true);
		this->Text = L"AGP Manipulator 2D";
		this->ClientSize = System::Drawing::Size(1200, 800);
		this->Resize += gcnew EventHandler(this, &MyForm::OnResize);

		fManip = reinterpret_cast<P_MANIP>(GetProcAddress(hLib, "AGP_Manip2D"));
		pFree = reinterpret_cast<P_FREE>(GetProcAddress(hLib, "AGP_Free"));
		pStart = reinterpret_cast<P_START>(GetProcAddress(hLib, "AgpStartManipND"));

		pRunPythonOptimization = reinterpret_cast<P_RUN_PYOPTIMIZATION>(
			GetProcAddress(hLib, "RunPythonOptimization"));

		angles = gcnew List<float>(8);
		lengths = gcnew List<float>(8);
		backend = OptimizerBackend::AgpCpp;

		InitGraphicsResources();
		InitUI();
		ResetRandomConfig();
	}

private:
	initonly HMODULE hLib;
	initonly P_MANIP fManip;
	initonly P_FREE pFree;
	initonly P_START pStart;

	P_RUN_PYOPTIMIZATION pRunPythonOptimization;

	OptimizerBackend backend;
	ComboBox^ cbBackend;
	int nSegments;
	bool variableLengths;
	List<float>^ angles;
	List<float>^ lengths;
	CheckBox^ cbVarLen;
	NumericUpDown^ nudMinTheta;
	NumericUpDown^ nudBaseLength;
	NumericUpDown^ nudStretchFactor;
	NumericUpDown^ nudTargetX;
	NumericUpDown^ nudTargetY;
	NumericUpDown^ nudLevels;
	NumericUpDown^ nudMaxIter;
	CheckBox^ cbAdaptive;
	NumericUpDown^ nudR;
	NumericUpDown^ nudEps;
	Button^ btnAdd;
	Button^ btnRem;
	Button^ btnOptimize;
	Label^ lblInfo;
	UInt32 rngState = 0xA5C39E0Du;

	System::Drawing::Font^ uiFontBold11;
	System::Drawing::Font^ uiFontBold10;

	Pen^ wallPen;
	Pen^ dashedPen;
	Pen^ targetPen;
	Pen^ penRod;
	SolidBrush^ jointBrush;
	HatchBrush^ wallHatchBrush;

	void InitGraphicsResources() {
		uiFontBold11 = gcnew System::Drawing::Font("Yu Gothic UI", 11, FontStyle::Bold);
		uiFontBold10 = gcnew System::Drawing::Font("Yu Gothic UI", 10, FontStyle::Bold);

		wallPen = gcnew Pen(Color::Black, 2.0f);

		dashedPen = gcnew Pen(Color::Black, 2.0f);
		dashedPen->DashStyle = DashStyle::Dash;

		targetPen = gcnew Pen(Color::Green, 3.0f);
		targetPen->DashStyle = DashStyle::Dot;

		penRod = gcnew Pen(Color::Red, 6.0f);
		jointBrush = gcnew SolidBrush(Color::Blue);
		wallHatchBrush = gcnew HatchBrush(HatchStyle::BackwardDiagonal, Color::LightGray, Color::White);
	}

	void InitUI() {
		cbBackend = gcnew ComboBox();
		cbBackend->Location = Point(920, 52);
		cbBackend->Width = 260;
		cbBackend->Height = 28;
		cbBackend->DropDownStyle = ComboBoxStyle::DropDownList;
		cbBackend->Font = uiFontBold11;
		cbBackend->BackColor = SystemColors::Info;
		cbBackend->FlatStyle = FlatStyle::Flat;
		cbBackend->Items->Add(L"AGP");
		cbBackend->Items->Add(L"Optuna");
		cbBackend->Items->Add(L"iOpt");
		cbBackend->SelectedIndex = 0;
		cbBackend->SelectedIndexChanged += gcnew EventHandler(this, &MyForm::OnBackendChanged);
		this->Controls->Add(cbBackend);

		Label^ L;

		L = gcnew Label();
		L->Text = L"Мин. угол (рад)";
		L->Location = Point(20, 20);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudMinTheta = gcnew NumericUpDown();
		nudMinTheta->Location = Point(20, 52);
		nudMinTheta->Width = 200;
		nudMinTheta->DecimalPlaces = 3;
		nudMinTheta->Minimum = Decimal(1) / Decimal(100);
		nudMinTheta->Maximum = Decimal(314159) / Decimal(100000);
		nudMinTheta->Value = Decimal(150) / Decimal(100);
		nudMinTheta->Font = uiFontBold10;
		nudMinTheta->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudMinTheta);

		L = gcnew Label();
		L->Text = L"Базовая длина";
		L->Location = Point(245, 20);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudBaseLength = gcnew NumericUpDown();
		nudBaseLength->Location = Point(245, 52);
		nudBaseLength->Width = 200;
		nudBaseLength->DecimalPlaces = 2;
		nudBaseLength->Minimum = Decimal(1) / Decimal(2);
		nudBaseLength->Maximum = Decimal(200) / Decimal(100);
		nudBaseLength->Value = Decimal(100) / Decimal(100);
		nudBaseLength->Font = uiFontBold10;
		nudBaseLength->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudBaseLength);

		L = gcnew Label();
		L->Text = L"Коэф. растяжения";
		L->Location = Point(470, 20);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudStretchFactor = gcnew NumericUpDown();
		nudStretchFactor->Location = Point(470, 52);
		nudStretchFactor->Width = 200;
		nudStretchFactor->DecimalPlaces = 2;
		nudStretchFactor->Minimum = Decimal(100) / Decimal(100);
		nudStretchFactor->Maximum = Decimal(150) / Decimal(100);
		nudStretchFactor->Increment = Decimal(1) / Decimal(100);
		nudStretchFactor->Value = Decimal(150) / Decimal(100);
		nudStretchFactor->Font = uiFontBold10;
		nudStretchFactor->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudStretchFactor);

		cbVarLen = gcnew CheckBox();
		cbVarLen->Text = L"Переменные длины";
		cbVarLen->Location = Point(695, 52);
		cbVarLen->Width = 200;
		cbVarLen->Checked = false;
		cbVarLen->Font = uiFontBold11;
		cbVarLen->CheckedChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(cbVarLen);

		L = gcnew Label();
		L->Text = L"Цель X";
		L->Location = Point(20, 107);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudTargetX = gcnew NumericUpDown();
		nudTargetX->Location = Point(20, 139);
		nudTargetX->Width = 200;
		nudTargetX->DecimalPlaces = 2;
		nudTargetX->Minimum = Decimal(-100) / Decimal(10);
		nudTargetX->Maximum = Decimal(100) / Decimal(10);
		nudTargetX->Value = Decimal(25) / Decimal(10);
		nudTargetX->Font = uiFontBold10;
		nudTargetX->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudTargetX);

		L = gcnew Label();
		L->Text = L"Цель Y";
		L->Location = Point(245, 107);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudTargetY = gcnew NumericUpDown();
		nudTargetY->Location = Point(245, 139);
		nudTargetY->Width = 200;
		nudTargetY->DecimalPlaces = 2;
		nudTargetY->Minimum = Decimal(-100) / Decimal(10);
		nudTargetY->Maximum = Decimal(100) / Decimal(10);
		nudTargetY->Value = Decimal(-10) / Decimal(10);
		nudTargetY->Font = uiFontBold10;
		nudTargetY->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudTargetY);

		L = gcnew Label();
		L->Text = L"Глубина развёрток";
		L->Location = Point(470, 107);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudLevels = gcnew NumericUpDown();
		nudLevels->Location = Point(470, 139);
		nudLevels->Width = 200;
		nudLevels->Minimum = 7;
		nudLevels->Maximum = 20;
		nudLevels->Value = 12;
		nudLevels->Font = uiFontBold10;
		nudLevels->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudLevels);

		L = gcnew Label();
		L->Text = L"Надежность (r)";
		L->Location = Point(695, 107);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudR = gcnew NumericUpDown();
		nudR->Location = Point(695, 139);
		nudR->Width = 200;
		nudR->DecimalPlaces = 2;
		nudR->Minimum = Decimal(100) / Decimal(100);
		nudR->Maximum = Decimal(2000) / Decimal(100);
		nudR->Value = Decimal(250) / Decimal(100);
		nudR->Font = uiFontBold10;
		nudR->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudR);

		cbAdaptive = gcnew CheckBox();
		cbAdaptive->Text = L"Адаптивная схема";
		cbAdaptive->Location = Point(920, 139);
		cbAdaptive->Width = 200;
		cbAdaptive->Checked = true;
		cbAdaptive->Font = uiFontBold11;
		cbAdaptive->CheckedChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(cbAdaptive);

		L = gcnew Label();
		L->Text = L"Точность";
		L->Location = Point(20, 194);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudEps = gcnew NumericUpDown();
		nudEps->Location = Point(20, 226);
		nudEps->Width = 200;
		nudEps->DecimalPlaces = 9;
		nudEps->Minimum = Decimal(1) / Decimal(1000000000);
		nudEps->Maximum = Decimal(1) / Decimal(10);
		nudEps->Value = Decimal(1) / Decimal(100000);
		nudEps->Font = uiFontBold10;
		nudEps->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudEps);

		L = gcnew Label();
		L->Text = L"Макс. итераций";
		L->Location = Point(245, 194);
		L->Width = 200;
		L->Font = uiFontBold11;
		this->Controls->Add(L);

		nudMaxIter = gcnew NumericUpDown();
		nudMaxIter->Location = Point(245, 226);
		nudMaxIter->Width = 200;
		nudMaxIter->Minimum = 10;
		nudMaxIter->Maximum = 500000;
		nudMaxIter->Value = 1000;
		nudMaxIter->Font = uiFontBold10;
		nudMaxIter->Increment = 100;
		nudMaxIter->ValueChanged += gcnew EventHandler(this, &MyForm::OnAnyChanged);
		this->Controls->Add(nudMaxIter);

		btnAdd = gcnew Button();
		btnAdd->Text = L"+ Звено";
		btnAdd->Location = Point(465, 226);
		btnAdd->Width = 90;
		btnAdd->Height = 35;
		btnAdd->BackColor = SystemColors::Info;
		btnAdd->Cursor = Cursors::Hand;
		btnAdd->FlatAppearance->BorderColor = Color::FromArgb(64, 64, 64);
		btnAdd->FlatAppearance->BorderSize = 3;
		btnAdd->FlatAppearance->MouseDownBackColor = Color::FromArgb(128, 128, 255);
		btnAdd->FlatAppearance->MouseOverBackColor = Color::FromArgb(192, 192, 255);
		btnAdd->FlatStyle = FlatStyle::Flat;
		btnAdd->Font = uiFontBold11;
		btnAdd->ForeColor = SystemColors::ControlDarkDark;
		btnAdd->Click += gcnew EventHandler(this, &MyForm::OnAddClick);
		this->Controls->Add(btnAdd);

		btnRem = gcnew Button();
		btnRem->Text = L"- Звено";
		btnRem->Location = Point(560, 226);
		btnRem->Width = 90;
		btnRem->Height = 35;
		btnRem->BackColor = SystemColors::Info;
		btnRem->Cursor = Cursors::Hand;
		btnRem->FlatAppearance->BorderColor = Color::FromArgb(64, 64, 64);
		btnRem->FlatAppearance->BorderSize = 3;
		btnRem->FlatAppearance->MouseDownBackColor = Color::FromArgb(128, 128, 255);
		btnRem->FlatAppearance->MouseOverBackColor = Color::FromArgb(192, 192, 255);
		btnRem->FlatStyle = FlatStyle::Flat;
		btnRem->Font = uiFontBold11;
		btnRem->ForeColor = SystemColors::ControlDarkDark;
		btnRem->Click += gcnew EventHandler(this, &MyForm::OnRemClick);
		this->Controls->Add(btnRem);

		btnOptimize = gcnew Button();
		btnOptimize->Text = L"Оптимизировать";
		btnOptimize->Location = Point(680, 226);
		btnOptimize->Width = 150;
		btnOptimize->Height = 35;
		btnOptimize->BackColor = SystemColors::Info;
		btnOptimize->Cursor = Cursors::Hand;
		btnOptimize->FlatAppearance->BorderColor = Color::FromArgb(64, 64, 64);
		btnOptimize->FlatAppearance->BorderSize = 3;
		btnOptimize->FlatAppearance->MouseDownBackColor = Color::FromArgb(128, 128, 255);
		btnOptimize->FlatAppearance->MouseOverBackColor = Color::FromArgb(192, 192, 255);
		btnOptimize->FlatStyle = FlatStyle::Flat;
		btnOptimize->Font = uiFontBold11;
		btnOptimize->ForeColor = SystemColors::ControlDarkDark;
		btnOptimize->Click += gcnew EventHandler(this, &MyForm::OnOptimizeClick);
		this->Controls->Add(btnOptimize);

		lblInfo = gcnew Label();
		lblInfo->Location = Point(835, 194);
		lblInfo->Size = System::Drawing::Size(335, 140);
		lblInfo->BorderStyle = BorderStyle::FixedSingle;
		lblInfo->Text = L"Готов";
		lblInfo->Font = uiFontBold10;
		this->Controls->Add(lblInfo);
	}

	void ResetRandomConfig() {
		nSegments = 1;
		angles->Clear();
		lengths->Clear();
		angles->Add(1.57079637f);
		lengths->Add(static_cast<float>(nudBaseLength->Value));
		variableLengths = false;
		this->Invalidate();
	}

	[MethodImpl(MethodImplOptions::AggressiveInlining)]
	float Rand01() {
		rngState ^= rngState << 13;
		rngState ^= rngState >> 17;
		rngState ^= rngState << 5;
		const float inv = 1.0f / 4294967296.0f;
		return static_cast<float>(static_cast<unsigned int>(rngState)) * inv;
	}

	[MethodImpl(MethodImplOptions::AggressiveInlining)]
	float RandAngle() {
		return Rand01() * 6.28318548f - 3.14159274f;
	}

	void RunAgpCpp() {
		variableLengths = cbVarLen->Checked;
		const float minTheta = static_cast<float>(nudMinTheta->Value);
		const float tx = static_cast<float>(nudTargetX->Value);
		const float ty = static_cast<float>(nudTargetY->Value);
		const int levels = static_cast<int>(nudLevels->Value);
		const int maxIter = static_cast<int>(nudMaxIter->Value);
		const bool adaptive = cbAdaptive->Checked;
		const float r_param = static_cast<float>(nudR->Value);
		const float eps = static_cast<float>(nudEps->Value);
		const unsigned int seed = static_cast<unsigned int>(GetTickCount());

		pStart(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed);

		LARGE_INTEGER t0;
		LARGE_INTEGER t1;
		LARGE_INTEGER fq;
		QueryPerformanceCounter(&t0);

		float* bestQ = nullptr;
		size_t bestQLen = 0;
		float bestX = 0.0f;
		float bestY = 0.0f;
		float bestF = 0.0f;
		size_t actualIterations = 0u;
		float achievedEps = 0.0f;

		fManip(nSegments, variableLengths, minTheta, tx, ty, levels, maxIter, r_param, adaptive, eps, seed,
			&bestQ, &bestQLen, &bestX, &bestY, &bestF, &actualIterations, &achievedEps);

		QueryPerformanceCounter(&t1);
		QueryPerformanceFrequency(&fq);

		const double micros = 1e6 * static_cast<double>(t1.QuadPart - t0.QuadPart) /
			static_cast<double>(fq.QuadPart);

		angles->Clear();
		for (int i = 0; i < nSegments; ++i) {
			angles->Add(bestQ[i]);
		}

		lengths->Clear();
		if (variableLengths) {
			for (int i = 0; i < nSegments; ++i) {
				lengths->Add(bestQ[nSegments + i]);
			}
		}
		else {
			const float baseLen = static_cast<float>(nudBaseLength->Value);
			for (int i = 0; i < nSegments; ++i) {
				lengths->Add(baseLen);
			}
		}

		pFree(bestQ);

		const float dx = bestX - tx;
		const float dy = bestY - ty;

		lblInfo->Text = String::Format(
			L"Результат:\n"
			L"Близость захвата: {0:F5}\n"
			L"Функционал: {1:F5}\n"
			L"Точка: ({2:F3}, {3:F3})\n"
			L"Время: {4:F0} мкс\n"
			L"Число шагов: {5}\n"
			L"Достигнутая точность: {6:E3}",
			Math::Sqrt(dx * dx + dy * dy),
			bestF,
			bestX,
			bestY,
			micros,
			actualIterations,
			achievedEps);

		this->Invalidate();
	}

	[MethodImpl(MethodImplOptions::AggressiveInlining)]
	void RunPythonBackend(String^ which) {
		const int nSeg = nSegments;
		const bool varLen = cbVarLen->Checked;
		const float minTheta = static_cast<float>(nudMinTheta->Value);
		const float tx = static_cast<float>(nudTargetX->Value);
		const float ty = static_cast<float>(nudTargetY->Value);
		const int levels = static_cast<int>(nudLevels->Value);
		const int maxIter = static_cast<int>(nudMaxIter->Value);
		const float r_param = static_cast<float>(nudR->Value);
		const float eps = static_cast<float>(nudEps->Value);
		const bool adaptive = cbAdaptive->Checked;

		// Подготовка массивов для результатов
		array<float>^ tempAngles = gcnew array<float>(nSeg);
		array<float>^ tempLengths = gcnew array<float>(nSeg);
		pin_ptr<float> pinnedAngles = &tempAngles[0];
		pin_ptr<float> pinnedLengths = &tempLengths[0];

		float bestF = 0.0f, bestX = 0.0f, bestY = 0.0f;
		float achievedEps = 0.0f, micros = 0.0f;
		int iterations = 0;

		// Конвертируем String^ в const char*
		std::string backendStr;
		if (which == "optuna") {
			backendStr = "optuna";
		}
		else {
			backendStr = "iopt";
		}

		// Прямой вызов Python через Pybind11
		pRunPythonOptimization(
			backendStr.c_str(),
			nSeg, varLen, minTheta, tx, ty,
			levels, maxIter, r_param, eps, adaptive,
			&bestF, &bestX, &bestY,
			pinnedAngles, pinnedLengths,
			&iterations, &achievedEps, &micros
		);

		// Обновляем UI
		angles->Clear();
		lengths->Clear();

		for (int i = 0; i < nSeg; ++i) {
			angles->Add(tempAngles[i]);
			lengths->Add(varLen ? tempLengths[i] : static_cast<float>(nudBaseLength->Value));
		}

		const float dx = bestX - tx;
		const float dy = bestY - ty;
		const double distance = Math::Sqrt(dx * dx + dy * dy);

		lblInfo->Text = String::Format(
			L"Результат (pybind11):\n"
			L"Близость захвата: {0:F5}\n"
			L"Функционал: {1:F5}\n"
			L"Точка: ({2:F3}, {3:F3})\n"
			L"Время: {4:F0} мкс\n"
			L"Число шагов: {5}\n"
			L"Достигнутая точность: {6:E3}",
			distance,
			bestF,
			bestX,
			bestY,
			micros,
			iterations,
			achievedEps);

		this->Invalidate();
		this->Refresh();
	}

	System::Void OnBackendChanged(System::Object^ sender, System::EventArgs^ e) {
		switch (cbBackend->SelectedIndex) {
		case 0:
			backend = OptimizerBackend::AgpCpp;
			cbAdaptive->Enabled = true;
			break;
		case 1:
			backend = OptimizerBackend::Optuna;
			cbAdaptive->Enabled = false;
			break;
		case 2:
			backend = OptimizerBackend::IOpt;
			cbAdaptive->Enabled = true;
			break;
		default:
			backend = OptimizerBackend::AgpCpp;
			cbAdaptive->Enabled = true;
			break;
		}
	}

	System::Void OnResize(System::Object^ sender, System::EventArgs^ e) {
		this->Invalidate();
	}

	System::Void OnAnyChanged(System::Object^ sender, System::EventArgs^ e) {
		this->Invalidate();
	}

	System::Void OnAddClick(System::Object^ sender, System::EventArgs^ e) {
		++nSegments;
		angles->Add(RandAngle());
		lengths->Add(static_cast<float>(nudBaseLength->Value));
		this->Invalidate();
	}

	System::Void OnRemClick(System::Object^ sender, System::EventArgs^ e) {
		if (nSegments > 1) {
			--nSegments;
			angles->RemoveAt(angles->Count - 1);
			lengths->RemoveAt(lengths->Count - 1);
			this->Invalidate();
		}
	}

	System::Void OnOptimizeClick(System::Object^ sender, System::EventArgs^ e) {
		switch (backend) {
		case OptimizerBackend::AgpCpp:
			RunAgpCpp();
			break;
		case OptimizerBackend::Optuna:
			RunPythonBackend(L"optuna");
			break;
		case OptimizerBackend::IOpt:
			RunPythonBackend(L"iopt");
			break;
		}
	}

protected:
	virtual void OnPaint(PaintEventArgs^ e) override {
		Form::OnPaint(e);
		Graphics^ g = e->Graphics;
		g->SmoothingMode = SmoothingMode::HighQuality;
		g->Clear(this->BackColor);

		System::Drawing::Rectangle drawArea(0, 180, this->ClientSize.Width, this->ClientSize.Height - 180);
		g->FillRectangle(Brushes::White, drawArea);

		const int leftWallX = drawArea.Left + this->ClientSize.Width * 25 / 100;

		g->DrawLine(wallPen, leftWallX, drawArea.Top, leftWallX, this->ClientSize.Height);
		g->FillRectangle(wallHatchBrush, leftWallX - 100, drawArea.Top, 100, this->ClientSize.Height - drawArea.Top);

		const float targetX = static_cast<float>(nudTargetX->Value);
		const float targetY = static_cast<float>(nudTargetY->Value);

		const int baseX = leftWallX;
		const int baseY = drawArea.Top + drawArea.Height / 2;

		const float pixelTargetX = static_cast<float>(baseX) + targetX * 160.0f;
		const float pixelTargetY = static_cast<float>(baseY) - targetY * 160.0f;

		int rightWallX = static_cast<int>(pixelTargetX + 8.0f);
		if (rightWallX > drawArea.Right - 10) {
			rightWallX = drawArea.Right - 10;
		}

		g->DrawLine(dashedPen, rightWallX, drawArea.Top, rightWallX, this->ClientSize.Height);
		g->FillRectangle(wallHatchBrush, rightWallX, drawArea.Top, 100, this->ClientSize.Height - drawArea.Top);

		g->DrawEllipse(targetPen, pixelTargetX - 8.0f, pixelTargetY - 8.0f, 16.0f, 16.0f);

		cli::array<PointF>^ pts = gcnew cli::array<PointF>(nSegments + 1);
		pts[0] = PointF(static_cast<float>(baseX), static_cast<float>(baseY));

		float x = 0.0f;
		float y = 0.0f;
		float phi = 0.0f;

		array<float>^ localAngles = angles->ToArray();
		array<float>^ localLengths = lengths->ToArray();

		for (int i = 0; i < nSegments; ++i) {
			const float theta = localAngles[i];
			const float L = localLengths[i];
			phi += theta;
			x += L * static_cast<float>(Math::Cos(static_cast<double>(phi)));
			y += L * static_cast<float>(Math::Sin(static_cast<double>(phi)));
			pts[i + 1] = PointF(static_cast<float>(baseX) + x * 160.0f,
				static_cast<float>(baseY) - y * 160.0f);
		}

		for (int i = 0; i < nSegments; ++i) {
			g->DrawLine(penRod, pts[i], pts[i + 1]);
		}

		for (int i = 0; i <= nSegments; ++i) {
			g->FillEllipse(jointBrush, pts[i].X - 8.0f, pts[i].Y - 8.0f, 16.0f, 16.0f);
		}
	}
};

}" код MyForm.cpp: "#include "MyForm.h"

using namespace System;
using namespace System::Windows::Forms;

typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PStartWorkers)();

[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");

text
const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);

if (!rank) {
	Application::EnableVisualStyles();
	Application::SetCompatibleTextRenderingDefault(false);
	Application::Run(gcnew TESTAGP::MyForm(h));
}
else {
	AgpWaitStartAndRun();
}

return 0;

}", код .cpp файла dll библиотеки: "#include "pch.h"

#define XOR_RAND(state, result_var)
do {
unsigned s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
(state) = s;
float tmp = (float)((double)(s) * (1.0/4294967296.0));
result_var = tmp;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
unsigned s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
(state) = s;
result_var = fmaf((float)(int)s, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f)
r = 6.28318530718f - r;
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl)
r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl)
r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(16)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(16)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(16)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(16)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(16)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(16)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(16)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(16)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(16)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(16)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(16)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(16)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(16)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(16)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip)
r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip)
r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : unsigned {
Top = 0b00u,
Down = 0b01u,
Left = 0b10u,
Right = 0b11u
};

__declspec(align(16)) struct Step final {
const unsigned next, dx, dy;
Step(const unsigned n, const unsigned x, const unsigned y) noexcept : next(n), dx(x), dy(y) {}
};

__declspec(align(16)) struct InvStep final {
const unsigned q, next;
InvStep(const unsigned q_val, const unsigned n) noexcept : q(q_val), next(n) {}
};

__declspec(align(16)) static const Step g_step_tbl[4][4] = {
{ Step(Right,0u,0u), Step(Top,0u,1u), Step(Top,1u,1u), Step(Left,1u,0u) },
{ Step(Left,1u,1u), Step(Down,1u,0u), Step(Down,0u,0u), Step(Right,0u,1u) },
{ Step(Down,1u,1u), Step(Left,0u,1u), Step(Left,0u,0u), Step(Top,1u,0u) },
{ Step(Top,0u,0u), Step(Right,1u,0u), Step(Right,1u,1u), Step(Down,0u,1u) }
};

__declspec(align(16)) static const InvStep g_inv_tbl[4][4] = {
{ InvStep(0u,Right), InvStep(1u,Top), InvStep(3u,Left), InvStep(2u,Top) },
{ InvStep(2u,Down), InvStep(3u,Right), InvStep(1u,Down), InvStep(0u,Left) },
{ InvStep(2u,Left), InvStep(1u,Left), InvStep(3u,Top), InvStep(0u,Down) },
{ InvStep(0u,Top), InvStep(3u,Down), InvStep(1u,Right), InvStep(2u,Right) }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;
static const pybind11::scoped_interpreter* guard;
static pybind11::module_ sys;
static pybind11::object path;
static pybind11::module_ optimizer_bridge;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2, e_x1, e_x2, Rtop;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
ar& s_x1& s_x2& e_x1& e_x2& Rtop;
}
};

__declspec(align(16)) struct MultiCrossMsg final {
float intervals[15];
unsigned count;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
ar& intervals& count;
}
};

__declspec(align(16)) struct BestSolutionMsg final {
float bestF, bestX, bestY, bestQ[32];
unsigned dim;
template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
ar& bestF& bestX& bestY& bestQ& dim;
}
};

__declspec(align(16)) struct Slab final {
char* const base;
char* current;
char* const end;
__forceinline Slab(void* const memory, const size_t usable) noexcept :
base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {
}
};

static thread_local tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 16u);
Slab* slab = (Slab*)_aligned_malloc(sizeof(Slab), 16u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
#pragma loop ivdep
while (p < slab->end) {
*p = 0;
p += 4096u;
}
return slab;
});

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const unsigned scale;
const unsigned start;

text
__forceinline Peano2DMap(const int L, const float _a, const float _b, const float _c, const float _d, const unsigned st) noexcept
	: levels(L), a(_a), b(_b), c(_c), d(_d),
	lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)),
	scale((unsigned)1u << (L << 1)), start(st) {
}

};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__declspec(align(16)) struct Interval1D final {
const float x1, x2, y1, y2, delta_y, ordinate_factor, N_factor, quadratic_term, M;
float R;

text
static __declspec(noalias) __forceinline void* operator new(const size_t) noexcept {
	Slab* s = tls.local();
	char* r = s->current;
	s->current += 64u;
	return r;
}

__declspec(noalias) __forceinline Interval1D(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept
	: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
	N_factor(_N == 1.0f ? _x2 - _x1 : sqrtf(_x2 - _x1)),
	quadratic_term(fmaf((1.0f / N_factor)* delta_y, delta_y, 0.0f)),
	M(fabsf(delta_y) / N_factor) {
}

__declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept {
	const float inv_m = 1.0f / _m;
	R = fmaf(inv_m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
}

};

__declspec(align(16)) struct IntervalND final {
const float x1, x2, y1, y2, delta_y, ordinate_factor;
float N_factor, quadratic_term, M, R;
unsigned long long i1, i2;
float diam;
int span_level;

text
static __declspec(noalias) __forceinline void* operator new(const size_t) noexcept {
	Slab* s = tls.local();
	char* r = s->current;
	s->current += 64u;
	return r;
}

__declspec(noalias) __forceinline IntervalND(const float _x1, const float _x2, const float _y1, const float _y2) noexcept
	: x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), ordinate_factor(-(y1 + y2) * 2.0f),
	N_factor(0), quadratic_term(0), M(0), R(0), i1(0), i2(0), diam(0), span_level(0) {
}

__declspec(noalias) __forceinline void compute_span_level(const struct MortonND& map) noexcept;
__declspec(noalias) __forceinline void set_metric(const float d_alpha) noexcept {
	N_factor = d_alpha;
	quadratic_term = (1.0f / N_factor) * delta_y * delta_y;
	M = fabsf(delta_y) / N_factor;
}

__declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept {
	const float inv_m = 1.0f / _m;
	R = fmaf(inv_m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
}

};

static __declspec(noalias) __forceinline bool ComparePtr1D(const Interval1D* const a, const Interval1D* const b) noexcept {
return a->R < b->R;
}

static __declspec(noalias) __forceinline bool ComparePtrND(const IntervalND* const a, const IntervalND* const b) noexcept {
return a->R < b->R;
}

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2_1D(Interval1D* const* const arr, const size_t n, const float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0, limit = n & ~7ull;
alignas(16) float q[8], nf[8], od[8], out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const Interval1D* const p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2_1D(Interval1D* const* const arr, const size_t n, const float GF, const float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0, limit = n & ~7ull;
alignas(16) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const Interval1D* const p = arr[i + k];
ln[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
const Interval1D* const p = arr[i];
const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
arr[i]->R = fmaf((1.0f / mi) * p->quadratic_term, 1.0f, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2_ND(IntervalND* const* const arr, const size_t n, const float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
size_t i = 0, limit = n & ~7ull;
alignas(16) float q[8], nf[8], od[8], out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const IntervalND* const p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2_ND(IntervalND* const* const arr, const size_t n, const float GF, const float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GF), va = _mm256_set1_ps(alpha);
size_t i = 0, limit = n & ~7ull;
alignas(16) float ln[8], Mv[8], q[8], nf[8], od[8], out[8];
#pragma loop ivdep
while (i < limit) {
int k = 0;
#pragma loop ivdep
while (k < 8) {
const IntervalND* const p = arr[i + k];
ln[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
od[k] = p->ordinate_factor;
++k;
}
const __m256 vln = _mm256_load_ps(ln), vM = _mm256_load_ps(Mv), vq = _mm256_load_ps(q), vnf = _mm256_load_ps(nf), vod = _mm256_load_ps(od);
const __m256 vm = _mm256_fmadd_ps(vGF, vln, _mm256_mul_ps(va, vM));
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
_mm256_store_ps(out, res);
k = 0;
#pragma loop ivdep
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8;
}
while (i < n) {
const IntervalND* const p = arr[i];
const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
arr[i]->R = fmaf((1.0f / mi) * p->quadratic_term, 1.0f, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

static __declspec(noalias) __forceinline float fast_pow_int(const float v, const int n) noexcept {
float r;
switch (n) {
case 3: {
const float v2 = v * v;
r = v2 * v;
} break;
case 4: {
const float v2 = v * v;
r = v2 * v2;
} break;
case 5: {
const float v2 = v * v;
r = v2 * v2 * v;
} break;
case 6: {
const float v2 = v * v;
const float v4 = v2 * v2;
r = v4 * v2;
} break;
case 7: {
const float v2 = v * v;
const float v4 = v2 * v2;
r = v4 * v2 * v;
} break;
case 8: {
const float v2 = v * v;
const float v4 = v2 * v2;
r = v4 * v4;
} break;
case 9: {
const float v3 = v * v * v;
const float v6 = v3 * v3;
r = v6 * v3;
} break;
case 10: {
const float v2 = v * v;
const float v4 = v2 * v2;
const float v8 = v4 * v4;
r = v8 * v2;
} break;
case 11: {
const float v2 = v * v;
const float v4 = v2 * v2;
const float v8 = v4 * v4;
r = v8 * v2 * v;
} break;
case 12: {
const float v3 = v * v * v;
const float v6 = v3 * v3;
r = v6 * v6;
} break;
case 13: {
const float v3 = v * v * v;
const float v6 = v3 * v3;
r = v6 * v6 * v;
} break;
case 14: {
const float v7 = v * v * v * v * v * v * v;
r = v7 * v7;
} break;
case 15: {
const float v7 = v * v * v * v * v * v * v;
r = v7 * v7 * v;
} break;
default: {
const float v2 = v * v;
const float v4 = v2 * v2;
const float v8 = v4 * v4;
r = v8 * v8;
}
}
return r;
}

static __declspec(noalias) __forceinline float step(const float _m, const float x1, const float x2, const float y1, const float y2, const float _N, const float _r) noexcept {
const float diff = y2 - y1;
const unsigned sign_mask = ((reinterpret_cast<const unsigned>(&diff)) & 0x80000000u) ^ 0x80000000u;
const float sign_mult = reinterpret_cast<const float>(&sign_mask);
if (_N == 1.0f)
return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
if (_N == 2.0f)
return fmaf((1.0f / (_m * _m)) * sign_mult * diff * diff * _r, 1.0f, x1 + x2) * 0.5f;
return fmaf((1.0f / fast_pow_int(_m, _N)) * sign_mult * fast_pow_int(fabsf(diff), _N) * _r, 1.0f, x1 + x2) * 0.5f;
}

__declspec(align(16)) struct MortonCachePerRank final {
std::vector<int> permCache;
std::vector<unsigned long long> invMaskCache;
unsigned baseSeed;
};

static thread_local MortonCachePerRank g_mc;

static __declspec(noalias) __forceinline unsigned long long gray_encode(const unsigned long long x) noexcept {
return x ^ (x >> 1);
}

static __declspec(noalias) __forceinline long long gray_decode(unsigned long long g) noexcept {
g ^= g >> 32;
g ^= g >> 16;
g ^= g >> 8;
g ^= g >> 4;
g ^= g >> 2;
g ^= g >> 1;
return g;
}

__declspec(align(16)) struct MortonND final {
const int dim, levels;
const int eff_levels;
const int extra_levels;
const int chunks;
std::vector<int> chunk_bits;
std::vector<unsigned long long> chunk_bases;
unsigned long long scale;
std::vector<float> low, high, step, invStep, baseOff;
std::vector<int> perm;
std::vector<unsigned long long> invMask;
std::vector<unsigned long long> pextMask;
std::vector<unsigned long long> pextMaskChunks;
const float invScaleLevel;
const bool use_gray;

text
static __declspec(noalias) __forceinline unsigned long long make_mask(const int dim, const int Lc, const int d) noexcept {
	unsigned long long m = 0ull, bitpos = static_cast<unsigned long long>(d);
	int b = 0;

#pragma loop ivdep
while (b < Lc) {
m |= 1ull << bitpos;
bitpos += static_cast<unsigned long long>(dim);
++b;
}
return m;
}

text
__declspec(noalias) __forceinline MortonND(const int D, const int L, const float* const lows, const float* const highs, const MortonCachePerRank& mc)
	: dim(D), levels(L),
	eff_levels((std::max)(1, static_cast<int>(63 / (D ? D : 1)))),
	extra_levels((L > eff_levels) ? (L - eff_levels) : 0),
	chunks((extra_levels > 0) ? (1 + (extra_levels + eff_levels - 1) / eff_levels) : 1),
	low(lows, lows + D), high(highs, highs + D),
	step(D, 0.0f), invStep(D, 0.0f), baseOff(D, 0.0f),
	perm(mc.permCache.begin(), mc.permCache.begin() + D),
	invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
	invScaleLevel(1.0f / static_cast<float>(static_cast<unsigned long long>(1) << L)), use_gray(true) {

	int d = 0;

#pragma loop ivdep
while (d < dim) {
const float rng = high[d] - low[d];
const float st = rng * invScaleLevel;
step[d] = st;
invStep[d] = 1.0f / st;
baseOff[d] = fmaf(0.5f, st, low[d]);
++d;
}

text
	chunk_bits.resize(chunks);
	pextMaskChunks.resize(static_cast<size_t>(chunks) * static_cast<size_t>(dim));
	chunk_bases.resize(chunks);
	int remaining = levels;
	int c = 0;
	while (c < chunks) {
		const int Lc = (c == 0) ? (std::min)(eff_levels, remaining) : (std::min)(eff_levels, remaining);
		chunk_bits[c] = Lc;
		remaining -= Lc;
		const unsigned long long baseC = static_cast<unsigned long long>(1) << (dim * Lc);
		chunk_bases[c] = baseC;
		d = 0;

#pragma loop ivdep
while (d < dim) {
pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)] = make_mask(dim, Lc, d);
++d;
}
++c;
}

text
	pextMask.resize(dim);
	d = 0;

#pragma loop ivdep
while (d < dim) {
pextMask[d] = make_mask(dim, chunk_bits[0], d);
++d;
}

text
	scale = static_cast<unsigned long long>(1) << (dim * chunk_bits[0]);
}

__declspec(noalias) __forceinline float block_diameter(unsigned long long i1, unsigned long long i2) const noexcept {
	if (i1 > i2) std::swap(i1, i2);
	float s2 = 0.0f;
	int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const unsigned long long varying = (i1 ^ i2) & pextMask[d];
const int nfree_hi = _mm_popcnt_u64(varying);
const int nfree_total = nfree_hi + (levels - chunk_bits[0]);
const float range = step[pd] * (ldexpf(1.0f, nfree_total) - 1.0f);
s2 = fmaf(range, range, s2);
++d;
}
return sqrtf(s2);
}

text
__declspec(noalias) __forceinline void map01ToPoint(const float t, float* const __restrict out) const noexcept {
	unsigned long long accBits[32] = { 0ull };
	int accShifted[32] = { 0 };

	int c = 0;
	while (c < chunks) {
		const int Lc = chunk_bits[c];
		const unsigned long long baseC = chunk_bases[c];
		const float scaled = t * static_cast<float>(baseC);
		unsigned long long idxc = (scaled >= static_cast<float>(baseC)) ? (baseC - 1ull) : static_cast<unsigned long long>(scaled);
		const float u = scaled - static_cast<float>(idxc);
		if (use_gray) idxc = gray_encode(idxc);

		int shift_from_top = 0;
		int k = 0;
		while (k <= c) {
			shift_from_top += chunk_bits[k];
			++k;
		}
		const int inv_shift = levels - shift_from_top;

		int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const unsigned long long mask = pextMaskChunks[static_cast<size_t>(c) * static_cast<size_t>(dim) + static_cast<size_t>(d)];
unsigned long long bits = _pext_u64(idxc, mask);
if (inv_shift >= 0) {
unsigned long long invMaskSegment = 0ull;
if (chunk_bits[c] < 63) {
const unsigned long long take = (static_cast<unsigned long long>(1) << chunk_bits[c]) - 1ull;
invMaskSegment = (invMask[pd] >> inv_shift) & take;
}
bits ^= invMaskSegment;
}
accBits[pd] = (accBits[pd] << Lc) | bits;
accShifted[pd] += Lc;
++d;
}
++c;
}
int d = 0;
#pragma loop ivdep
while (d < dim) {
out[d] = fmaf(step[d], static_cast<float>(accBits[d]), baseOff[d]);
++d;
}
}

text
__declspec(noalias) __forceinline float pointToT(const float* const __restrict q) const noexcept {
	const int bitsFull = levels;
	const int bitsCoarse = chunk_bits[0];
	unsigned long long idx0 = 0ull;
	int d = 0;

#pragma loop ivdep
while (d < dim) {
const int pd = perm[d];
const float v = (q[pd] - baseOff[pd]) * invStep[pd];
const long long cell = static_cast<long long>(_mm_cvt_ss2si(_mm_round_ss(_mm_setzero_ps(), _mm_set_ss(v), _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)));
const long long maxv = (static_cast<long long>(1) << bitsFull) - 1;
unsigned long long b = static_cast<unsigned long long>(cell) >> (bitsFull - bitsCoarse);
unsigned long long invMask0 = 0ull;
if (bitsCoarse < 63) {
const unsigned long long take = (static_cast<unsigned long long>(1) << bitsCoarse) - 1ull;
invMask0 = (invMask[pd] >> (bitsFull - bitsCoarse)) & take;
}
b ^= invMask0;
idx0 |= _pdep_u64(b, pextMask[d]);
++d;
}
if (use_gray) idx0 = gray_decode(idx0);
return (static_cast<float>(idx0) + 0.5f) / static_cast<float>(scale);
}
};

__declspec(noalias) __forceinline void IntervalND::compute_span_level(const MortonND& map) noexcept {
span_level = 0;
int d = 0;
#pragma loop ivdep
while (d < map.dim) {
const unsigned long long varying = (i1 ^ i2) & map.pextMask[d];
span_level += _mm_popcnt_u64(varying);
++d;
}
span_level += (map.levels - map.chunk_bits[0]) * map.dim;
span_level = (std::min)(span_level, 11);
}

__declspec(align(16)) struct ManipCost final {
const int n;
const bool variableLen;
const float targetX, targetY;
const float minTheta;
const float archBiasW, archBiasK;
const float sharpW;

text
__declspec(noalias) __forceinline ManipCost(const int _n, const bool _variableLen, const float _targetX, const float _targetY, const float _minTheta) noexcept
	: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta),
	archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f) {
}

__declspec(noalias) __forceinline float operator()(const float* const __restrict q, float& out_x, float& out_y) const noexcept {
	const float* const th = q;
	const float* const L = variableLen ? (q + n) : nullptr;
	__declspec(align(16)) float phi[32], s_arr[32], c_arr[32];
	float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f, archPen = 0.0f;

	int i = 0;

#pragma loop ivdep
while (i < n) {
phi_acc += th[i];
phi[i] = phi_acc;
++i;
}
FABE13_SINCOS(phi, s_arr, c_arr, n);

text
	const float Lc = 1.0f;
	if (variableLen) {
		i = 0;
		while (i < n) {
			const float Li = L[i];
			x = fmaf(Li, c_arr[i], x);
			y = fmaf(Li, s_arr[i], y);
			++i;
		}
	}
	else {
		i = 0;
		while (i < n) {
			x = fmaf(Lc, c_arr[i], x);
			y = fmaf(Lc, s_arr[i], y);
			++i;
		}
	}

	i = 0;

#pragma loop ivdep
while (i < n) {
const float ai = fabsf(th[i]);
const float v = ai - minTheta;
if (v > 0.0f) {
const float scale = 2.0f / (minTheta + 1.0e-6f);
penC += sharpW * (exp2f(scale * v) - 1.0f);
}
const float t = -th[i] * archBiasK;
float sp;
if (t > 10.0f) {
sp = t;
}
else {
const float e_t = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
sp = log1pf(e_t);
}
archPen += archBiasW * sp;
++i;
}

text
	const float dx = x - targetX, dy = y - targetY;
	const float dist = sqrtf(fmaf(dx, dx, dy * dy));
	out_x = x;
	out_y = y;
	return dist + penC + archPen;
}

};

static __declspec(noalias) __forceinline void HitTest2D_analytic(const float x_param, float& out_x1, float& out_x2) noexcept {
const float a = gActiveMap.a, inv_lenx = gActiveMap.inv_lenx;
const unsigned scale = gActiveMap.scale, scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx, leny = gActiveMap.leny, c = gActiveMap.c;
const unsigned start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
unsigned idx = static_cast<unsigned>(norm * static_cast<float>(scale));
idx = idx > scale_minus_1 ? scale_minus_1 : idx;

float sx = lenx, sy = leny;
float x1 = a, x2 = c;
unsigned type = start;
int l = levels - 1;
while (l >= 0) {
	const unsigned q = (idx >> (l * 2)) & 3u;
	const Step s = g_step_tbl[type][q];
	type = s.next;
	sx *= 0.5f;
	sy *= 0.5f;
	x1 += s.dx ? sx : 0.0f;
	x2 += s.dy ? sy : 0.0f;
	--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;

}

static __declspec(noalias) __forceinline float FindX2D_analytic(const float px, const float py) noexcept {
const float a = gActiveMap.a, b = gActiveMap.b, c = gActiveMap.c, d = gActiveMap.d;
const float lenx = gActiveMap.lenx, leny = gActiveMap.leny;
const unsigned scale = gActiveMap.scale;
const unsigned start = gActiveMap.start;
const int levels = gActiveMap.levels;
const float clamped_px = fminf(fmaxf(px, a), b), clamped_py = fminf(fmaxf(py, c), d);
float sx = lenx, sy = leny;
float x0 = a, y0 = c;
unsigned idx = 0u;
unsigned type = start;
int l = 0;
while (l < levels) {
sx *= 0.5f;
sy *= 0.5f;
const float mx = x0 + sx, my = y0 + sy;
const unsigned tr = static_cast<unsigned>((clamped_px > mx) & (clamped_py > my));
const unsigned tl = static_cast<unsigned>((clamped_px < mx) & (clamped_py > my));
const unsigned dl = static_cast<unsigned>((clamped_px < mx) & (clamped_py < my));
const unsigned none = static_cast<unsigned>(1u ^ (tr | tl | dl));
const unsigned dd = (tr << 1) | tr | tl | (none << 1);
const InvStep inv = g_inv_tbl[type][dd];
type = inv.next;
idx = (idx << 2) | inv.q;
const unsigned dx = dd >> 1, dy = dd & 1u;
x0 += dx ? sx : 0.0f;
y0 += dy ? sy : 0.0f;
++l;
}
const float scale_recip = 1.0f / static_cast<float>(scale);
return fmaf(static_cast<float>(idx) * scale_recip, lenx, a);
}

static __declspec(noalias) __forceinline int generate_lhs_seeds_lite(const MortonND& map, const int dim, float* const __restrict S, const int stride, unsigned seed) noexcept {
int temp_dim = dim;
const int ns = --temp_dim * temp_dim;
unsigned st = seed;
alignas(16) int permutations[32][256];

int d = 0;

#pragma loop ivdep
while (d < dim) {
int s = 0;
#pragma loop ivdep
while (s < ns) {
permutations[d][s] = s;
++s;
}
s = ns - 1;
while (s > 0) {
st ^= st << 13;
st ^= st >> 17;
st ^= st << 5;
const int j = static_cast<int>(st % static_cast<unsigned>(s + 1));
const int t = permutations[d][s];
permutations[d][s] = permutations[d][j];
permutations[d][j] = t;
--s;
}
++d;
}

int s2 = 0;

#pragma loop ivdep
while (s2 < ns) {
d = 0;
#pragma loop ivdep
while (d < dim) {
st ^= st << 13;
st ^= st >> 17;
st ^= st << 5;
const float u = (st & 0xFFFFFF) * 5.9604645e-8f;
const int stratum = permutations[d][s2];
const float pos = (static_cast<float>(stratum) + u) / static_cast<float>(ns);
const int pd = map.perm[d];
const float lo = map.low[pd], hi = map.high[pd];
S[s2 * stride + d] = fmaf(pos, (hi - lo), lo);
++d;
}
++s2;
}
return ns;
}

static __declspec(noalias) __forceinline int generate_heuristic_seeds(const ManipCost& cost, const MortonND& map, const int dim, float* const __restrict S, const int stride, const unsigned seed) noexcept {
const int n = cost.n;
const bool VL = cost.variableLen;
const float tx = cost.targetX, ty = cost.targetY;
int total_seeds = 0;

text
{
	float* const s0 = S + total_seeds * stride;
	float sin_phi, cos_phi;
	const float rho = sqrtf(fmaf(tx, tx, ty * ty));
	FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
	const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
	const float len = fminf(fmaxf(fmaf(1.0f / static_cast<float>(n), rho, 0.0f), 0.5f), 2.0f);
	int i = 0;

#pragma loop ivdep
while (i < n) {
s0[i] = (1.0f / static_cast<float>(n)) * phi;
++i;
}
if (VL) {
i = 0;
while (i < n) {
s0[n + i] = len;
++i;
}
}
++total_seeds;
}

text
{
	float* const s1 = S + total_seeds * stride;
	float sin_phi, cos_phi;
	FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
	const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
	int i = 0;

#pragma loop ivdep
while (i < n) {
s1[i] = fmaf(0.5f, phi, 0.0f) * ((i & 1) ? -1.0f : 1.0f);
++i;
}
if (VL) {
i = 0;
while (i < n) {
s1[n + i] = fmaf(0.4f, static_cast<float>(i) / static_cast<float>(n), 0.8f);
++i;
}
}
++total_seeds;
}

text
{
	float* const s2 = S + total_seeds * stride;
	const float inv = (n > 1) ? 1.0f / static_cast<float>(n - 1) : 0.0f;
	float sin_phi, cos_phi;
	FABE13_SINCOS(&tx, &sin_phi, &cos_phi, 1);
	const float phi = (fabsf(sin_phi) > 0.0f) ? atan2f(ty, tx) : 0.0f;
	int i = 0;

#pragma loop ivdep
while (i < n) {
const float pr = static_cast<float>(i) * inv;
s2[i] = fmaf(phi, fmaf(-0.3f, pr, 1.0f), 0.0f);
++i;
}
if (VL) {
int j = 0;
while (j < n) {
float si;
FABE13_SIN(fmaf(1.5f, static_cast<float>(j), 0.0f), si);
s2[n + j] = fmaf(0.2f, si, 1.0f);
++j;
}
}
++total_seeds;
}

text
const int lhs_count = generate_lhs_seeds_lite(map, dim, S + total_seeds * stride, stride, seed);
total_seeds += lhs_count;
return total_seeds;

}

static __declspec(noalias) void agp_run_branch_mpi(
const MortonND& map, const ManipCost& cost, const int maxIter, const float r, const bool adaptive, const float eps, const unsigned seed,
std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>>& H,
std::vector<float, boost::alignment::aligned_allocator<float, 16u>>& bestQ,
float& bestF, float& bestX, float& bestY, size_t& out_iterations, float& out_achieved_epsilon, const float M_prior = 1e-3f)
noexcept {
const int n = cost.n;
const int dim = n + (cost.variableLen ? n : 0);
const float dim_f = static_cast<float>(dim);
unsigned exchange_counter_500 = 0;
unsigned exchange_counter_T = 0;

text
alignas(16) float M_by_span[12];
int msi = 0;
while (msi < 12) {
	M_by_span[msi++] = M_prior;
}
float Mmax = M_prior;

alignas(16) float q_local[32], phi[32], s_arr[32], c_arr[32], sum_s[32], sum_c[32], q_try[32];
bestQ.reserve(static_cast<size_t>(dim));
float x = 0.0f, y = 0.0f;
int no_improve = 0;

auto t_to_idx = [&](const float t) -> unsigned long long {
	unsigned long long idx = static_cast<unsigned long long>(fmaf(t, static_cast<float>(map.scale), 0.0f));
	return idx;
	};

auto update_pockets_and_Mmax = [&](IntervalND* const I) {
	const int k = I->span_level;
	if (I->M > M_by_span[k]) M_by_span[k] = I->M;
	if (M_by_span[k] > Mmax) Mmax = M_by_span[k];
	};

const float a = 0.0f, b = 1.0f;

auto evalAt = [&](const float t) -> float {
	map.map01ToPoint(t, q_local);
	float f = cost(q_local, x, y);

	if (f < bestF * 1.25f) {
		float acc = 0.0f;
		int ii = 0;
		while (ii < n) {
			acc = fmaf(q_local[ii], 1.0f, acc);
			phi[ii] = acc;
			++ii;
		}
		FABE13_SINCOS(phi, s_arr, c_arr, n);
		float as = 0.0f, ac = 0.0f;
		int k = n - 1;
		while (k >= 0) {
			const float Lk = cost.variableLen ? q_local[n + k] : 1.0f;
			as = fmaf(Lk, s_arr[k], as);
			ac = fmaf(Lk, c_arr[k], ac);
			sum_s[k] = as;
			sum_c[k] = ac;
			--k;
		}
		const float dx = fmaf(x, 1.0f, -cost.targetX);
		const float dy = fmaf(y, 1.0f, -cost.targetY);
		const float dist = sqrtf(fmaf(dx, dx, dy * dy)) + 1.0e-8f;

		float eta = 0.125f;
		int stepI = 0;
		while (stepI < 3) {
			int i = 0;

#pragma loop ivdep
while (i < n) {
float gpen = 0.0f;
{
const float ai = fabsf(q_local[i]);
const float v = ai - cost.minTheta;
if (v > 0.0f) {
const float scale = fmaf(cost.minTheta, 1.0f, 1.0e-6f);
const float e = exp2f(fmaf(2.0f / scale, v, 0.0f));
const float dpen_dtheta = cost.sharpW * fmaf(e, fmaf(0.69314718055994530941723212145818f, 2.0f / scale, 0.0f), 0.0f) * (copysignf(1.0f, q_local[i]));
gpen = fmaf(dpen_dtheta, 1.0f, gpen);
}
}
{
const float tsg = fmaf(-q_local[i], cost.archBiasK, 0.0f);
const float sig = 1.0f / fmaf(expf(-tsg), 1.0f, 1.0f);
gpen = fmaf(-(cost.archBiasW * cost.archBiasK), sig, gpen);
}

text
				const float g = fmaf(fmaf(dx, -sum_s[i], fmaf(dy, sum_c[i], 0.0f)), 1.0f / dist, gpen);
				q_try[i] = fmaf(-eta, g, q_local[i]);

				const float lo0 = -1.0471975511965977461542144610932f;
				const float hi0 = 2.6179938779914943653855361527329f;
				const float lo = -2.6179938779914943653855361527329f;
				const float hi = 2.6179938779914943653855361527329f;
				const float Lb = (i == 0) ? lo0 : lo;
				const float Hb = (i == 0) ? hi0 : hi;
				if (q_try[i] < Lb) q_try[i] = Lb;
				else if (q_try[i] > Hb) q_try[i] = Hb;
				++i;
			}
			if (cost.variableLen) {
				int j = 0;

#pragma loop ivdep
while (j < n) {
const float g = fmaf(fmaf(dx, c_arr[j], fmaf(dy, s_arr[j], 0.0f)), 1.0f / dist, 0.0f);
float v = fmaf(-eta, g, q_local[n + j]);
if (v < 0.5f) v = 0.5f;
else if (v > 2.0f) v = 2.0f;
q_try[n + j] = v;
++j;
}
}
float x2, y2;
const float f2 = cost(q_try, x2, y2);
if (f2 < f) {
memcpy(q_local, q_try, static_cast<size_t>(dim) * sizeof(float));
f = f2;
x = x2;
y = y2;
break;
}
eta = fmaf(eta, 0.5f, 0.0f);
++stepI;
}

text
		const int last = n - 1;
		const float lo = (last == 0) ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f;
		const float hi = 2.6179938779914943653855361527329f;
		float bestLocF = f;
		float saved = q_local[last];
		float delta = 0.05f;
		while (delta >= 0.00625f) {
			int sgn = -1;
			while (sgn <= 1) {
				float cand = fmaf(static_cast<float>(sgn), delta, saved);
				if (cand < lo) cand = lo;
				else if (cand > hi) cand = hi;
				const float backup = q_local[last];
				q_local[last] = cand;
				float x2, y2;
				const float f2 = cost(q_local, x2, y2);
				if (f2 < bestLocF) {
					bestLocF = f2;
					x = x2;
					y = y2;
					saved = cand;
				}
				q_local[last] = backup;
				sgn += 2;
			}
			delta = fmaf(delta, 0.5f, 0.0f);
		}
		if (bestLocF < f) {
			q_local[last] = saved;
			f = bestLocF;
		}
	}

	if (f < bestF) {
		bestF = f;
		bestQ.assign(q_local, q_local + dim);
		bestX = x;
		bestY = y;
		no_improve = 0;
	}
	else {
		++no_improve;
	}
	return f;
	};

const float f_a = evalAt(a), f_b = evalAt(b);
const float Kf = fminf(fmaxf(fmaf(2.0f, dim_f, 0.0f), 8.0f), 128.0f);
const int K = static_cast<int>(Kf);

H.reserve(static_cast<size_t>(maxIter) + static_cast<size_t>(K) + 16u);
const int rank = g_world->rank();
const int world = g_world->size();

alignas(16) float seeds[256 * 32];
const int seedCnt = generate_heuristic_seeds(cost, map, dim, seeds, 32, fmaf(static_cast<float>(rank), 7919.0f, static_cast<float>(seed)));

int i = 0;
while (i < seedCnt) {
	const float* const s = seeds + static_cast<size_t>(fmaf(static_cast<float>(i), 32.0f, 0.0f));
	const float t_seed = map.pointToT(s);
	const float interval_size = (i < 3) ? fmaf(0.0004f, static_cast<float>(dim), 0.0f)
		: fmaf(fmaf(0.00031f, static_cast<float>(dim), 0.0f),
			exp2f(fmaf((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(0.00025f, 1.0f / 0.00031f, 0.0f)),
				static_cast<float>(i - 3), 0.0f)),
			0.0f);
	const float t1 = fmaxf(a, fmaf(-interval_size, 1.0f, t_seed));
	const float t2 = fminf(b, fmaf(interval_size, 1.0f, t_seed));
	if (t2 > t1) {
		alignas(16) float q1[32], q2[32];
		float x1, y1, x2, y2;
		map.map01ToPoint(t1, q1);
		const float f1 = cost(q1, x1, y1);
		map.map01ToPoint(t2, q2);
		const float f2 = cost(q2, x2, y2);
		IntervalND* const I = new IntervalND(t1, t2, f1, f2);
		I->i1 = t_to_idx(t1);
		I->i2 = t_to_idx(t2);
		I->diam = map.block_diameter(I->i1, I->i2);
		I->compute_span_level(map);
		I->set_metric(I->diam);
		update_pockets_and_Mmax(I);
		I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
		if (i < 3) {
			I->R = fmaf(I->R, fmaf(0.01f, static_cast<float>(dim), 0.85f), 0.0f);
		}
		else {
			const float start_mult = fmaf(0.214f, static_cast<float>(dim), 0.0f);
			const float end_mult = fmaf(0.174f, static_cast<float>(dim), 0.0f);
			const float mult = fmaf(start_mult,
				exp2f(fmaf((1.0f / static_cast<float>(seedCnt - 4)) * log2f(fmaf(end_mult, 1.0f / start_mult, 0.0f)),
					static_cast<float>(i - 3), 0.0f)),
				0.0f);
			I->R = fmaf(I->R, mult, 0.0f);
		}
		H.emplace_back(I);
		std::push_heap(H.begin(), H.end(), ComparePtrND);
		if (f1 < bestF) {
			bestF = f1;
			bestQ.assign(q1, q1 + dim);
			bestX = x1;
			bestY = y1;
		}
		if (f2 < bestF) {
			bestF = f2;
			bestQ.assign(q2, q2 + dim);
			bestX = x2;
			bestY = y2;
		}
	}
	++i;
}

float prev_t = a, prev_f = f_a;
int k = 1;
while (k <= K) {
	const float t = fmaf(fmaf((b - a), static_cast<float>(k) / static_cast<float>(K + 1), a),
		1.0f,
		static_cast<float>(rank) / static_cast<float>(world * (K + 1)));
	const float f = evalAt(t);
	IntervalND* const I = new IntervalND(prev_t, t, prev_f, f);
	I->i1 = t_to_idx(prev_t);
	I->i2 = t_to_idx(t);
	I->diam = map.block_diameter(I->i1, I->i2);
	I->compute_span_level(map);
	I->set_metric(I->diam);
	update_pockets_and_Mmax(I);
	I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	prev_t = t;
	prev_f = f;
	++k;
}
IntervalND* const tail = new IntervalND(prev_t, b, prev_f, f_b);
tail->i1 = t_to_idx(prev_t);
tail->i2 = t_to_idx(b);
tail->diam = map.block_diameter(tail->i1, tail->i2);
tail->compute_span_level(map);
tail->set_metric(tail->diam);
update_pockets_and_Mmax(tail);
tail->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtrND);

float dmax = fmaf(b, 1.0f, -a);
const float initial_len = dmax;
const float thr03 = fmaf(0.3f, initial_len, 0.0f);
const float inv_thr03 = 1.0f / thr03;
int it = 0;

float kickEveryDimf = fmaf(120.0f, exp2f(fmaf(-0.05f, dim_f, 0.0f)), 0.0f);
if (kickEveryDimf < 60.0f) kickEveryDimf = 60.0f;
const int kickEveryDim = static_cast<int>(kickEveryDimf);

float noImproveThrDimf = fmaf(80.0f, exp2f(fmaf(-0.08f, dim_f, 0.0f)), 0.0f);
if (noImproveThrDimf < 30.0f) noImproveThrDimf = 30.0f;
const int noImproveThrDim = static_cast<int>(noImproveThrDimf);

auto kickEveryByDim = [&](const int d) -> int {
	float z = fmaf(120.0f, exp2f(fmaf(-0.05f, static_cast<float>(d), 0.0f)), 0.0f);
	if (z < 60.0f) z = 60.0f;
	return static_cast<int>(z);
	};

auto noImproveThrByDim = [&](const int d) -> int {
	float z = fmaf(80.0f, exp2f(fmaf(-0.08f, static_cast<float>(d), 0.0f)), 0.0f);
	if (z < 30.0f) z = 30.0f;
	return static_cast<int>(z);
	};

while (it < maxIter) {
	if ((it % kickEveryDim) == 0 && no_improve > noImproveThrDim) {
		const float t_best = map.pointToT(bestQ.data());
		int ii = 0;
		while (ii < 2) {
			const float off = (ii == 0) ? 0.01f : -0.01f;
			const float t_seed = fminf(b, fmaxf(a, fmaf(off, 1.0f, t_best)));
			const float f_seed = evalAt(t_seed);
			IntervalND* const J = new IntervalND(fmaf(-0.005f, 1.0f, t_seed), fmaf(0.005f, 1.0f, t_seed), f_seed, f_seed);
			J->i1 = t_to_idx(fmaf(-0.005f, 1.0f, t_seed));
			J->i2 = t_to_idx(fmaf(0.005f, 1.0f, t_seed));
			J->diam = map.block_diameter(J->i1, J->i2);
			J->compute_span_level(map);
			J->set_metric(J->diam);
			update_pockets_and_Mmax(J);
			J->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
			J->R = fmaf(J->R, 0.9f, 0.0f);
			H.emplace_back(J);
			std::push_heap(H.begin(), H.end(), ComparePtrND);
			++ii;
		}
		no_improve = 0;
	}

	const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
	const bool stagnation = (no_improve > 100) && (it > 270);

	const float exp_arg = fmaf(-0.06f, dim_f, 0.0f);
	const float exp2_exp_arg = exp2f(exp_arg);
	const float A = fmaf(64.0f, exp2_exp_arg, 200.0f);
	const float B = fmaf(67.0f, exp2_exp_arg, 210.0f);
	const int T = static_cast<int>(fmaf(-expm1f(p), A, B));

	const float p_arg = fmaf(p, 2.3f, -3.0f);
	const float r_eff = fmaf(-fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, fmaf(p_arg, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f, 1.05f);

	std::pop_heap(H.begin(), H.end(), ComparePtrND);
	IntervalND* const cur = H.back();
	H.pop_back();

	const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
	float m = fmaf(r_eff, Mmax, 0.0f);
	float tNew = step(m, x1, x2, y1, y2, dim_f, r);
	tNew = fminf(fmaxf(tNew, a), b);
	const float fNew = evalAt(tNew);

	IntervalND* const L = new IntervalND(x1, tNew, y1, fNew);
	IntervalND* const Rv = new IntervalND(tNew, x2, fNew, y2);

	L->i1 = t_to_idx(x1);
	L->i2 = t_to_idx(tNew);
	Rv->i1 = t_to_idx(tNew);
	Rv->i2 = t_to_idx(x2);
	L->diam = map.block_diameter(L->i1, L->i2);
	Rv->diam = map.block_diameter(Rv->i1, Rv->i2);
	L->compute_span_level(map);
	Rv->compute_span_level(map);
	L->set_metric(L->diam);
	Rv->set_metric(Rv->diam);

	const float Mloc = fmaxf(L->M, Rv->M);
	update_pockets_and_Mmax(L);
	update_pockets_and_Mmax(Rv);

	const float prevMmax = Mmax;
	if (Mloc > Mmax) Mmax = Mloc;
	m = fmaf(r_eff, Mmax, 0.0f);

	if (adaptive) {
		const float len1 = fmaf(tNew, 1.0f, -x1);
		const float len2 = fmaf(x2, 1.0f, -tNew);
		if (fmaf(len1, 1.0f, len2) == dmax) {
			dmax = fmaxf(len1, len2);
			for (auto pI : H) {
				const float Ls = fmaf(pI->x2, 1.0f, -pI->x1);
				if (Ls > dmax) dmax = Ls;
			}
		}
		if ((thr03 > dmax && !(it % 3)) || (fmaf(10.0f, dmax, 0.0f) < initial_len)) {
			const float progress = fmaf(-inv_thr03, dmax, 1.0f);
			const float alpha = fmaf(progress, progress, 0.0f);
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r_eff, -MULT);
			const float GF = fmaf(beta, global_coeff, 0.0f);
			L->ChangeCharacteristic(fmaf(GF, len1, fmaf(L->M, alpha, 0.0f)));
			Rv->ChangeCharacteristic(fmaf(GF, len2, fmaf(Rv->M, alpha, 0.0f)));
			const size_t sz = H.size();
			RecomputeR_AffineM_AVX2_ND(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtrND);
		}
		else {
			if (Mloc > prevMmax) {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
				if (Mloc > fmaf(1.15f, prevMmax, 0.0f)) {
					const size_t sz = H.size();
					RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			else {
				L->ChangeCharacteristic(m);
				Rv->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (Mloc > prevMmax) {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
			if (Mloc > fmaf(1.15f, prevMmax, 0.0f)) {
				const size_t sz = H.size();
				RecomputeR_ConstM_AVX2_ND(H.data(), sz, m);
				std::make_heap(H.begin(), H.end(), ComparePtrND);
			}
		}
		else {
			L->ChangeCharacteristic(m);
			Rv->ChangeCharacteristic(m);
		}
	}

	H.emplace_back(L);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	H.emplace_back(Rv);
	std::push_heap(H.begin(), H.end(), ComparePtrND);
	_mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0);
	_mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0);

	IntervalND* const top = H.front();
	const float interval_len = top->x2 - top->x1;

	if ((exp2f((1.0f / dim_f) * log2f(interval_len)) < eps) || (it == maxIter)) {
		out_iterations = static_cast<size_t>(it);
		out_achieved_epsilon = interval_len;
		return;
	}

	if (!(it % T)) {
		MultiCrossMsg out;
		out.count = 3;
		float* dest = out.intervals;
		IntervalND* const t1 = H[0];
		IntervalND* const t2 = H[1];
		IntervalND* const t3 = H[2];
		IntervalND* const tops[3] = { t1, t2, t3 };
		unsigned i2 = 0;
		while (i2 < 3) {
			IntervalND* const Tt = tops[i2];
			dest[0] = Tt->x1;
			dest[1] = 0.0f;
			dest[2] = Tt->x2;
			dest[3] = 0.0f;
			dest[4] = Tt->R;
			dest += 5;
			++i2;
		}
		const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
		bool active = true;
		const bool invert_T = static_cast<int>(fmaf(static_cast<float>(exchange_counter_T), 1.0f, 1.0f)) & 1;

		size_t ii = 0;
		while (ii < iterations && active) {
			const size_t step = 1ULL << ii;
			const int partner = rank ^ static_cast<int>(step);
			if (partner < world) {
				const bool am_sender = (!!(rank & static_cast<int>(step))) ^ invert_T;
				if (am_sender) {
					g_world->isend(partner, 0, out);
					active = false;
				}
			}
			++ii;
		}
		++exchange_counter_T;
	}

	if (!(it % 500)) {
		BestSolutionMsg out;
		out.bestF = bestF;
		out.bestX = bestX;
		out.bestY = bestY;
		out.dim = static_cast<unsigned>(bestQ.size());
		memcpy(out.bestQ, bestQ.data(), bestQ.size() * sizeof(float));
		const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
		bool active = true;
		const bool invert_T = static_cast<int>(fmaf(static_cast<float>(exchange_counter_500), 1.0f, 1.0f)) & 1;

		size_t ii = 0;
		while (ii < iterations && active) {
			const size_t step = 1ULL << ii;
			const int partner = rank ^ static_cast<int>(step);
			if (partner < world) {
				const bool am_sender = (!!(rank & static_cast<int>(step))) ^ invert_T;
				if (am_sender) {
					g_world->isend(partner, 2, out);
					active = false;
				}
			}
			++ii;
		}
		++exchange_counter_500;
	}

	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		MultiCrossMsg in;
		g_world->recv(boost::mpi::any_source, 0, in);
		const MultiCrossMsg& mX = in;
		unsigned ii = 0;
		while (ii < mX.count) {
			const float* const d = &mX.intervals[ii * 5];
			float sx = d[0], ex = d[2];
			if (ex > sx) {
				alignas(16) float tmp[32];
				float tx, ty;
				map.map01ToPoint(sx, tmp);
				const float y1i = cost(tmp, tx, ty);
				map.map01ToPoint(ex, tmp);
				const float y2i = cost(tmp, tx, ty);
				IntervalND* const inj = new IntervalND(sx, ex, y1i, y2i);
				inj->i1 = t_to_idx(sx);
				inj->i2 = t_to_idx(ex);
				inj->diam = map.block_diameter(inj->i1, inj->i2);
				inj->compute_span_level(map);
				inj->set_metric(inj->diam);
				update_pockets_and_Mmax(inj);
				inj->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
				_mm_prefetch(reinterpret_cast<const char*>(H[0]), _MM_HINT_T0);
				_mm_prefetch(reinterpret_cast<const char*>(H[1]), _MM_HINT_T0);
				IntervalND* const topH = H.front();
				if (inj->R > fmaf(1.15f, topH->R, 0.0f)) {
					const float p2 = fmaf(-1.0f / initial_len, dmax, 1.0f);
					const float kf = (no_improve > 100 && it > 270) ? fmaf(0.5819767068693265f, expm1f(p2), 0.3f)
						: fmaf(0.3491860241215959f, expm1f(p2), 0.6f);
					inj->R = fmaf(d[4], kf, 0.0f);
					H.emplace_back(inj);
					std::push_heap(H.begin(), H.end(), ComparePtrND);
				}
			}
			++ii;
		}
	}
	while (g_world->iprobe(boost::mpi::any_source, 2)) {
		BestSolutionMsg bm;
		g_world->recv(boost::mpi::any_source, 2, bm);
		if (bm.bestF < fmaf(bestF, 1.15f, 0.0f)) {
			alignas(16) float tmp_q[32];
			memcpy(tmp_q, bm.bestQ, bm.dim * sizeof(float));
			const float t_best = map.pointToT(tmp_q);
			const float t1 = fmaxf(a, fmaf(-0.001f, 1.0f, t_best));
			const float t2 = fminf(b, fmaf(0.001f, 1.0f, t_best));
			if (t2 > t1) {
				alignas(16) float tq1[32], tq2[32];
				float xx1, yy1, xx2, yy2;
				map.map01ToPoint(t1, tq1);
				const float f1 = cost(tq1, xx1, yy1);
				map.map01ToPoint(t2, tq2);
				const float f2 = cost(tq2, xx2, yy2);
				IntervalND* const I = new IntervalND(t1, t2, f1, f2);
				I->i1 = t_to_idx(t1);
				I->i2 = t_to_idx(t2);
				I->diam = map.block_diameter(I->i1, I->i2);
				I->compute_span_level(map);
				I->set_metric(I->diam);
				update_pockets_and_Mmax(I);
				I->ChangeCharacteristic(fmaf(r, Mmax, 0.0f));
				I->R = fmaf(I->R, 0.90f, 0.0f);
				H.emplace_back(I);
				std::push_heap(H.begin(), H.end(), ComparePtrND);
			}
			if (bm.bestF < bestF) {
				bestF = bm.bestF;
				bestX = bm.bestX;
				bestY = bm.bestY;
				bestQ.assign(bm.bestQ, bm.bestQ + bm.dim);
			}
		}
	}
	++it;
}

}

static __declspec(noalias) __forceinline float PivotCalculation(std::vector<IntervalND*>::iterator first, std::vector<IntervalND*>::iterator last) noexcept {
const auto mid = first + ((last - first) >> 1);
float pivot_value = NAN;
if (last - first < 199) {
pivot_value = (*mid)->R;
}
else {
if ((*first)->R < (*mid)->R) {
if ((*mid)->R < (*last)->R) {
pivot_value = (*mid)->R;
}
else {
pivot_value = std::max((*first)->R, (*last)->R);
}
}
else {
if ((*first)->R < (*last)->R) {
pivot_value = (*first)->R;
}
else {
pivot_value = std::max((*mid)->R, (*last)->R);
}
}
}
return pivot_value;
}

static __declspec(noalias) __forceinline void HoaraSort(std::vector<IntervalND*>::iterator first, std::vector<IntervalND*>::iterator last) noexcept {
if (first >= last) {
return;
}
const float pivot_value = PivotCalculation(first, last);
auto left = first;
auto right = last;
do {
while (left < last && (*left)->R < pivot_value) {
left++;
}
while (right > first && (*right)->R > pivot_value) {
right--;
}
if ((*left)->R == (*right)->R && left != right) {
if ((left)->R < ((left + 1))->R) {
left++;
}
else {
right--;
}
}
std::iter_swap(left, right);
} while (left != right);
if (last - first < 199) {
HoaraSort(first, right);
HoaraSort(left + 1, last);
}
else {
oneapi::tbb::parallel_invoke(&first, &right { HoaraSort(first, right); },
&left, &last { HoaraSort(left + 1, last); });
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(const int nSegments, const bool variableLengths, const float minTheta, const float targetX, const float targetY,
const int peanoLevels, const int maxIterPerBranch, const float r, const bool adaptiveMode, const float epsilon,
const unsigned int seed, float** const out_bestQ, size_t* const out_bestQLen, float* const out_bestX,
float* const out_bestY, float* const out_bestF, size_t* const out_iterations, float* const out_achieved_epsilon)
noexcept {
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg dummy;
g_world->recv(boost::mpi::any_source, 0, dummy);
}
while (g_world->iprobe(boost::mpi::any_source, 2)) {
BestSolutionMsg dummy;
g_world->recv(boost::mpi::any_source, 2, dummy);
}
const int dim = nSegments + (variableLengths ? nSegments : 0);

text
g_mc.permCache.resize(static_cast<size_t>(dim));
int i = 0;
while (i < dim) {
	g_mc.permCache[i] = i;
	++i;
}
unsigned s = g_mc.baseSeed;
i = dim - 1;
while (i > 0) {
	s ^= s << 13;
	s ^= s >> 17;
	s ^= s << 5;
	const unsigned j = s % static_cast<unsigned>(i + 1);
	std::swap(g_mc.permCache[i], g_mc.permCache[j]);
	--i;
}
g_mc.invMaskCache.resize(static_cast<size_t>(dim));
int k = 0;
while (k < dim) {
	s ^= s << 13;
	s ^= s >> 17;
	s ^= s << 5;
	g_mc.invMaskCache[k] = static_cast<unsigned long long>(s);
	++k;
}

std::vector<float, boost::alignment::aligned_allocator<float, 16u>> low;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> high;
low.reserve(static_cast<size_t>(dim));
high.reserve(static_cast<size_t>(dim));
i = 0;
while (i < nSegments) {
	low.emplace_back(i == 0 ? -1.0471975511965977461542144610932f : -2.6179938779914943653855361527329f);
	high.emplace_back(2.6179938779914943653855361527329f);
	++i;
}
if (variableLengths) {
	i = 0;
	while (i < nSegments) {
		low.emplace_back(0.5f);
		high.emplace_back(2.0f);
		++i;
	}
}

const ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);

const int rank = g_world->rank(), world = g_world->size();
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ;
float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;

const int levels0 = static_cast<int>(fminf(static_cast<float>(peanoLevels), 8.0f));
const int maxIter0 = static_cast<int>(fmaf(static_cast<float>(maxIterPerBranch), 0.2f, 0.0f));
const MortonND map0(dim, levels0, low.data(), high.data(), g_mc);

std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>> H_coarse;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ_coarse;
float bestF_coarse = FLT_MAX, bx_coarse = 0.0f, by_coarse = 0.0f;
size_t total_oi = 0u;
float total_oe = 0.0f;
size_t oi = 0u;
float oe = 0.0f;

const float base_M_prior_factor = fmaf(2.0f, static_cast<float>(nSegments), variableLengths ? 1.41421356f : 0.0f);

float M_prior = fmaf(base_M_prior_factor,
	ldexpf(1.0f, -levels0),
	0.0f);

agp_run_branch_mpi(map0, cost, maxIter0, r, adaptiveMode, epsilon, seed,
	H_coarse, bestQ_coarse, bestF_coarse, bx_coarse, by_coarse, oi, oe, M_prior);

total_oi += oi;
total_oe = oe;

if (bestF_coarse < bestF) {
	bestF = bestF_coarse;
	bestQ = std::move(bestQ_coarse);
	bx = bx_coarse;
	by = by_coarse;
}

if (levels0 < peanoLevels) {
	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		MultiCrossMsg dummy;
		g_world->recv(boost::mpi::any_source, 0, dummy);
	}
	while (g_world->iprobe(boost::mpi::any_source, 2)) {
		BestSolutionMsg dummy;
		g_world->recv(boost::mpi::any_source, 2, dummy);
	}
	const MortonND map1(dim, peanoLevels, low.data(), high.data(), g_mc);
	std::vector<IntervalND*, boost::alignment::aligned_allocator<IntervalND*, 16u>> H_fine;
	std::vector<float, boost::alignment::aligned_allocator<float, 16u>> bestQ_fine = bestQ;
	float bestF_fine = bestF, bx_fine = bx, by_fine = by;
	size_t oi_fine = 0u;
	float oe_fine = 0.0f;

	float M_prior_fine = fmaf(base_M_prior_factor,
		ldexpf(1.0f, -peanoLevels),
		0.0f);

	HoaraSort(H_coarse.begin(), H_coarse.end() - 1);
	const float inv_dim = 1.0f / static_cast<float>(dim + 1);
	size_t ci = static_cast<size_t>(fmaf(static_cast<float>(H_coarse.size()), fmaf(fmaf(inv_dim, fmaf(inv_dim, fmaf(inv_dim, fmaf(inv_dim, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f), 1.0f, -0.7f), 0.0f));
	while (ci < H_coarse.size()) {
		const IntervalND* const C = H_coarse[ci];
		alignas(16) float q1[32], q2[32];
		float x1, y1, x2, y2;
		map1.map01ToPoint(C->x1, q1);
		const float f1 = cost(q1, x1, y1);
		map1.map01ToPoint(C->x2, q2);
		const float f2 = cost(q2, x2, y2);
		IntervalND* const I = new IntervalND(C->x1, C->x2, f1, f2);
		I->i1 = static_cast<unsigned long long>(fmaf(C->x1, static_cast<float>(map1.scale), 0.0f));
		I->i2 = static_cast<unsigned long long>(fmaf(C->x2, static_cast<float>(map1.scale), 0.0f));
		I->diam = map1.block_diameter(I->i1, I->i2);
		I->set_metric(I->diam);
		H_fine.emplace_back(I);
		if (f1 < bestF_fine) {
			bestF_fine = f1;
			bestQ_fine.assign(q1, q1 + dim);
			bx_fine = x1;
			by_fine = y1;
		}
		if (f2 < bestF_fine) {
			bestF_fine = f2;
			bestQ_fine.assign(q2, q2 + dim);
			bx_fine = x2;
			by_fine = y2;
		}
		++ci;
	}
	std::make_heap(H_fine.begin(), H_fine.end(), ComparePtrND);
	agp_run_branch_mpi(map1, cost, fmaf(static_cast<float>(maxIterPerBranch), 1.0f, -static_cast<float>(maxIter0)), r, adaptiveMode, epsilon, seed,
		H_fine, bestQ_fine, bestF_fine, bx_fine, by_fine, oi_fine, oe_fine, M_prior_fine);

	total_oi += oi_fine;
	total_oe = oe_fine;

	if (bestF_fine < bestF) {
		bestF = bestF_fine;
		bestQ = std::move(bestQ_fine);
		bx = bx_fine;
		by = by_fine;
	}
}

BestSolutionMsg best;
best.bestF = bestF;
best.bestX = bx;
best.bestY = by;
best.dim = static_cast<unsigned>(bestQ.size());
memcpy(best.bestQ, bestQ.data(), static_cast<size_t>(best.dim) * sizeof(float));

const size_t iterations = std::bit_width(static_cast<size_t>(world - 1));
bool active = true;

size_t itx = 0;
while (itx < iterations && active) {
	const size_t step = 1ULL << itx;
	const int partner = rank ^ static_cast<int>(step);

	if (partner < world) {
		const bool am_sender = (rank & static_cast<int>(step)) != 0;
		if (am_sender) {
			g_world->isend(partner, 3, best);
			active = false;
		}
		else {
			BestSolutionMsg in;
			g_world->recv(partner, 3, in);
			if (in.bestF < best.bestF) best = in;
		}
	}
	++itx;
}

if (rank == 0) {
	*out_bestQLen = static_cast<size_t>(best.dim);
	*out_bestQ = static_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_bestQLen)));
	memcpy(*out_bestQ, best.bestQ, sizeof(float) * (*out_bestQLen));
	*out_bestX = best.bestX;
	*out_bestY = best.bestY;
	*out_bestF = best.bestF;
	*out_iterations = total_oi;
	*out_achieved_epsilon = total_oe;
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
const int rank = g_world->rank();
const int world_size = g_world->size();
guard = new pybind11::scoped_interpreter();
sys = pybind11::module::import("sys");
path = sys.attr("path");
path.attr("append")(".");
optimizer_bridge = pybind11::module::import("optimizer_bridge");
if (world_size == 4) {
new (&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank & 3);
}
g_mc.baseSeed = fmaf(0x9E3779B9u, static_cast<float>(rank), 0x9E3779B9u);
return rank;
}

static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept {
int i = 0;
float st = seed, r1, r2, res = 0.0f;
#pragma loop ivdep
while (i < 10) {
XOR_RAND(st, r1);
const float xp = fmaf(-r1, 10.0f, x);
XOR_RAND(st, r1);
XOR_RAND(st, r2);
float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
res -= (1.0f / d) * 1.0f;
++i;
}
return res;
}

static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept {
int j = 0;
__declspec(align(16)) float ang[14u];
const float st_ang = 6.28318530717958647692f * x;
while (j < 14) {
ang[j] = st_ang * static_cast<float>(j + 1);
++j;
}
__declspec(align(16)) float sv[14u], cv[14u];
FABE13_SINCOS(ang, sv, cv, 14u);
float state = seed, r1, r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;
#pragma loop ivdep
while (j >= 0) {
XOR_RAND(state, r1);
XOR_RAND(state, r2);
res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
--j;
}
return res;
}

static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept {
int j = 0;
__declspec(align(16)) float aj[8u], ak[8u];
#pragma loop ivdep
while (j < 8) {
const float pj = 3.14159265358979323846f * static_cast<float>(j + 1);
aj[j] = pj * x1;
ak[j] = pj * x2;
++j;
}
__declspec(align(16)) float sj[8u], cj[8u], sk[8u], ck[8u];
FABE13_SINCOS(aj, sj, cj, 8u);
FABE13_SINCOS(ak, sk, ck, 8u);
--j;
float p1 = 0.0f, p2 = 0.0f;
float st = seed, r1, r2;
#pragma loop ivdep
while (j >= 0) {
size_t k2 = 0u;
while (k2 < 8u) {
const float s = sj[j] * sj[j];
const float c = ck[k2] * ck[k2];
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p1 = fmaf(r1, s, fmaf(r2, c, p1));
XOR_RAND_GRSH(st, r1);
XOR_RAND_GRSH(st, r2);
p2 = fmaf(-r1, c, fmaf(r2, s, p2));
++k2;
}
--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations, const float a, const float b, const float r, const bool mode, const float epsilon, const float seed,
float** const out_data, size_t* const out_len) noexcept {
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
int counter = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a, x_Rmax_2 = b;
float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D*, boost::alignment::aligned_allocator<Interval1D*, 16u>> R;
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval1D(a, b, start_val, best_f, 1.0f));
float Mmax = R.front()->M;
float m = r * Mmax;

text
while (true) {
	const float new_point = step(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
	const float new_value = ShekelFunc(new_point, seed);
	if (new_value < best_f) {
		best_f = new_value;
		Extr.emplace_back(best_f);
		Extr.emplace_back(new_point);
	}
	std::pop_heap(R.begin(), R.end(), ComparePtr1D);
	const Interval1D* const pro = R.back();
	const float new_x1 = pro->x1, new_x2 = pro->x2;
	const float len2 = new_x2 - new_point, len1 = new_point - new_x1;
	const float interval_len = (len1 < len2 ? len1 : len2);
	if (++counter == static_cast<int>(global_iterations) || interval_len < epsilon) {
		Extr.emplace_back(static_cast<float>(counter));
		Extr.emplace_back(interval_len);
		*out_len = Extr.size();
		*out_data = static_cast<float*>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
		memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
		return;
	}
	Interval1D* const curr = new Interval1D(new_x1, new_point, pro->y1, new_value, 1.0f);
	Interval1D* const curr1 = new Interval1D(new_point, new_x2, new_value, pro->y2, 1.0f);
	const float currM = curr->M > curr1->M ? curr->M : curr1->M;
	const size_t r_size = R.size();
	if (mode) {
		if (len2 + len1 == dmax) {
			dmax = len2 > len1 ? len2 : len1;
			for (auto p : R) {
				const float L = p->x2 - p->x1;
				if (L > dmax) dmax = L;
			}
		}
		if (threshold_03 > dmax && !(counter % 3) || 10.0f * dmax < initial_length) {
			if (currM > Mmax) {
				Mmax = currM;
				m = r * Mmax;
			}
			const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
			const float alpha = progress * progress;
			const float betta = 2.0f - alpha;
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r, -MULT);
			const float GF = betta * global_coeff;
			curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
			curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
			RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GF, alpha);
			std::make_heap(R.begin(), R.end(), ComparePtr1D);
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
				}
				else {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
					RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
					std::make_heap(R.begin(), R.end(), ComparePtr1D);
				}
			}
			else {
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (currM > Mmax) {
			if (currM < 1.15f * Mmax) {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
			else {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
				RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
				std::make_heap(R.begin(), R.end(), ComparePtr1D);
			}
		}
		else {
			curr->ChangeCharacteristic(m);
			curr1->ChangeCharacteristic(m);
		}
	}
	R.back() = curr;
	std::push_heap(R.begin(), R.end(), ComparePtr1D);
	R.emplace_back(curr1);
	std::push_heap(R.begin(), R.end(), ComparePtr1D);
	const Interval1D* const top = R.front();
	x_Rmax_1 = top->x1;
	x_Rmax_2 = top->x2;
	y_Rmax_1 = top->y1;
	y_Rmax_2 = top->y2;
}

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(const float N, const float global_iterations, const float a, const float b, const float c,
const float d, const float r, const bool mode, const float epsilon, const float seed,
float** const out_data, size_t* const out_len) noexcept {
Slab* const __restrict slab = tls.local();
slab->current = slab->base;
int counter = 0, no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg dummy;
g_world->recv(boost::mpi::any_source, 0, dummy);
}
const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider, y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition, true_end = b - x_addition;
float x_Rmax_1 = true_start, x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length, inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val, y_Rmax_2 = best_f;
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval1D* __restrict, boost::alignment::aligned_allocator<Interval1D* __restrict, 16u>> R;
Extr.clear();
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear();
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval1D(true_start, true_end, start_val, best_f, 2.0f));
const Interval1D* __restrict top_ptr;
float Mmax = R.front()->M, m = r * Mmax;
while (true) {
const float interval_len = x_Rmax_2 - x_Rmax_1;
const bool stagnation = no_improve > 100 && counter > 270;
const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
while (g_world->iprobe(boost::mpi::any_source, 0)) {
MultiCrossMsg in;
g_world->recv(boost::mpi::any_source, 0, in);
const MultiCrossMsg& mX = in;
unsigned ii = 0;
while (ii < mX.count) {
const float* const d2 = &mX.intervals[ii * 5];
float sx = d2[0], ex = d2[2];
if (ex > sx) {
Interval1D* const __restrict injected = new Interval1D(sx, ex,
RastriginFunc(d2[0], d2[1]), RastriginFunc(d2[2], d2[3]), 2.0f);
injected->ChangeCharacteristic(m);
if (injected->R > 1.15f * top_ptr->R) {
const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
injected->R = d2[4] * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr1D);
}
}
++ii;
}
}
const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
const bool want_term = interval_len < epsilon || counter == static_cast<int>(global_iterations);
if (!(++counter % T) || stagnation) {
if (!want_term) {
MultiCrossMsg out;
float s_x1, s_x2, e_x1, e_x2;
HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
out.intervals[0] = s_x1;
out.intervals[1] = s_x2;
out.intervals[2] = e_x1;
out.intervals[3] = e_x2;
out.intervals[4] = top_ptr->R;
out.count = 1;
int i2 = 0;
while (i2 < world_size) {
if (i2 != rank) g_world->isend(i2, 0, out);
++i2;
}
}
}
if (want_term) {
if (!rank) {
Extr.emplace_back(static_cast<float>(counter));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (out_len)));
memcpy(out_data, Extr.data(), sizeof(float) * (out_len));
}
return;
}
const float new_point = step(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
float new_x1_val, new_x2_val;
HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
const float new_value = RastriginFunc(new_x1_val, new_x2_val);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_x1_val);
Extr.emplace_back(new_x2_val);
no_improve = 0;
}
else {
++no_improve;
}
std::pop_heap(R.begin(), R.end(), ComparePtr1D);
Interval1D const __restrict intermediate = R.back();
const float segment_x1 = intermediate->x1, segment_x2 = intermediate->x2;
const float len2 = segment_x2 - new_point, len1 = new_point - segment_x1;
Interval1D const __restrict curr = new Interval1D(segment_x1, new_point, intermediate->y1, new_value, 2.0f);
Interval1D const __restrict curr1 = new Interval1D(new_point, segment_x2, new_value, intermediate->y2, 2.0f);
const float currM = (std::max)(curr->M, curr1->M);
const size_t r_size = R.size();
if (mode) {
if (len2 + len1 == dmax) {
dmax = (std::max)(len1, len2);
for (auto pI : R) {
const float L = pI->x2 - pI->x1;
if (L > dmax) dmax = L;
}
}
if (threshold_03 > dmax && !(counter % 3) || 10.0f * dmax < initial_length) {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
}
const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
const float alpha = progress * progress;
const float betta = 2.0f - alpha;
const float MULTIPLIER = (1.0f / dmax) * Mmax;
const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
const float GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
RecomputeR_AffineM_AVX2_1D(R.data(), r_size, GLOBAL_FACTOR, alpha);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM < 1.15f * Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
RecomputeR_ConstM_AVX2_1D(R.data(), r_size, m);
std::make_heap(R.begin(), R.end(), ComparePtr1D);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.back() = curr;
std::push_heap(R.begin(), R.end(), ComparePtr1D);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr1D);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}

__declspec(align(16)) struct RunParams final {
int nSegments;
unsigned varLen;
float minTheta;
float tx, ty;
int levels;
int maxIter;
float r;
unsigned adaptive;
float eps;
unsigned seed;

text
template<typename Archive>
void serialize(Archive& ar, const unsigned int) {
	ar& nSegments& varLen& minTheta& tx& ty
		& levels& maxIter& r& adaptive& eps& seed;
}

};

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline
void AgpStartManipND(const int nSegments, const bool variableLengths, const float minTheta, const float targetX, const float targetY,
const int peanoLevels, const int maxIterPerBranch, const float r, const bool adaptiveMode, const float epsilon, const unsigned int seed) noexcept {
RunParams p;
p.nSegments = nSegments;
p.varLen = static_cast<unsigned>(variableLengths);
p.minTheta = minTheta;
p.tx = targetX;
p.ty = targetY;
p.levels = peanoLevels;
p.maxIter = maxIterPerBranch;
p.r = r;
p.adaptive = static_cast<unsigned>(adaptiveMode);
p.eps = epsilon;
p.seed = seed;
int i = 1;
const int world = g_world->size();
while (i < world) {
g_world->isend(i, 1, p);
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept {
RunParams p;
float* __restrict q;
size_t qlen;
float bx, by, bf;
size_t oi;
float oa;
while (true) {
if (g_world->iprobe(0, 1)) {
g_world->recv(0, 1, p);
AGP_Manip2D(p.nSegments, static_cast<bool>(p.varLen), p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, static_cast<bool>(p.adaptive), p.eps, p.seed, &q, &qlen, &bx, &by, &bf, &oi, &oa);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun2D() noexcept {
int dummy;
float* __restrict buf;
size_t len;
while (true) {
if (g_world->iprobe(0, 1)) {
g_world->recv(0, 1, dummy);
AGP_2D(2.0f, 10000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.00001f, static_cast<float>(GetTickCount()), &buf, &len);
}
Sleep(0);
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept {
int i = 1;
const int world = g_world->size();
while (i < world) {
g_world->isend(i, 1, 0);
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* const p) noexcept {
CoTaskMemFree(p);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void RunPythonOptimization(
const char* backend,
int n_seg, bool var_len,
float min_theta, float tx, float ty,
int levels, int max_iter,
float r_param, float eps, bool adaptive,
float* out_bestF, float* out_bestX, float* out_bestY,
float* out_angles, float* out_lengths,
int* out_iterations, float* out_eps, float* out_micros) {
pybind11::gil_scoped_acquire gil;
pybind11::dict result;

text
if (strcmp(backend, "optuna") == 0) {
	result = optimizer_bridge.attr("run_optuna")(
		n_seg, var_len, min_theta, tx, ty, max_iter);
}
else {
	result = optimizer_bridge.attr("run_iopt")(
		n_seg, var_len, min_theta, tx, ty,
		max_iter, r_param, eps, adaptive);
}

// Извлекаем результаты
*out_bestF = result["BEST_F"].cast<float>();
*out_bestX = result["BEST_X"].cast<float>();
*out_bestY = result["BEST_Y"].cast<float>();
*out_iterations = result["ITERATIONS"].cast<int>();
*out_eps = result["EPS"].cast<float>();
*out_micros = result["TIME"].cast<float>();

// Получаем векторы углов и длин
std::vector<float> angles_vec = result["ANGLES"].cast<std::vector<float>>();
std::vector<float> lengths_vec = result["LENGTHS"].cast<std::vector<float>>();

// Копируем данные в выходные массивы
memcpy(out_angles, angles_vec.data(), n_seg * sizeof(float));
if (var_len) {
	memcpy(out_lengths, lengths_vec.data(), n_seg * sizeof(float));
}

}" код пайтон скрипта: "import sys
import math
import time
import statistics
import warnings
from io import StringIO

import optuna
from optuna.samplers import CmaEsSampler
import iOpt

def forward_kinematics(angles, lengths):
phi = x = y = 0.0
for i in range(len(angles)):
phi += angles[i]
x += lengths[i] * math.cos(phi)
y += lengths[i] * math.sin(phi)
return x, y

def manipulator_cost(angles, lengths, target_x, target_y, min_theta):
x, y = forward_kinematics(angles, lengths)
dx, dy = x - target_x, y - target_y
dist = math.hypot(dx, dy)
scale = 2.0 / (min_theta + 1e-6)
pen_c = arch_pen = 0.0

text
for theta in angles:
    v = abs(theta) - min_theta
    if v > 0.0:
        pen_c += 0.05 * ((1 << int(scale * v)) if scale * v < 100 else 2.0 ** (scale * v) - 1.0)
    t = -theta * 3.0
    arch_pen += 0.02 * (t if t > 10.0 else math.log1p(math.exp(t)))

return dist + pen_c + arch_pen

def build_angles_lengths(trial, n_seg, var_len):
angles = [trial.suggest_float(f"theta_{i}",
-1.0471975511965977 if i == 0 else -2.6179938779914944,
2.6179938779914944) for i in range(n_seg)]
lengths = [trial.suggest_float(f"L_{i}", 0.5, 2.0) if var_len else 1.0 for i in range(n_seg)]
return angles, lengths

def run_optuna(n_seg, var_len, min_theta, tx, ty, max_iter):
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.ERROR)

text
sampler = CmaEsSampler(
    seed=42,
    n_startup_trials=(n_seg * (2 if var_len else 1) - 1) ** 2 + 3,
    consider_pruned_trials=False
)

def objective(trial):
    angles, lengths = build_angles_lengths(trial, n_seg, var_len)
    return manipulator_cost(angles, lengths, tx, ty, min_theta)

study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=optuna.pruners.NopPruner()
)

start_time = time.perf_counter()
study.optimize(objective, n_trials=max_iter, show_progress_bar=False, gc_after_trial=False)
elapsed_micros = (time.perf_counter() - start_time) * 1e6

best_trial = study.best_trial
params = best_trial.params

angles = [params[f"theta_{i}"] for i in range(n_seg)]
lengths = [params[f"L_{i}"] for i in range(n_seg)] if var_len else [1.0] * n_seg

best_x, best_y = forward_kinematics(angles, lengths)
iterations = len(study.trials)

recent_trials = iterations // 10
recent_values = [t.value for t in study.trials[-recent_trials:]]
achieved_eps = statistics.stdev(recent_values)

return {
    "BEST_F": float(best_trial.value),
    "BEST_X": float(best_x),
    "BEST_Y": float(best_y),
    "ITERATIONS": int(iterations),
    "EPS": float(achieved_eps),
    "TIME": float(elapsed_micros),
    "ANGLES": angles,
    "LENGTHS": lengths
}

def run_iopt(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps, adaptive):
old_stdout, old_stderr = sys.stdout, sys.stderr
sys.stdout, sys.stderr = StringIO(), StringIO()

text
def objective(trial):
    angles, lengths = build_angles_lengths(trial, n_seg, var_len)
    return manipulator_cost(angles, lengths, tx, ty, min_theta)

study = iOpt.create_study()
params = iOpt.SolverParameters(r=r_param, eps=eps, iters_limit=max_iter, refine_solution=adaptive)
study.optimize(objective=objective, solver_parameters=params, type_of_painter="none", console_mode="off")

sys.stdout, sys.stderr = old_stdout, old_stderr

best_params = study.best_float_params_()
angles = best_params[:n_seg]
lengths = best_params[n_seg:n_seg*2] if var_len else [1.0] * n_seg

best_f = study.best_values_()
best_x, best_y = forward_kinematics(angles, lengths)

solution = getattr(study, "solution", None)
achieved_eps = solution.solution_accuracy
iterations = solution.number_of_global_trials
elapsed_micros = solution.solving_time * 1e6

return {
    "BEST_F": float(best_f),
    "BEST_X": float(best_x),
    "BEST_Y": float(best_y),
    "ITERATIONS": int(iterations),
    "EPS": float(achieved_eps),
    "TIME": float(elapsed_micros),
    "ANGLES": angles,
    "LENGTHS": lengths
}

def main():
argv = sys.argv
backend, n_seg, var_len = argv[1], int(argv[2]), bool(int(argv[3]))
min_theta, tx, ty = float(argv[4]), float(argv[5]), float(argv[6])
max_iter = int(argv[8])

text
if backend == "optuna":
    run_optuna(n_seg, var_len, min_theta, tx, ty, max_iter)
else:
    r_param, eps, adaptive = float(argv[9]), float(argv[10]), bool(int(argv[11]))
    run_iopt(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps, adaptive)" ситуация странная - когда я вызывю оптуну то всё работает и я могу сколько угодно раз нажимать на кнопку оптимизировать но только для одномерной задачи когда я добавляю звенья - то уже не могу пользоваться оптуной, айопт не могу пользоваться вообще - выбрасывает ошибку, сами пайтон скрипты правильные, инициализация и работа с многопроцессорностью правильная - в целом здесь везде в коде всё верно - так что не ищи ошибки в основной логике - там их нет - есть только ошибка при попытке использовать оптимизацию через pybind через пайтон скрипт, метод для инициализации окружения тоже 100 процентов правильный - проверял - до этого я применял парсер через строковые параметры и всё работало для обоих фреймворков, так что ищи ошибку именно в том правильно ли используется pybind, потому что раньше было так: "import sys

import math
import time
import statistics
from io import StringIO

import optuna
import iOpt

def forward_kinematics(angles, lengths):
n = len(angles)
phi = 0.0
x = 0.0
y = 0.0
cos = math.cos
sin = math.sin
for i in range(n):
phi += angles[i]
L = lengths[i]
x += L * cos(phi)
y += L * sin(phi)
return x, y

def manipulator_cost(angles, lengths, target_x, target_y, min_theta):
x, y = forward_kinematics(angles, lengths)
dx = x - target_x
dy = y - target_y
sqrt = math.sqrt
pow_ = math.pow
log1p = math.log1p
exp = math.exp
abs_ = abs
dist = sqrt(dx * dx + dy * dy)
arch_bias_w = 0.02
arch_bias_k = 3.0
sharp_w = 0.05
scale = 2.0 / (min_theta + 1e-6)
pen_c = 0.0
arch_pen = 0.0
for theta in angles:
a = abs_(theta)
v = a - min_theta
if v > 0.0:
pen_c += sharp_w * (pow_(2.0, scale * v) - 1.0)
t = -theta * arch_bias_k
if t > 10.0:
sp = t
else:
sp = log1p(exp(t))
arch_pen += arch_bias_w * sp
return dist + pen_c + arch_pen

def build_angles_lengths(trial, n_seg, var_len):
theta0_min = -1.0471975511965977
theta0_max = 2.6179938779914944
theta_min = -2.6179938779914944
theta_max = 2.6179938779914944
suggest_float = trial.suggest_float
angles = []
if n_seg > 0:
angles.append(suggest_float("theta_0", theta0_min, theta0_max))
for i in range(1, n_seg):
name = "theta_" + str(i)
angles.append(suggest_float(name, theta_min, theta_max))
if var_len:
lengths = []
for i in range(n_seg):
name = "L_" + str(i)
lengths.append(suggest_float(name, 0.5, 2.0))
else:
lengths = [1.0] * n_seg
return angles, lengths

def run_optuna(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps):
optuna.logging.set_verbosity(optuna.logging.ERROR)
build = build_angles_lengths
cost = manipulator_cost

text
def objective(trial):
    angles, lengths = build(trial, n_seg, var_len)
    return cost(angles, lengths, tx, ty, min_theta)

study = optuna.create_study(direction="minimize")
start_time = time.perf_counter()
study.optimize(objective, n_trials=max_iter, show_progress_bar=False)
elapsed_micros = (time.perf_counter() - start_time) * 1e6
best_trial = study.best_trial
best_f = best_trial.value
params = best_trial.params
angles = [params["theta_" + str(i)] for i in range(n_seg)]
if var_len:
    lengths = [params["L_" + str(i)] for i in range(n_seg)]
else:
    lengths = [1.0] * n_seg
best_x, best_y = forward_kinematics(angles, lengths)
trials = study.trials
iterations = len(trials)
recent_trials = max(1, iterations // 10)
recent_values = [t.value for t in trials[-recent_trials:] if t.value is not None]
achieved_eps = statistics.stdev(recent_values)
q_values = angles + lengths
print("BEST_F", best_f)
print("BEST_X", best_x)
print("BEST_Y", best_y)
print("ITERATIONS", iterations)
print("EPS", achieved_eps)
print("TIME", int(elapsed_micros))
print("Q", " ".join(str(q) for q in q_values))

def run_iopt(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps, adaptive):
old_stdout = sys.stdout
old_stderr = sys.stderr
sys.stdout = StringIO()
sys.stderr = StringIO()
build = build_angles_lengths
cost = manipulator_cost

text
def objective(trial):
    angles, lengths = build(trial, n_seg, var_len)
    return cost(angles, lengths, tx, ty, min_theta)

study = iOpt.create_study()
params = iOpt.SolverParameters(r=r_param, eps=eps, iters_limit=max_iter, refine_solution=adaptive)
study.optimize(objective=objective, solver_parameters=params, type_of_painter="none", console_mode="off")
sys.stdout = old_stdout
sys.stderr = old_stderr
best_params = study.best_float_params_()
angles = best_params[:n_seg]
if var_len:
    lengths = best_params[n_seg:n_seg * 2]
else:
    lengths = [1.0] * n_seg
best_f = study.best_values_()
best_x, best_y = forward_kinematics(angles, lengths)
solution = getattr(study, "solution", None)
achieved_eps = solution.solution_accuracy
iterations = solution.number_of_global_trials
elapsed_micros = solution.solving_time * 1e6
q_values = angles + lengths
print("BEST_F", best_f)
print("BEST_X", best_x)
print("BEST_Y", best_y)
print("ITERATIONS", iterations)
print("EPS", achieved_eps)
print("TIME", int(elapsed_micros))
print("Q", " ".join(str(q) for q in q_values))

def main():
argv = sys.argv
backend = argv[1]
n_seg = int(argv[2])
var_len = bool(int(argv[3]))
min_theta = float(argv[4])
tx = float(argv[5])
ty = float(argv[6])
_levels = int(argv[7])
max_iter = int(argv[8])
r_param = float(argv[9])
eps = float(argv[10])
adaptive = bool(int(argv[11]))
if backend == "optuna":
run_optuna(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps)
elif backend == "iopt":
run_iopt(n_seg, var_len, min_theta, tx, ty, max_iter, r_param, eps, adaptive)

if name == "main":
main()" - в скрипте, и " [MethodImpl(MethodImplOptions::AggressiveInlining)]
bool ParseFloat(String^ s, float% out) {
return Single::TryParse(
s,
NumberStyles::Float,
CultureInfo::InvariantCulture,
out);
}

text
	void RunPythonBackend(String^ which) {
		const int nSeg = nSegments;
		const bool varLen = cbVarLen->Checked;
		const float minTheta = static_cast<float>(nudMinTheta->Value);
		const float tx = static_cast<float>(nudTargetX->Value);
		const float ty = static_cast<float>(nudTargetY->Value);
		const int levels = static_cast<int>(nudLevels->Value);
		const int maxIter = static_cast<int>(nudMaxIter->Value);
		const float r_param = static_cast<float>(nudR->Value);
		const float eps = static_cast<float>(nudEps->Value);
		const bool adaptive = cbAdaptive->Checked;

		String^ pythonExe = System::IO::Path::Combine(
			Application::StartupPath,
			"env",
			"Scripts",
			"python.exe");

		String^ scriptPath = System::IO::Path::Combine(
			Application::StartupPath,
			"optimizer_bridge.py");

		ProcessStartInfo^ psi = gcnew ProcessStartInfo();
		psi->FileName = pythonExe;

		psi->Arguments = String::Format(
			CultureInfo::InvariantCulture,
			"\"{0}\" {1} {2} {3} {4} {5} {6} {7} {8} {9} {10} {11}",
			scriptPath,
			which,
			nSeg,
			varLen ? 1 : 0,
			minTheta,
			tx,
			ty,
			levels,
			maxIter,
			r_param,
			eps,
			adaptive ? 1 : 0);

		psi->UseShellExecute = false;
		psi->CreateNoWindow = true;
		psi->RedirectStandardOutput = true;
		psi->RedirectStandardError = false;
		psi->WorkingDirectory = Application::StartupPath;

		Process^ proc = Process::Start(psi);
		String^ stdoutAll = proc->StandardOutput->ReadToEnd();
		proc->WaitForExit();

		float bestF = 0.0f;
		float bestX = 0.0f;
		float bestY = 0.0f;
		float achievedEps = 0.0f;
		int iterations = 0;
		float micros = 0.0f;

		array<float>^ tempAngles = gcnew array<float>(nSeg);
		array<float>^ tempLengths = gcnew array<float>(nSeg);
		const float baseLen = static_cast<float>(nudBaseLength->Value);

		for (int i = 0; i < nSeg; ++i) {
			tempAngles[i] = 0.0f;
			tempLengths[i] = baseLen;
		}

		array<wchar_t>^ separators = gcnew array<wchar_t>{ '\r', '\n' };
		array<String^>^ lines = stdoutAll->Split(
			separators,
			StringSplitOptions::RemoveEmptyEntries);

		int qIndex = 0;

		for each(String ^ line in lines) {
			array<wchar_t>^ partSeparators = gcnew array<wchar_t>{ ' ', '\t' };
			array<String^>^ parts = line->Split(
				partSeparators,
				StringSplitOptions::RemoveEmptyEntries);

			if (parts->Length == 0) {
				continue;
			}

			String^ key = parts[0];

			if (key == "Q") {
				for (int i = 1; i < parts->Length && qIndex < (varLen ? 2 * nSeg : nSeg); ++i) {
					float qVal;
					if (ParseFloat(parts[i], qVal)) {
						if (qIndex < nSeg) {
							tempAngles[qIndex] = qVal;
						}
						else if (varLen && qIndex < 2 * nSeg) {
							tempLengths[qIndex - nSeg] = qVal;
						}
						++qIndex;
					}
				}
			}
			else if (parts->Length > 1) {
				String^ value = parts[1];

				if (key == "BEST_F") {
					ParseFloat(value, bestF);
				}
				else if (key == "BEST_X") {
					ParseFloat(value, bestX);
				}
				else if (key == "BEST_Y") {
					ParseFloat(value, bestY);
				}
				else if (key == "ITERATIONS") {
					Int32::TryParse(value, iterations);
				}
				else if (key == "EPS") {
					ParseFloat(value, achievedEps);
				}
				else if (key == "TIME") {
					ParseFloat(value, micros);
				}
			}
		}

		angles->Clear();
		lengths->Clear();

		for (int i = 0; i < nSeg; ++i) {
			angles->Add(tempAngles[i]);
			lengths->Add(tempLengths[i]);
		}

		const float dx = bestX - tx;
		const float dy = bestY - ty;
		const double distance = Math::Sqrt(dx * dx + dy * dy);

		lblInfo->Text = String::Format(
			L"Результат:\n"
			L"Близость захвата: {0:F5}\n"
			L"Функционал: {1:F5}\n"
			L"Точка: ({2:F3}, {3:F3})\n"
			L"Время: {4:F0} мкс\n"
			L"Число шагов: {5}\n"
			L"Достигнутая точность: {6:E3}",
			distance,
			bestF,
			bestX,
			bestY,
			micros,
			iterations,
			achievedEps);

		this->Invalidate();
		this->Refresh();
	}" - в соответсвующем методе в интерфейсе и это всё работало без ошибок - так что детально изучи вопрос как должен применяться pybind в этом контексте и исправь ошибки связанные с pybind, deepseek посоветовал мне убрать всё что связано с pybind из глобальной области видимости и из метода AgpInit и переписать метод так: "extern "C" __declspec(dllexport) __declspec(noalias)

void RunPythonOptimization(
const char* backend,
int n_seg, bool var_len,
float min_theta, float tx, float ty,
int levels, int max_iter,
float r_param, float eps, bool adaptive,
float* out_bestF, float* out_bestX, float* out_bestY,
float* out_angles, float* out_lengths,
int* out_iterations, float* out_eps, float* out_micros)
{
// 1. Проверка указателей
if (!out_bestF || !out_bestX || !out_bestY || !out_angles ||
!out_iterations || !out_eps || !out_micros) {
// Устанавливаем значения по умолчанию при ошибке
*out_bestF = std::numeric_limits<float>::max();
*out_iterations = 0;
*out_eps = 0.0f;
*out_micros = 0.0f;
return;
}

text
if (var_len && !out_lengths) {
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = 0;
    return;
}

// 2. Ленивая инициализация Python интерпретатора (только один раз)
static std::once_flag interpreter_init_flag;
static bool interpreter_initialized = false;
static std::string last_error;

std::call_once(interpreter_init_flag, []() {
    try {
        // Инициализируем Python интерпретатор
        static pybind11::scoped_interpreter guard{};
        
        // Добавляем текущую директорию в Python path
        pybind11::module_ sys = pybind11::module_::import("sys");
        sys.attr("path").attr("append")(".");
        
        interpreter_initialized = true;
        last_error.clear();
    }
    catch (const pybind11::error_already_set& e) {
        last_error = std::string("Python interpreter init failed: ") + e.what();
        interpreter_initialized = false;
    }
    catch (const std::exception& e) {
        last_error = std::string("C++ exception during init: ") + e.what();
        interpreter_initialized = false;
    }
});

if (!interpreter_initialized && !last_error.empty()) {
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = -1; // Код ошибки инициализации
    return;
}

// 3. Ленивая загрузка модуля Python (только один раз)
static std::once_flag module_init_flag;
static pybind11::module_ optimizer_bridge;
static bool module_loaded = false;
static std::string module_error;

std::call_once(module_init_flag, []() {
    try {
        pybind11::gil_scoped_acquire gil; // Захватываем GIL для импорта
        optimizer_bridge = pybind11::module_::import("optimizer_bridge");
        module_loaded = true;
        module_error.clear();
    }
    catch (const pybind11::error_already_set& e) {
        module_error = std::string("Module import failed: ") + e.what();
        module_loaded = false;
    }
});

if (!module_loaded && !module_error.empty()) {
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = -2; // Код ошибки загрузки модуля
    return;
}

try {
    // 4. Захватываем GIL для выполнения Python кода
    pybind11::gil_scoped_acquire gil;
    
    pybind11::dict result;
    
    // 5. Вызываем соответствующую функцию Python
    if (strcmp(backend, "optuna") == 0) {
        result = optimizer_bridge.attr("run_optuna")(
            n_seg, var_len, min_theta, tx, ty, max_iter);
    }
    else if (strcmp(backend, "iopt") == 0) {
        result = optimizer_bridge.attr("run_iopt")(
            n_seg, var_len, min_theta, tx, ty,
            max_iter, r_param, eps, adaptive);
    }
    else {
        throw std::runtime_error("Unknown backend: " + std::string(backend));
    }

    // 6. Извлекаем результаты с проверкой типов
    *out_bestF = result["BEST_F"].cast<float>();
    *out_bestX = result["BEST_X"].cast<float>();
    *out_bestY = result["BEST_Y"].cast<float>();
    *out_iterations = result["ITERATIONS"].cast<int>();
    *out_eps = result["EPS"].cast<float>();
    *out_micros = result["TIME"].cast<float>();

    // 7. Получаем векторы углов и длин
    std::vector<float> angles_vec;
    std::vector<float> lengths_vec;
    
    try {
        angles_vec = result["ANGLES"].cast<std::vector<float>>();
        lengths_vec = result["LENGTHS"].cast<std::vector<float>>();
    }
    catch (const pybind11::cast_error& e) {
        // Альтернативный способ извлечения данных для совместимости
        pybind11::list angles_list = result["ANGLES"].cast<pybind11::list>();
        pybind11::list lengths_list = result["LENGTHS"].cast<pybind11::list>();
        
        angles_vec.reserve(angles_list.size());
        lengths_vec.reserve(lengths_list.size());
        
        for (auto& item : angles_list) {
            angles_vec.push_back(item.cast<float>());
        }
        for (auto& item : lengths_list) {
            lengths_vec.push_back(item.cast<float>());
        }
    }

    // 8. Проверяем размеры данных
    if (angles_vec.size() != static_cast<size_t>(n_seg)) {
        throw std::runtime_error("Angles vector size mismatch");
    }
    
    if (var_len && lengths_vec.size() != static_cast<size_t>(n_seg)) {
        throw std::runtime_error("Lengths vector size mismatch");
    }

    // 9. Копируем данные в выходные массивы
    std::copy(angles_vec.begin(), angles_vec.end(), out_angles);
    
    if (var_len) {
        std::copy(lengths_vec.begin(), lengths_vec.end(), out_lengths);
    }
    else {
        // Если длины не переменные, заполняем значением по умолчанию
        std::fill(out_lengths, out_lengths + n_seg, 1.0f);
    }
}
catch (const pybind11::error_already_set& e) {
    // Обработка ошибок Python
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = -3; // Код ошибки выполнения Python
    *out_eps = 0.0f;
    *out_micros = 0.0f;
    
    // Можно логировать ошибку, но не в продакшене
    #ifdef _DEBUG
    std::cerr << "Python error in RunPythonOptimization: " << e.what() << std::endl;
    #endif
}
catch (const std::exception& e) {
    // Обработка стандартных ошибок C++
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = -4; // Код ошибки C++
    *out_eps = 0.0f;
    *out_micros = 0.0f;
    
    #ifdef _DEBUG
    std::cerr << "C++ error in RunPythonOptimization: " << e.what() << std::endl;
    #endif
}
catch (...) {
    // Обработка неизвестных ошибок
    *out_bestF = std::numeric_limits<float>::max();
    *out_iterations = -5; // Код неизвестной ошибки
    *out_eps = 0.0f;
    *out_micros = 0.0f;
}

}" - но это не помогло - теперь ошибки или исключения не выбрасывается но вижу что манипулятор просто каждый раз становится горизонтально что свидетельствует о том что на самом деле оптимизации не происходит а в окне вывода результатов вижу очень большое число функционала, запускаю приложение через mpiexec, если буду просто запускать приложение через отладчик то даже в одномерных задачах увижу такую ошибку что манипулятор становится горизонтально, при запуске через mpiexec - одномерная задача корректно решается optuna, но двумерная и больше уже не решается, а iOpt не решает даже одномерной как не запускай, причём сами скрипты точно верны - тут дело в другом

Answer 1

Коротко: у тебя ломается не оптимизация как таковая, а связка C++/CLI ⇄ DLL ⇄ pybind11 ⇄ Python.
Скрипты действительно могут быть идеальны, но они либо вообще не вызываются, либо падают внутри, а C++-слой это съедает и возвращает мусор (нули в углах → горизонтальный манипулятор, огромный BEST_F).

Ниже разберу, где именно pybind сейчас “хрупок”, и дам вариант нормального, устойчивого RunPythonOptimization, заточенный под твой сценарий: C++/CLI WinForms + MPI + DLL.

1. Основные проблемы текущего использования pybind

1.1. Инициализация Python в `AgpInit` + глобальные объекты

В исходной версии у тебя было:

cpp
static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;
static const pybind11::scoped_interpreter* guard;
static pybind11::module_ sys;
static pybind11::object path;
static pybind11::module_ optimizer_bridge;

int AgpInit(...) {
    g_env = new boost::mpi::environment();
    g_world = new boost::mpi::communicator();
    ...
    guard = new pybind11::scoped_interpreter();
    sys = pybind11::module::import("sys");
    path = sys.attr("path");
    path.attr("append")(".");
    optimizer_bridge = pybind11::module::import("optimizer_bridge");
    ...
}

Проблемы:

Статические pybind-объекты, живущие дольше интерпретатора.
Когда scoped_interpreter уничтожится (или при выгрузке DLL/процесса), Python уже финализирован, а module_, object и т.п. ещё имеют C++-деструкторы → UB / падения / странное поведение.
Инициализация Python “сверху” (в конструкторе среды MPI) и вызовы потом из WinForms-кода.
pybind11 официально рекомендует инициализировать интерпретатор лениво, на первом реальном использовании, и не держать глобальных py::object без крайней необходимости.
sys.path.append(".")
Ты раньше вызывал скрипт так:
```
cpp
scriptPath = Path::Combine(Application::StartupPath, "optimizer_bridge.py");
```
Т.е. Python-процесс гарантированно видел нужный файл.
В embedded-варианте ты добавляешь в sys.path просто ".", т.е. текущую рабочую директорию процесса, которая:
- при запуске из Visual Studio (F5) часто не совпадает с директорией exe;
- при запуске через mpiexec вполне может быть другой.
В результате:
– в одном сценарии optimizer_bridge находится (например, когда mpiexec запускается из каталога с exe);
– в другом — нет, и import optimizer_bridge бросает ImportError, который ты либо не ловишь, либо глушишь.

Это идеально объясняет, почему:
- в отладчике даже 1D не работает,
- а через mpiexec простейший случай ещё как-то живёт.

1.2. Новая версия с `std::once_flag`, но без диагностики

Код deepseek’а:

делает ленивую инициализацию интерпретатора и модуля через std::call_once;
при любой ошибке просто пишет out_bestF = FLT_MAX; out_iterations = -N; и молча возвращает.

С точки зрения GUI:

массивы out_angles / out_lengths остаются с нулями / базовыми длинами;
BEST_F становится огромным → ты видишь “перегнутый функционал”;
манипулятор горизонтальный (все углы ~0).

То есть Python вообще не отработал, а ты видишь только следствие.

2. Что ещё может ломать Optuna/iOpt именно через pybind

Помимо пути к модулю, есть ещё несколько характерных моментов:

Разный Python / разные окружения.
В старой схеме ты запускал конкретный env\Scripts\python.exe.
В embedded-схеме pybind по умолчанию поднимает “системный” Python, который может:
- не иметь нужных версий optuna / iOpt,
- иметь другие зависимости,
- под Windows вообще быть иной разрядности или конфигурации.
Многократные вызовы в одном интерпретаторе.
Внешний скрипт выполнялся каждый раз в чистом процессе.
В embedded-варианте весь state Optuna/iOpt живёт в одном процессе навсегда.
Это обычно нормально, но если библиотека где-то хранит глобальное состояние (особенно iOpt), некорректный порядок инициализации вполне может приводить к падениям/исключениям на второй и далее оптимизациях.
MPI + Python.
У тебя MPI-мир создаётся для всех рангов, но Python нужен только на ранге 0 (где WinForms). Если кто-то случайно вызовет RunPythonOptimization на ранге > 0, будет весёлый зоопарк (другой CWD, другие переменные окружения, отсутствие GUI и т.д.).

3. Как правильно организовать pybind в твоём контексте

Ниже — схема, которая реально дружит с:

C++/CLI WinForms,
DLL,
MPI,
и многократными вызовами Python.

3.1. Общая идея

Никакого pybind в AgpInit и глобальных статических переменных.
AgpInit занимается только MPI и вашей AGP-частью.
Интерпретатор и модуль инициализируются лениво внутри RunPythonOptimization через std::once_flag.
В sys.path добавляется не ".", а реальный путь к exe или к каталогу с optimizer_bridge.py.
Под Windows это можно получить через GetModuleFileNameW.
Все Python-ошибки пишутся в лог + код ошибки в out_iterations, чтобы ты видел, где именно всё разваливается:
- −1: не удалось инициализировать интерпретатор;
- −2: не удалось импортировать модуль;
- −3: ошибка во время выполнения Python;
- −4: C++ исключение;
- −5: неизвестная ошибка.
На не-нулевых рангах MPI (если вдруг туда дойдёт вызов) — сразу возвращать код ошибки/NO-OP.

4. Пример нормального `RunPythonOptimization`

Это пример полного варианта, который можно просто вставить в твой .cpp с DLL вместо нынешней реализации RunPythonOptimization.
Я буду использовать pybind11 под псевдонимом py для краткости.

cpp
#include <pybind11/embed.h>
#include <pybind11/stl.h>
#include <mutex>
#include <limits>
#include <string>
#include <fstream>
#include <windows.h>

namespace py = pybind11;

// Если нужен доступ к MPI рангу:
extern const boost::mpi::communicator* g_world;

// Утилита для логирования (чтоб хоть что-то видеть)
static void log_python_error(const std::string& msg) noexcept {
#ifdef _DEBUG
    try {
        std::ofstream f("python_bridge.log", std::ios::app);
        f << msg << "\n";
    } catch (...) {
        // ничего
    }
#endif
}

// Получаем каталог с EXE (или с DLL при желании)
static std::string get_exe_dir() {
    wchar_t buf[MAX_PATH]{};
    DWORD len = GetModuleFileNameW(nullptr, buf, MAX_PATH);
    if (!len || len == MAX_PATH) return ".";
    std::wstring ws(buf, len);
    auto pos = ws.find_last_of(L"\\/");
    if (pos != std::wstring::npos) ws.resize(pos);
    // перекодируем в utf-8
    int n = WideCharToMultiByte(CP_UTF8, 0, ws.c_str(), -1, nullptr, 0, nullptr, nullptr);
    std::string s(n, '\0');
    WideCharToMultiByte(CP_UTF8, 0, ws.c_str(), -1, s.data(), n, nullptr, nullptr);
    if (!s.empty() && s.back() == '\0') s.pop_back();
    return s;
}

extern "C" __declspec(dllexport) __declspec(noalias)
void RunPythonOptimization(
    const char* backend,
    int n_seg, bool var_len,
    float min_theta, float tx, float ty,
    int /*levels*/, int max_iter,
    float r_param, float eps, bool adaptive,
    float* out_bestF, float* out_bestX, float* out_bestY,
    float* out_angles, float* out_lengths,
    int* out_iterations, float* out_eps, float* out_micros)
{
    // --- базовая валидация указателей ---
    if (!out_bestF || !out_bestX || !out_bestY ||
        !out_angles || !out_iterations || !out_eps || !out_micros)
    {
        if (out_bestF) *out_bestF = std::numeric_limits<float>::max();
        if (out_iterations) *out_iterations = -100; // явный код "плохие указатели"
        return;
    }
    if (var_len && !out_lengths) {
        *out_bestF = std::numeric_limits<float>::max();
        *out_iterations = -101;
        return;
    }

    // --- если вдруг вызвали не с ранга 0, просто ничего не делаем ---
    if (g_world && g_world->rank() != 0) {
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -200; // вызов не с rank 0
        *out_eps = 0.0f;
        *out_micros = 0.0f;
        // углы оставляем как есть
        return;
    }

    // --- ленивый запуск интерпретатора ---
    static std::once_flag py_init_flag;
    static bool py_ok = false;
    static std::string py_init_err;

    std::call_once(py_init_flag, []() {
        try {
            // static -> живёт до конца процесса
            static py::scoped_interpreter guard{};

            // Добавляем каталог exe (где лежит optimizer_bridge.py)
            std::string exe_dir = get_exe_dir();
            py::module_ sys = py::module_::import("sys");
            py::list path = sys.attr("path");
            path.append(py::str(exe_dir));

            py_ok = true;
            py_init_err.clear();
            log_python_error("Python interpreter initialized, sys.path += " + exe_dir);
        } catch (const py::error_already_set& e) {
            py_ok = false;
            py_init_err = e.what();
            log_python_error(std::string("Python init failed: ") + e.what());
        } catch (const std::exception& e) {
            py_ok = false;
            py_init_err = e.what();
            log_python_error(std::string("C++ exception during Python init: ") + e.what());
        } catch (...) {
            py_ok = false;
            py_init_err = "Unknown error during Python init";
            log_python_error(py_init_err);
        }
    });

    if (!py_ok) {
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -1; // ошибка инициализации интерпретатора
        *out_eps = 0.0f;
        *out_micros = 0.0f;
        log_python_error("RunPythonOptimization: interpreter not ok: " + py_init_err);
        return;
    }

    // --- ленивый импорт optimizer_bridge ---
    static std::once_flag module_flag;
    static py::module_ optimizer_bridge;
    static bool module_ok = false;
    static std::string module_err;

    std::call_once(module_flag, []() {
        try {
            py::gil_scoped_acquire gil;
            optimizer_bridge = py::module_::import("optimizer_bridge");
            module_ok = true;
            module_err.clear();
            log_python_error("optimizer_bridge imported successfully");
        } catch (const py::error_already_set& e) {
            module_ok = false;
            module_err = e.what();
            log_python_error(std::string("Import optimizer_bridge failed: ") + e.what());
        } catch (const std::exception& e) {
            module_ok = false;
            module_err = e.what();
            log_python_error(std::string("C++ exception during module import: ") + e.what());
        } catch (...) {
            module_ok = false;
            module_err = "Unknown error during module import";
            log_python_error(module_err);
        }
    });

    if (!module_ok) {
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -2; // модуль не импортирован
        *out_eps = 0.0f;
        *out_micros = 0.0f;
        log_python_error("RunPythonOptimization: module not ok: " + module_err);
        return;
    }

    // --- основной вызов Python ---
    try {
        py::gil_scoped_acquire gil;

        py::dict result;
        std::string bstr(backend ? backend : "");

        if (bstr == "optuna") {
            result = optimizer_bridge.attr("run_optuna")(
                n_seg, var_len, min_theta, tx, ty, max_iter
            );
        } else if (bstr == "iopt") {
            result = optimizer_bridge.attr("run_iopt")(
                n_seg, var_len, min_theta, tx, ty,
                max_iter, r_param, eps, adaptive
            );
        } else {
            throw std::runtime_error("Unknown backend: " + bstr);
        }

        // --- извлечение результатов ---
        *out_bestF = result["BEST_F"].cast<float>();
        *out_bestX = result["BEST_X"].cast<float>();
        *out_bestY = result["BEST_Y"].cast<float>();
        *out_iterations = result["ITERATIONS"].cast<int>();
        *out_eps = result["EPS"].cast<float>();
        *out_micros = result["TIME"].cast<float>();

        // --- углы и длины ---
        std::vector<float> angles_vec;
        std::vector<float> lengths_vec;

        try {
            angles_vec = result["ANGLES"].cast<std::vector<float>>();
            lengths_vec = result["LENGTHS"].cast<std::vector<float>>();
        } catch (const py::cast_error&) {
            // fallback через py::list
            py::list a_list = result["ANGLES"].cast<py::list>();
            py::list l_list = result["LENGTHS"].cast<py::list>();
            angles_vec.reserve(a_list.size());
            lengths_vec.reserve(l_list.size());
            for (auto& v : a_list) angles_vec.push_back(v.cast<float>());
            for (auto& v : l_list) lengths_vec.push_back(v.cast<float>());
        }

        if (angles_vec.size() != static_cast<size_t>(n_seg)) {
            throw std::runtime_error("ANGLES size mismatch: got " +
                std::to_string(angles_vec.size()) + ", expected " + std::to_string(n_seg));
        }
        if (var_len && lengths_vec.size() != static_cast<size_t>(n_seg)) {
            throw std::runtime_error("LENGTHS size mismatch: got " +
                std::to_string(lengths_vec.size()) + ", expected " + std::to_string(n_seg));
        }

        std::copy(angles_vec.begin(), angles_vec.end(), out_angles);

        if (var_len) {
            std::copy(lengths_vec.begin(), lengths_vec.end(), out_lengths);
        } else if (out_lengths) {
            // если длины фиксированные, можно либо оставить, как есть,
            // либо задать 1.0f — выбирай, что тебе логичнее
            std::fill(out_lengths, out_lengths + n_seg, 1.0f);
        }
    }
    catch (const py::error_already_set& e) {
        log_python_error(std::string("Python error in RunPythonOptimization: ") + e.what());
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -3; // ошибка выполнения Python
        *out_eps = 0.0f;
        *out_micros = 0.0f;
    }
    catch (const std::exception& e) {
        log_python_error(std::string("C++ exception in RunPythonOptimization: ") + e.what());
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -4; // ошибка C++
        *out_eps = 0.0f;
        *out_micros = 0.0f;
    }
    catch (...) {
        log_python_error("Unknown exception in RunPythonOptimization");
        *out_bestF = std::numeric_limits<float>::max();
        *out_bestX = 0.0f;
        *out_bestY = 0.0f;
        *out_iterations = -5; // неизвестная ошибка
        *out_eps = 0.0f;
        *out_micros = 0.0f;
    }
}

4.1. Что важно на стороне C++/CLI (`MyForm::RunPythonBackend`)

Твой текущий код в форме в целом нормальный:

cpp
array<float>^ tempAngles = gcnew array<float>(nSeg);
array<float>^ tempLengths = gcnew array<float>(nSeg);
pin_ptr<float> pinnedAngles  = &tempAngles[0];
pin_ptr<float> pinnedLengths = &tempLengths[0];

float bestF = 0.0f, bestX = 0.0f, bestY = 0.0f;
float achievedEps = 0.0f, micros = 0.0f;
int iterations = 0;

pRunPythonOptimization(
    backendStr.c_str(),
    nSeg, varLen, minTheta, tx, ty,
    levels, maxIter, r_param, eps, adaptive,
    &bestF, &bestX, &bestY,
    pinnedAngles, pinnedLengths,
    &iterations, &achievedEps, &micros
);

Что можно добавить для диагностики:

cpp
if (iterations < 0) {
    // Это наш код ошибки из C++ части
    String^ msg = String::Format("Python backend error, code = {0}", iterations);
    lblInfo->Text = msg;
    this->Invalidate();
    this->Refresh();
    return;
}

Тогда ты сразу будешь видеть, на каком этапе всё разваливается:

−1 / −2 → проблемы с инициализацией интерпретатора/модуля (чаще всего путь к скрипту, окружение Python);
−3 → реальная ошибка внутри run_optuna / run_iopt (включая размер векторов и т.п.);
−4 / −5 → что-то с C++.

5. Почему это должно решить именно твои симптомы

Горизонтальный манипулятор
Сейчас это следствие того, что tempAngles заполняются нулями (или не заполняются вообще), а RunPythonOptimization не даёт валидных значений. В новой схеме:
- при любой проблеме ты увидишь iterations < 0,
- в python_bridge.log — текст ошибки Python/pybind,
- можно адресно чинить уже конкретную причину.
Разница между запуском через mpiexec и через отладчик
Это, скорее всего, разный working directory → модуль не находится при одном запуске и находится при другом.
Мы теперь добавляем в sys.path каталог exe, аналогично тому, как раньше ты строил путь к optimizer_bridge.py для внешнего Python — так что поведение станет одинаковым в обоих сценариях.
Optuna работает только для 1D
Тут, по ощущениям, у тебя сейчас происходят Python-исключения при больших размерностях (например, из-за n_startup_trials, из-за отсутствия памяти, из-за чего угодно). Они глотаются, и результат не доходит до C++.

В новой версии все такие ошибки:
- попадут в python_bridge.log,
- выставят iterations = -3,
- не будут тихо маскироваться.
iOpt “не работает вообще”
Аналогично: мы чётко увидим, что именно не устраивает iOpt (не тот Python, нет модуля, неверные параметры SolverParameters и т.п.), вместо немой горизонтальной палки.

Если хочешь, можем следующим шагом разобрать уже конкретное содержимое python_bridge.log / коды ошибок iterations и довести до состояния, когда оба backend’а стабильно работают и для многозвенных манипуляторов.

я пробую вызывать пайтоновский скрипт из управляем...

Pergunta

Pensamento

Reviewing global variable usage and Python integration

Troubleshooting Python embedding with mpi and optimization

Focusing on pybind issues in MPI with Python embedding

Analyzing pipeline and mpi setup with pybind

Understanding how Python embedding works in MPI setup

Analyzing Python bug with MPI, multiple segments, and iOpt

Clarifying bool compatibility across C# and native

Investigating issues with multi-segment Python integration

Investigating MPI and Python integration issues

Investigating Python environment path issues in optimization

Exploring threading, interpreter, and environment issues

Considering Python bridging and multi-process interaction issues

Investigating runtime issues, MPI, and Python integration

Exploring MPI, pybind, and static globals issues

Discussing Python interpreter initialization and MPI issues

Investigating Python import issues related to working directory

Resposta

1. Основные проблемы текущего использования pybind

1.1. Инициализация Python в `AgpInit` + глобальные объекты

1.2. Новая версия с `std::once_flag`, но без диагностики

2. Что ещё может ломать Optuna/iOpt именно через pybind

3. Как правильно организовать pybind в твоём контексте

3.1. Общая идея

4. Пример нормального `RunPythonOptimization`

4.1. Что важно на стороне C++/CLI (`MyForm::RunPythonBackend`)

5. Почему это должно решить именно твои симптомы

Compartilhar esta Q&A

Pergunta

Pensamento

Reviewing global variable usage and Python integration

Troubleshooting Python embedding with mpi and optimization

Focusing on pybind issues in MPI with Python embedding

Analyzing pipeline and mpi setup with pybind

Understanding how Python embedding works in MPI setup

Analyzing Python bug with MPI, multiple segments, and iOpt

Clarifying bool compatibility across C# and native

Investigating issues with multi-segment Python integration

Investigating MPI and Python integration issues

Investigating Python environment path issues in optimization

Exploring threading, interpreter, and environment issues

Considering Python bridging and multi-process interaction issues

Investigating runtime issues, MPI, and Python integration

Exploring MPI, pybind, and static globals issues

Discussing Python interpreter initialization and MPI issues

Investigating Python import issues related to working directory

Resposta

1. Основные проблемы текущего использования pybind

1.1. Инициализация Python в AgpInit + глобальные объекты

1.2. Новая версия с std::once_flag, но без диагностики

2. Что ещё может ломать Optuna/iOpt именно через pybind

3. Как правильно организовать pybind в твоём контексте

3.1. Общая идея

4. Пример нормального RunPythonOptimization

4.1. Что важно на стороне C++/CLI (MyForm::RunPythonBackend)

5. Почему это должно решить именно твои симптомы

Compartilhar esta Q&A

1.1. Инициализация Python в `AgpInit` + глобальные объекты

1.2. Новая версия с `std::once_flag`, но без диагностики

4. Пример нормального `RunPythonOptimization`

4.1. Что важно на стороне C++/CLI (`MyForm::RunPythonBackend`)