#include "pch.h" #define XOR_RAND(state, result_v...

Question

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = (state);
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
if (r > 3.14159265359f) {
r = 6.28318530718f - r;
}
if (r < 1.57079632679f) {
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
} else {
r = 3.14159265359f - r;
const float t2 = r * r;
const float t4 = t2 * t2;
result_var = -fmaf(t4, fmaf(t2, -0.0013888889f, 0.0416666667f), fmaf(t2, -0.5f, 1.0f));
}
} while (0)

#define FABE13_SIN(x, result_var)
do {
const float x = (x);
const float ax = fabsf(x);
float r = fmodf(ax, 6.28318530718f);
bool sfl = r > 3.14159265359f;
if (sfl) r = 6.28318530718f - r;
bool cfl = r > 1.57079632679f;
if (cfl) r = 3.14159265359f - r;
const float t2 = r * r;
float _s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
result_var = ((x < 0.0f) ^ sfl) ? -_s : _s;
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
const int limit = (n) & ~7;
if ((n) >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f);
static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
const __m256 vx = _mm256_load_ps(&(in)[i]);
const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
const __m256 t2 = _mm256_mul_ps(r2, r2);
const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0);
const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2);
const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
_mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip));
_mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip));
i += 8;
}
}
while (i < (n)) {
const float x = (in)[i];
const float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
const bool sflip = r > 3.14159265359f;
if (sflip) r = 6.28318530718f - r;
const bool cflip = r > 1.57079632679f;
if (cflip) r = 3.14159265359f - r;
const float t2 = r * r;
const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f);
const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r;
(cos_out)[i] = cflip ? -c : c;
(sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

enum List : uint8_t {
Top = 0b00u,
Down = 0b01u,
Left = 0b10u,
Right = 0b11u
};

__declspec(align(4)) struct Step {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};

__declspec(align(4)) struct InvStep {
const uint8_t q;
const uint8_t next;
};

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
{ {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
{ {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
{ {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
{ {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
{ {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
{ {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;

text
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& s_x1& s_x2& e_x1& e_x2& Rtop;
}

};

__declspec(align(16)) struct CtrlMsg final {
bool kind;
CrossMsg xchg;

text
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept {
	ar& kind& xchg;
}

};

__declspec(align(16)) struct Slab {
char* const base;
char* current;
char* const end;

text
__forceinline Slab(void* const memory, const size_t usable) :
	base((char*)memory),
	current(base),
	end(base + (usable & ~(size_t)63u)) {
}

};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* memory = _aligned_malloc(16777216u, 16u);
Slab* slab = (Slab*)_aligned_malloc(32u, 16u);
new (slab) Slab(memory, 16777216u);
char* p = slab->base;
while (p < slab->end) {
*p = 0;
p += 4096u;
}
return slab;
});

__declspec(align(16)) struct Peano2DMap {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;

text
__forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st) :
	levels(L),
	a(_a), b(_b), c(_c), d(_d),
	lenx(_b - _a),
	leny(_d - _c),
	inv_lenx(1.0f / (_b - _a)),
	scale((uint32_t)1u << (L << 1)),
	start(st) {
}

};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(16)) struct Interval {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
__forceinline void* operator new(size_t) noexcept {
	Slab* s = tls.local();
	char* r = s->current;
	s->current += 64u;
	return r;
}

__forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept :
	x1(_x1), x2(_x2), y1(_y1), y2(_y2),
	delta_y(_y2 - _y1),
	ordinate_factor(-(y1 + y2) * 2.0f),
	N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)),
	quadratic_term((1.0f / N_factor)* delta_y* delta_y),
	M((1.0f / N_factor)* fabsf(delta_y)) {
}

__forceinline void ChangeCharacteristic(float _m) noexcept {
	R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor));
}

};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept {
return a->R < b->R;
}

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
const __m256 vm = _mm256_set1_ps(m);
__m256 vinvm = _mm256_rcp_ps(vm);
vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

text
size_t i = 0;
const int limit = (int)(n & ~7u);

if (n >= 8u) {
	while (i < (size_t)limit) {
		__declspec(align(32)) float q[8], nf[8], ord[8];
		for (int k = 0; k < 8; ++k) {
			const Interval* p = arr[i + k];
			q[k] = p->quadratic_term;
			nf[k] = p->N_factor;
			ord[k] = p->ordinate_factor;
		}

		const __m256 vq = _mm256_load_ps(q);
		const __m256 vnf = _mm256_load_ps(nf);
		const __m256 vod = _mm256_load_ps(ord);
		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);

		__declspec(align(32)) float out[8];
		_mm256_store_ps(out, res);

		for (int k = 0; k < 8; ++k) {
			arr[i + k]->R = out[k];
		}
		i += 8u;
	}
}

while (i < n) {
	arr[i]->ChangeCharacteristic(m);
	++i;
}

}

__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
const __m256 vGF = _mm256_set1_ps(GF);
const __m256 va = _mm256_set1_ps(alpha);

text
size_t i = 0;
const int limit = (int)(n & ~7u);

if (n >= 8u) {
	while (i < (size_t)limit) {
		__declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8];
		for (int k = 0; k < 8; ++k) {
			const Interval* p = arr[i + k];
			len[k] = p->x2 - p->x1;
			Mv[k] = p->M;
			q[k] = p->quadratic_term;
			nf[k] = p->N_factor;
			ord[k] = p->ordinate_factor;
		}

		const __m256 vlen = _mm256_load_ps(len);
		const __m256 vM = _mm256_load_ps(Mv);
		const __m256 vq = _mm256_load_ps(q);
		const __m256 vnf = _mm256_load_ps(nf);
		const __m256 vod = _mm256_load_ps(ord);

		const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM));
		__m256 vinvm = _mm256_rcp_ps(vm);
		vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));

		const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
		const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);

		__declspec(align(32)) float out[8];
		_mm256_store_ps(out, res);

		for (int k = 0; k < 8; ++k) {
			arr[i + k]->R = out[k];
		}
		i += 8u;
	}
}

while (i < n) {
	const Interval* p = arr[i];
	const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
	arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
	++i;
}

}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((((uint32_t)&diff)) & 0x80000000u) ^ 0x80000000u))));

text
return _N == 1.0f ? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f :
	_N == 2.0f ? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f :
	fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;

}

struct MortonCachePerRank {
std::vector<int> permCache;
std::vector<uint64_t> invMaskCache;
std::vector<uint64_t> levelMasks;
uint32_t baseSeed;

MortonCachePerRank() : baseSeed(0) {}

};

static MortonCachePerRank g_mc;

struct MortonND {
int dim;
int levels;
uint64_t scale;
std::vector<float> low, high;
std::vector<int> perm;
std::vector<uint64_t> invMask;
mutable std::vector<uint64_t> coord;

text
MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc) :
	dim(D), levels(L), scale(1ull),
	low(lows, lows + D),
	high(highs, highs + D),
	perm(mc.permCache.begin(), mc.permCache.begin() + D),
	invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D),
	coord(D, 0ull)
{
	uint64_t level_mask = (L <= 12) ? ((1ull << L) - 1ull) : ((1ull << 12) - 1ull);

	for (int k = 0; k < D; ++k) {
		invMask[k] &= level_mask;
	}

	int maxbits = 63;
	int need = levels * D;
	if (need > maxbits) {
		levels = maxbits / D;
		if (levels < 1) levels = 1;
	}
	scale = 1ull << ((uint64_t)levels * (uint64_t)D);
}

__forceinline void map01ToPoint(float t, float* out) const noexcept {
	if (t < 0.0f) t = 0.0f;
	else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f);

	uint64_t idx = (uint64_t)(t * (double)scale);
	if (idx >= scale) idx = scale - 1;

	for (int d = 0; d < dim; ++d) {
		coord[d] = 0ull;
	}

	for (int bit = 0; bit < levels; ++bit) {
		for (int d = 0; d < dim; ++d) {
			int pd = perm[d];
			uint64_t b = (idx >> ((bit * dim) + d)) & 1ull;
			coord[pd] |= (b << bit);
		}
	}

	for (int d = 0; d < dim; ++d) {
		if (invMask[d]) coord[d] ^= invMask[d];
	}

	const float invC = 1.0f / (float)((uint64_t)1 << levels);
	for (int d = 0; d < dim; ++d) {
		float c0 = low[d] + (high[d] - low[d]) * ((float)coord[d] * invC);
		float cs = (high[d] - low[d]) * invC;
		out[d] = fmaf(0.5f, cs, c0);
	}
}

};

struct ManipCost {
int n;
bool variableLen;
float targetX, targetY;
float minTheta;

text
ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
	: n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {
}

__forceinline float operator()(const float* q, float& out_x, float& out_y) const noexcept {
	const float* th = q;
	const float* L = variableLen ? (q + n) : nullptr;
	float x = 0.0f, y = 0.0f, phi = 0.0f;
	float penC = 0.0f;
	const float Lc = 1.0f;

	std::vector<float> phi_arr(n);
	std::vector<float> s_arr(n), c_arr(n);

	for (int i = 0; i < n; ++i) {
		phi += th[i];
		phi_arr[i] = phi;
	}

	FABE13_SINCOS(phi_arr.data(), s_arr.data(), c_arr.data(), n);

	for (int i = 0; i < n; ++i) {
		float Li = variableLen ? L[i] : Lc;
		x = fmaf(Li, c_arr[i], x);
		y = fmaf(Li, s_arr[i], y);
	}

	for (int i = 0; i < n; ++i) {
		float ai = fabsf(th[i]);
		if (ai < minTheta) {
			float violation = minTheta - ai;
			penC += violation * violation;
		}
	}

	float dx = x - targetX;
	float dy = y - targetY;
	float dist = sqrtf(fmaf(dx, dx, dy * dy));

	out_x = x;
	out_y = y;

	return dist + penC;
}

};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx;
norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
uint32_t idx = (uint32_t)(norm * (float)scale);
idx = idx > scale_minus_1 ? scale_minus_1 : idx;

float sx = lenx, sy = leny;
float x1 = a, x2 = c;
uint8_t type = start;
int l = levels - 1;

while (l >= 0) {
	const uint32_t q = (idx >> (l * 2)) & 3u;
	const Step s = g_step_tbl[type][q];
	type = s.next;
	sx *= 0.5f;
	sy *= 0.5f;
	x1 += s.dx ? sx : 0.0f;
	x2 += s.dy ? sy : 0.0f;
	--l;
}

out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;

}

__forceinline float FindX2D_analytic(float px, float py) {
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
const float clamped_px = fminf(fmaxf(px, a), b);
const float clamped_py = fminf(fmaxf(py, c), d);

float sx = lenx, sy = leny;
float x0 = a, y0 = c;
uint32_t idx = 0u;
uint8_t type = start;
int l = 0;

while (l < levels) {
	sx *= 0.5f;
	sy *= 0.5f;
	const float mx = x0 + sx;
	const float my = y0 + sy;

	const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
	const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
	const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
	const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
	const uint32_t dd = (tr << 1) | tr | tl | (none << 1);

	const InvStep inv = g_inv_tbl[type][dd];
	type = inv.next;
	idx = (idx << 2) | inv.q;

	const uint32_t dx = dd >> 1;
	const uint32_t dy = dd & 1u;
	x0 += dx ? sx : 0.0f;
	y0 += dy ? sy : 0.0f;
	++l;
}

const float scale_recip = 1.0f / (float)scale;
return fmaf((float)idx * scale_recip, lenx, a);

}

static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY) {
const int dim = cost.n + (cost.variableLen ? cost.n : 0);

text
std::vector<float> q(dim);

bestQ.reserve(dim);

float x, y;

int no_improve = 0;

auto evalAt = [&](float t) -> float {
	map.map01ToPoint(t, q.data());
	float f = cost(q.data(), x, y);
	if (f < bestF) {
		bestF = f;
		bestQ = q;
		bestX = x;
		bestY = y;
		no_improve = 0;
	}
	else {
		no_improve++;
	}
	return f;
	};

float a = 0.0f, b = 1.0f;

float f_a = evalAt(a);
float f_b = evalAt(b);

const int K = (std::min)((std::max)(2 * dim, 8), 128);

H.clear();
H.reserve((size_t)maxIter + K + 8);

float Mmax = 1e-3f;
float prev_t = a;
float prev_f = f_a;

for (int k = 1; k <= K; ++k) {
	float t = a + (b - a) * ((float)k / (K + 1));
	float f = evalAt(t);

	Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim);
	Mmax = (std::max)(Mmax, I->M);
	I->ChangeCharacteristic(r * Mmax);
	H.emplace_back(I);
	std::push_heap(H.begin(), H.end(), ComparePtr);

	prev_t = t;
	prev_f = f;
}

Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim);
Mmax = (std::max)(Mmax, tail->M);
float m = r * Mmax;
tail->ChangeCharacteristic(m);
H.emplace_back(tail);
std::push_heap(H.begin(), H.end(), ComparePtr);

float dmax = b - a;
const float initial_len = dmax;
const float thr03 = 0.3f * initial_len;
const float inv_thr03 = 1.0f / thr03;

int it = 0;
const int rank = g_world->rank();
const int world = g_world->size();

while (it < maxIter) {
	const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
	bool stagnation = (no_improve > 100) && (it > 270);

	float k = stagnation
		? fmaf(0.5819767068693265f, expm1f(p), 0.3f)
		: fmaf(0.3491860241215959f, expm1f(p), 0.6f);
	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		CtrlMsg in;
		g_world->recv(boost::mpi::any_source, 0, in);

		if (in.kind) {
			if (!rank) {
				break;
			}
			else {
				return;
			}
		}
		else {
			float sx = in.xchg.s_x1, ex = in.xchg.e_x1;
			if (sx < 0.0f) sx = 0.0f;
			if (ex > 1.0f) ex = 1.0f;
			if (ex <= sx) continue;

			std::vector<float> tmp(dim);
			float tmp_x, tmp_y;
			map.map01ToPoint(sx, tmp.data());
			float y1 = cost(tmp.data(), tmp_x, tmp_y);
			map.map01ToPoint(ex, tmp.data());
			float y2 = cost(tmp.data(), tmp_x, tmp_y);

			Interval* inj = new Interval(sx, ex, y1, y2, (float)dim);
			inj->ChangeCharacteristic(m);

			Interval* top = H.front();
			if (inj->R > 1.15f * top->R) {
				inj->R = in.xchg.Rtop * k;
				H.emplace_back(inj);
				std::push_heap(H.begin(), H.end(), ComparePtr);
			}
		}
	}

	std::pop_heap(H.begin(), H.end(), ComparePtr);
	Interval* cur = H.back();
	H.pop_back();

	const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
	float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r);
	tNew = fminf(fmaxf(tNew, a), b);

	float fNew = evalAt(tNew);

	Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim);
	Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim);

	float Mloc = (std::max)(left->M, right->M);

	if (adaptive) {
		float len1 = tNew - x1, len2 = x2 - tNew;

		if (len1 + len2 == dmax) {
			dmax = (std::max)(len1, len2);
			for (auto p : H) {
				float L = p->x2 - p->x1;
				if (L > dmax) dmax = L;
			}
		}

		if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
			if (Mloc > Mmax) {
				Mmax = Mloc;
				m = r * Mmax;
			}

			const float progress = fmaf(-dmax, inv_thr03, 1.0f);
			const float alpha = progress * progress;
			const float beta = fmaf(-alpha, 1.0f, 2.0f);
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r, -MULT);
			const float GF = fmaf(beta, global_coeff, 0.0f);

			left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha));
			right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha));

			size_t sz = H.size();
			RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha);
			std::make_heap(H.begin(), H.end(), ComparePtr);
		}
		else {
			if (Mloc > Mmax) {
				if (Mloc < 1.15f * Mmax) {
					Mmax = Mloc;
					m = r * Mmax;
					left->ChangeCharacteristic(m);
					right->ChangeCharacteristic(m);
				}
				else {
					Mmax = Mloc;
					m = r * Mmax;
					left->ChangeCharacteristic(m);
					right->ChangeCharacteristic(m);

					size_t sz = H.size();
					RecomputeR_ConstM_AVX2(H.data(), sz, m);
					std::make_heap(H.begin(), H.end(), ComparePtr);
				}
			}
			else {
				left->ChangeCharacteristic(m);
				right->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (Mloc > Mmax) {
			if (Mloc < 1.15f * Mmax) {
				Mmax = Mloc;
				m = r * Mmax;
				left->ChangeCharacteristic(m);
				right->ChangeCharacteristic(m);
			}
			else {
				Mmax = Mloc;
				m = r * Mmax;
				left->ChangeCharacteristic(m);
				right->ChangeCharacteristic(m);

				size_t sz = H.size();
				RecomputeR_ConstM_AVX2(H.data(), sz, m);
				std::make_heap(H.begin(), H.end(), ComparePtr);
			}
		}
		else {
			left->ChangeCharacteristic(m);
			right->ChangeCharacteristic(m);
		}
	}

	H.push_back(left);
	std::push_heap(H.begin(), H.end(), ComparePtr);
	H.push_back(right);
	std::push_heap(H.begin(), H.end(), ComparePtr);

	Interval* top = H.front();
	float interval_len = top->x2 - top->x1;
	const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);

	bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1);

	if (!(it % T) || want_term) {
		CtrlMsg out;
		out.kind = want_term;
		out.xchg.s_x1 = top->x1;
		out.xchg.e_x1 = top->x2;
		out.xchg.s_x2 = 0.0f;
		out.xchg.e_x2 = 0.0f;
		out.xchg.Rtop = top->R;

		for (int i = 0; i < world; ++i) {
			if (i != rank) g_world->isend(i, 0, out);
		}

		if (want_term) {
			break;
		}
	}
	++it;
}

}

struct BestPacket {
float bestF;
int dim;
float bestX;
float bestY;

text
template<typename Archive>
void serialize(Archive& ar, const unsigned int) {
	ar& bestF& dim& bestX& bestY;
}

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
float r, bool adaptiveMode, float epsilon, unsigned int seed,
float** out_bestQ, size_t* out_bestQLen,
float* out_bestX, float* out_bestY, float* out_bestF) {

text
const int dim = nSegments + (variableLengths ? nSegments : 0);

const float thetaMin = -3.14159265359f;
const float thetaMax = 3.14159265359f;
const float lenMin = 0.1f;
const float lenMax = 2.0f;

std::vector<float> low, high;
low.reserve(dim);
high.reserve(dim);

for (int i = 0; i < nSegments; ++i) {
	low.push_back(thetaMin);
	high.push_back(thetaMax);
}

if (variableLengths) {
	for (int i = 0; i < nSegments; ++i) {
		low.push_back(lenMin);
		high.push_back(lenMax);
	}
}

ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);

const int rank = g_world->rank();
const int world = g_world->size();

MortonND map(dim, peanoLevels, low.data(), high.data(), g_mc);

std::vector<Interval*> H;
std::vector<float> bestQ;
float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;

H.reserve((size_t)maxIterPerBranch + 256);
bestQ.reserve(dim);

agp_run_branch_mpi(map, cost, maxIterPerBranch, r, adaptiveMode, epsilon, seed, H, bestQ, bestF, bx, by);

BestPacket me{ bestF, dim, bx, by };

if (!rank) {
	std::vector<float> winnerQ = bestQ;
	float winF = bestF, wx = bx, wy = by;

	for (int i = 1; i < world; ++i) {
		BestPacket bp;
		g_world->recv(i, 2, bp);
		std::vector<float> qin;
		g_world->recv(i, 3, qin);

		if (bp.bestF < winF) {
			winF = bp.bestF;
			wx = bp.bestX;
			wy = bp.bestY;
			winnerQ = qin;
		}
	}

	*out_bestQLen = winnerQ.size();
	*out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
	std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen));
	*out_bestX = wx;
	*out_bestY = wy;
	*out_bestF = winF;
}
else {
	g_world->send(0, 2, me);
	g_world->send(0, 3, bestQ);
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();

text
const int rank = g_world->rank();
const int world_size = g_world->size();

if (world_size == 4) {
	switch (rank) {
	case 0:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Top));  break;
	case 1:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Down)); break;
	case 2:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Left)); break;
	default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, static_cast<uint8_t>(Right));
	}
}

const int MaxDim = 16;
g_mc.baseSeed = 0x9E3779B9u * (rank + 1);

g_mc.permCache.resize(MaxDim);
for (int i = 0; i < MaxDim; ++i) {
	g_mc.permCache[i] = i;
}

g_mc.invMaskCache.resize(MaxDim);
for (int k = 0; k < MaxDim; ++k) {
	g_mc.invMaskCache[k] = 0ull;
}

g_mc.levelMasks.resize(13);
for (int level = 0; level <= 12; ++level) {
	g_mc.levelMasks[level] = (level == 0) ? 0ull : (1ull << level) - 1ull;
}

return rank;

}

__forceinline float ShekelFunc(float x, float seed) {
int i = 0;
float st = seed, r1, r2, res = 0.0f;

text
while (i < 10) {
	XOR_RAND(st, r1);
	float xp = fmaf(-r1, 10.0f, x);
	XOR_RAND(st, r1);
	XOR_RAND(st, r2);
	float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
	d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
	res -= 1.0f / d;
	++i;
}
return res;

}

__forceinline float RastriginFunc(float x1, float x2) {
const float t = fmaf(x1, x1, x2 * x2);
float c1, c2;
FABE13_COS(6.28318530717958647692f * x1, c1);
FABE13_COS(6.28318530717958647692f * x2, c2);
return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
int j = 0;
__declspec(align(32)) float ang[14u];
float st = 6.28318530717958647692f * x;

text
while (j < 14) {
	ang[j] = st * (float)(j + 1);
	++j;
}

__declspec(align(32)) float sv[14u], cv[14u];
FABE13_SINCOS(ang, sv, cv, 14u);

float state = seed, r1, r2;
XOR_RAND(state, r1);
float res = fmaf(r1, 2.0f, -1.1f);
--j;

while (j >= 0) {
	XOR_RAND(state, r1);
	XOR_RAND(state, r2);
	res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
	--j;
}
return res;

}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
int j = 0;
__declspec(align(32)) float aj[8u], ak[8u];

text
while (j < 8) {
	float pj = 3.14159265358979323846f * (float)(j + 1);
	aj[j] = pj * x1;
	ak[j] = pj * x2;
	++j;
}

__declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u];
FABE13_SINCOS(aj, sj, cj, 8u);
FABE13_SINCOS(ak, sk, ck, 8u);
--j;

float p1 = 0.0f, p2 = 0.0f;
float st = seed, r1, r2;

while (j >= 0) {
	size_t k = 0u;
	while (k < 8u) {
		float s = sj[j] * sj[j];
		float c = ck[k] * ck[k];
		XOR_RAND_GRSH(st, r1);
		XOR_RAND_GRSH(st, r2);
		p1 = fmaf(r1, s, fmaf(r2, c, p1));
		XOR_RAND_GRSH(st, r1);
		XOR_RAND_GRSH(st, r2);
		p2 = fmaf(-r1, c, fmaf(r2, s, p2));
		++k;
	}
	--j;
}
return -sqrtf(fmaf(p1, p1, p2 * p2));

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
Slab* slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;

std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R;
Extr.reserve((size_t)global_iterations << 2u);
R.reserve((size_t)global_iterations << 1u);
R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));

float Mmax = R.front()->M;
float m = r * Mmax;

while (true) {
	const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
	const float new_value = ShekelFunc(new_point, seed);

	if (new_value < best_f) {
		best_f = new_value;
		Extr.emplace_back(best_f);
		Extr.emplace_back(new_point);
	}

	std::pop_heap(R.begin(), R.end(), ComparePtr);
	const Interval* pro = R.back();
	const float new_x1 = pro->x1;
	const float new_x2 = pro->x2;
	const float len2 = new_x2 - new_point;
	const float len1 = new_point - new_x1;
	const float interval_len = len1 < len2 ? len1 : len2;

	if (++schetchick == (int)global_iterations || interval_len < epsilon) {
		Extr.emplace_back((float)schetchick);
		Extr.emplace_back(interval_len);
		*out_len = Extr.size();
		*out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len));
		memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
		return;
	}

	Interval* curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f);
	Interval* curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f);
	const float currM = curr->M > curr1->M ? curr->M : curr1->M;
	const size_t r_size = R.size();

	if (mode) {
		if (len2 + len1 == dmax) {
			dmax = len2 > len1 ? len2 : len1;
			for (auto p : R) {
				float L = p->x2 - p->x1;
				if (L > dmax) dmax = L;
			}
		}

		if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
			if (currM > Mmax) {
				Mmax = currM;
				m = r * Mmax;
			}

			const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
			const float alpha = progress * progress;
			const float betta = 2.0f - alpha;
			const float MULT = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULT, r, -MULT);
			const float GF = betta * global_coeff;

			curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
			curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
			RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha);
			std::make_heap(R.begin(), R.end(), ComparePtr);
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
				}
				else {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
					RecomputeR_ConstM_AVX2(R.data(), r_size, m);
					std::make_heap(R.begin(), R.end(), ComparePtr);
				}
			}
			else {
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (currM > Mmax) {
			if (currM < 1.15f * Mmax) {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
			else {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
				RecomputeR_ConstM_AVX2(R.data(), r_size, m);
				std::make_heap(R.begin(), R.end(), ComparePtr);
			}
		}
		else {
			curr->ChangeCharacteristic(m);
			curr1->ChangeCharacteristic(m);
		}
	}

	R.back() = curr;
	std::push_heap(R.begin(), R.end(), ComparePtr);
	R.emplace_back(curr1);
	std::push_heap(R.begin(), R.end(), ComparePtr);

	const Interval* top = R.front();
	x_Rmax_1 = top->x1;
	x_Rmax_2 = top->x2;
	y_Rmax_1 = top->y1;
	y_Rmax_2 = top->y2;
}

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
const float N,
const float global_iterations,
const float a,
const float b,
const float c,
const float d,
const float r,
const bool mode,
const float epsilon,
const float seed,
float** const __restrict out_data,
size_t* const __restrict out_len) noexcept
{
Slab* const __restrict slab = tls.local();
slab->current = slab->base;

text
int schetchick = 0;
int no_improve = 0;
const int rank = g_world->rank();
const int world_size = g_world->size();

while (g_world->iprobe(boost::mpi::any_source, 0)) {
	CtrlMsg dummy;
	g_world->recv(boost::mpi::any_source, 0, dummy);
}

const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
const float x_addition = (b - a) * inv_divider;
const float y_addition = (d - c) * inv_divider;
const float true_start = a + x_addition;
const float true_end = b - x_addition;

float x_Rmax_1 = true_start;
float x_Rmax_2 = true_end;
const float initial_length = x_Rmax_2 - x_Rmax_1;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;

std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
Extr.clear();
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
R.clear();
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
const Interval* __restrict top_ptr;

float Mmax = R.front()->M;
float m = r * Mmax;

while (true) {
	const float interval_len = x_Rmax_2 - x_Rmax_1;
	const bool stagnation = no_improve > 100 && schetchick > 270;
	const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);

	while (g_world->iprobe(boost::mpi::any_source, 0)) {
		CtrlMsg in;
		g_world->recv(boost::mpi::any_source, 0, in);

		if (in.kind) {
			if (!rank) {
				Extr.emplace_back(static_cast<float>(schetchick));
				Extr.emplace_back(interval_len);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}

		const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
		const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
		Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
		injected->ChangeCharacteristic(m);

		if (injected->R > 1.15f * top_ptr->R) {
			const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
			injected->R = in.xchg.Rtop * k;
			R.emplace_back(injected);
			std::push_heap(R.begin(), R.end(), ComparePtr);
		}
	}

	const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
	const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);

	if (!(++schetchick % T) || stagnation || want_term) {
		CtrlMsg out;
		out.kind = want_term;

		if (!out.kind) {
			float s_x1, s_x2, e_x1, e_x2;
			HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
			HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
			out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
		}

		for (int i = 0; i < world_size; ++i) {
			if (i != rank) g_world->isend(i, 0, out);
		}

		if (out.kind) {
			if (!rank) {
				Extr.emplace_back(static_cast<float>(schetchick));
				Extr.emplace_back(interval_len);
				*out_len = Extr.size();
				*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
				memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			}
			return;
		}
	}

	const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
	float new_x1_val, new_x2_val;
	HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
	const float new_value = RastriginFunc(new_x1_val, new_x2_val);

	if (new_value < best_f) {
		best_f = new_value;
		Extr.emplace_back(best_f);
		Extr.emplace_back(new_x1_val);
		Extr.emplace_back(new_x2_val);
		no_improve = 0;
	}
	else {
		++no_improve;
	}

	std::pop_heap(R.begin(), R.end(), ComparePtr);
	Interval* const __restrict promejutochny_otrezok = R.back();

	const float segment_x1 = promejutochny_otrezok->x1;
	const float segment_x2 = promejutochny_otrezok->x2;
	const float len2 = segment_x2 - new_point;
	const float len1 = new_point - segment_x1;

	Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
	Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
	const float currM = (std::max)(curr->M, curr1->M);
	const size_t r_size = R.size();

	if (mode) {
		if (len2 + len1 == dmax) {
			dmax = (std::max)(len1, len2);
			for (auto p : R) {
				float L = p->x2 - p->x1;
				if (L > dmax) dmax = L;
			}
		}

		if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
			if (currM > Mmax) {
				Mmax = currM;
				m = r * Mmax;
			}

			const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
			const float alpha = progress * progress;
			const float betta = 2.0f - alpha;
			const float MULTIPLIER = (1.0f / dmax) * Mmax;
			const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
			const float GLOBAL_FACTOR = betta * global_coeff;

			curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
			curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
			RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
			std::make_heap(R.begin(), R.end(), ComparePtr);
		}
		else {
			if (currM > Mmax) {
				if (currM < 1.15f * Mmax) {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
				}
				else {
					Mmax = currM;
					m = r * Mmax;
					curr->ChangeCharacteristic(m);
					curr1->ChangeCharacteristic(m);
					RecomputeR_ConstM_AVX2(R.data(), r_size, m);
					std::make_heap(R.begin(), R.end(), ComparePtr);
				}
			}
			else {
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
		}
	}
	else {
		if (currM > Mmax) {
			if (currM < 1.15f * Mmax) {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
			}
			else {
				Mmax = currM;
				m = r * Mmax;
				curr->ChangeCharacteristic(m);
				curr1->ChangeCharacteristic(m);
				RecomputeR_ConstM_AVX2(R.data(), r_size, m);
				std::make_heap(R.begin(), R.end(), ComparePtr);
			}
		}
		else {
			curr->ChangeCharacteristic(m);
			curr1->ChangeCharacteristic(m);
		}
	}

	R.back() = curr;
	std::push_heap(R.begin(), R.end(), ComparePtr);
	R.emplace_back(curr1);
	std::push_heap(R.begin(), R.end(), ComparePtr);

	top_ptr = R.front();
	x_Rmax_1 = top_ptr->x1;
	x_Rmax_2 = top_ptr->x2;
	y_Rmax_1 = top_ptr->y1;
	y_Rmax_2 = top_ptr->y2;
}

}

struct RunParams {
int nSegments;
uint8_t varLen;
float minTheta;
float tx, ty;
int levels, maxIter;
float r;
uint8_t adaptive;
float eps;
unsigned int seed;

text
template<typename Archive>
void serialize(Archive& ar, const unsigned int) {
	ar& nSegments& varLen& minTheta& tx& ty& levels& maxIter& r& adaptive& eps& seed;
}

};

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(
int nSegments,
bool variableLengths,
float minTheta,
float targetX,
float targetY,
int peanoLevels,
int maxIterPerBranch,
float r,
bool adaptiveMode,
float epsilon,
unsigned int seed)
{
RunParams p;
p.nSegments = nSegments;
p.varLen = (uint8_t)variableLengths;
p.minTheta = minTheta;
p.tx = targetX;
p.ty = targetY;
p.levels = peanoLevels;
p.maxIter = maxIterPerBranch;
p.r = r;
p.adaptive = (uint8_t)adaptiveMode;
p.eps = epsilon;
p.seed = seed;

text
const int rank = g_world->rank();
const int world = g_world->size();

for (int i = 1; i < world; ++i) {
	if (i != rank) g_world->isend(i, 1, p);
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
while (true) {
if (g_world->iprobe(0, 1)) {
RunParams p;
g_world->recv(0, 1, p);
float* q = nullptr;
size_t qlen = 0;
float bx = 0, by = 0, bf = 0;

text
		AGP_Manip2D(
			p.nSegments, (bool)p.varLen, p.minTheta,
			p.tx, p.ty, p.levels, p.maxIter, p.r,
			(bool)p.adaptive, p.eps, p.seed,
			&q, &qlen, &bx, &by, &bf
		);

		if (q) CoTaskMemFree(q);
	}
	Sleep(0);
}

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() {
int i = 1;
const int world = g_world->size();

text
while (i < world) {
	g_world->isend(i, 1, 0);
	++i;
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) {
CoTaskMemFree(p);
} - обрати внимание на многомерную версию АГП, одномерная и двумерная версии уже достаточно оптимизированы, но многомерная сырая - нужно выжать из неё максимум - я готов пойти на всё ради абсолютной скорости и алгоритмической корректности жертвуя читаемостью/поддерживаемостью/безопасностью/переносимостью, ты должен выровнять все структуры учитывая что я использую dll в управляемом winforms коде, ты должен везде где это разумно использовать fmaf, AVX для особо крупных блоков не имеет смысла - размерность задачи никогда точно не перевалит за 15 даже с учётом переменных длин, а глубина развёртки никогда не будет больше 12-13 - ты должен учитывать эти факты для более агрессивной оптимизации, используй предвычисления, jump-table, по максимуму загружай регистры и быстрый кеш желательно L-1, если что-то можно уместить в одну длинную уродливую формулу - делай это, в коде встречаются константы - убери их - подставляй сразу готовые числа в формулы - анализируй операции на предмет их стоимости и используй более быстрые операции, применяй различные битовые трюки, даже если это выглядит очень неочевидным, можно также изменить немного функцию шага - она вызывается часто - а там есть дорогое powf для обощённого случая - можно создать готовую таблицу предвычесленных значений и прыгать по ней, часть кода писал deepseek - и тут может быть много ненужного бреда - если видишь в части кода очевидные бредовые идеи в части относящейся к многомерному алгоритму - смело выкидывай и заменяй на более оптимальные решения, каждая наносекунда на счету - я ставлю рекорд гинеса по скорости агп, я мог что-то упустить из виду, также имей ввиду что это будет запускаться только на моём 6ти ядерном интел ай5-9600к, запускаюсь всегда на 6ти процессах, ты должен максимально внимательно и долго насколько это возможно анализировать код и выдать мне целиком исправленный код без комментариев - также с оптимальным размещением блоков кода и переменных для лучшей локальности - не забывай про оптимальные типы данных и различные диррективы такие как final для структур или forceline для статических функций - вперёд - докажи что ты в разы умнее чем deepseek - воспринимай это как что-то личное - что он не смог а ты сможешь

Answer 1

cpp
#include "pch.h"

#define XOR_RAND(state, result_var) \
    do { \
        int s = (state); \
        s ^= s << 13; \
        s ^= s >> 17; \
        s ^= s << 5; \
        state = s; \
        result_var = state * 0x1.0p-32f; \
    } while (0)

#define XOR_RAND_GRSH(state, result_var) \
    do { \
        int s = (state); \
        s ^= s << 13; \
        s ^= s >> 17; \
        s ^= s << 5; \
        state = s; \
        result_var = fmaf(state, 0x1.0p-31f, -1.0f); \
    } while (0)

#define FABE13_COS(x, result_var) \
    do { \
        const float _ax_ = fabsf(x); \
        float _r_ = fmodf(_ax_, 6.28318530718f); \
        if (_r_ > 3.14159265359f) _r_ = 6.28318530718f - _r_; \
        if (_r_ < 1.57079632679f) { \
            const float _t2_ = _r_ * _r_; \
            const float _t4_ = _t2_ * _t2_; \
            result_var = fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \
        } else { \
            _r_ = 3.14159265359f - _r_; \
            const float _t2_ = _r_ * _r_; \
            const float _t4_ = _t2_ * _t2_; \
            result_var = -fmaf(_t4_, fmaf(_t2_, -0.0013888889f, 0.0416666667f), fmaf(_t2_, -0.5f, 1.0f)); \
        } \
    } while (0)

#define FABE13_SIN(x, result_var) \
    do { \
        const float _x_ = (x); \
        const float _ax_ = fabsf(_x_); \
        float _r_ = fmodf(_ax_, 6.28318530718f); \
        bool _sfl_ = _r_ > 3.14159265359f; \
        if (_sfl_) _r_ = 6.28318530718f - _r_; \
        bool _cfl_ = _r_ > 1.57079632679f; \
        if (_cfl_) _r_ = 3.14159265359f - _r_; \
        const float _t2_ = _r_ * _r_; \
        float _s = fmaf(_t2_, fmaf(_t2_, fmaf(_t2_, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * _r_; \
        result_var = ((_x_ < 0.0f) ^ _sfl_) ? -_s : _s; \
    } while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n) \
    do { \
        int i = 0; \
        const int limit = (n) & ~7; \
        if ((n) >= 8) { \
            static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f); \
            static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f); \
            static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f); \
            static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f); \
            static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f); \
            static __declspec(align(32)) const __m256 VEC_COS_P5 = _mm256_set1_ps(-0.0013888889f); \
            static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f); \
            static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f); \
            static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P5 = _mm256_set1_ps(-0.0001984127f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P3 = _mm256_set1_ps(0.0083333333f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f); \
            static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f); \
            static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps(); \
            while (i < limit) { \
                const __m256 vx = _mm256_load_ps(&(in)[i]); \
                const __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx); \
                __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS); \
                q = _mm256_sub_ps(q, BIAS); \
                const __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax); \
                const __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r)); \
                const __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1)); \
                const __m256 t2 = _mm256_mul_ps(r2, r2); \
                const __m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P5, VEC_COS_P3), VEC_COS_P1), VEC_COS_P0); \
                const __m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_SIN_P5, VEC_SIN_P3), VEC_SIN_P1), VEC_SIN_P0), r2); \
                const __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ); \
                const __m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ)); \
                _mm256_store_ps(&(cos_out)[i], _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip)); \
                _mm256_store_ps(&(sin_out)[i], _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip)); \
                i += 8; \
            } \
        } \
        while (i < (n)) { \
            const float x = (in)[i]; \
            const float ax = fabsf(x); \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f); \
            q -= 12582912.0f; \
            float r = fmaf(-6.28318530718f, q, ax); \
            const bool sflip = r > 3.14159265359f; \
            if (sflip) r = 6.28318530718f - r; \
            const bool cflip = r > 1.57079632679f; \
            if (cflip) r = 3.14159265359f - r; \
            const float t2 = r * r; \
            const float c = fmaf(t2, fmaf(t2, fmaf(t2, -0.0013888889f, 0.0416666667f), -0.5f), 1.0f); \
            const float s = fmaf(t2, fmaf(t2, fmaf(t2, -0.0001984127f, 0.0083333333f), -0.16666666f), 1.0f) * r; \
            (cos_out)[i] = cflip ? -c : c; \
            (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s; \
            ++i; \
        } \
    } while (0)

enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(4)) struct Step final { const uint8_t next; const uint8_t dx; const uint8_t dy; };
__declspec(align(4)) struct InvStep final { const uint8_t q; const uint8_t next; };

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
    { {Right, 0u, 0u}, {Top, 0u, 1u}, {Top, 1u, 1u}, {Left, 1u, 0u} },
    { {Left, 1u, 1u}, {Down, 1u, 0u}, {Down, 0u, 0u}, {Right, 0u, 1u} },
    { {Down, 1u, 1u}, {Left, 0u, 1u}, {Left, 0u, 0u}, {Top, 1u, 0u} },
    { {Top, 0u, 0u}, {Right, 1u, 0u}, {Right, 1u, 1u}, {Down, 0u, 1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
    { {0u, Right}, {1u, Top}, {3u, Left}, {2u, Top} },
    { {2u, Down}, {3u, Right}, {1u, Down}, {0u, Left} },
    { {2u, Left}, {1u, Left}, {3u, Top}, {0u, Down} },
    { {0u, Top}, {3u, Down}, {1u, Right}, {2u, Right} }
};

static const boost::mpi::environment* g_env;
static const boost::mpi::communicator* g_world;

__declspec(align(16)) struct CrossMsg final {
    float s_x1, s_x2;
    float e_x1, e_x2;
    float Rtop;
    template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar & s_x1 & s_x2 & e_x1 & e_x2 & Rtop; }
};

__declspec(align(16)) struct CtrlMsg final {
    bool kind;
    CrossMsg xchg;
    template<typename Archive> __declspec(noalias) __forceinline void serialize(Archive& ar, const unsigned int) noexcept { ar & kind & xchg; }
};

__declspec(align(16)) struct Slab final {
    char* const base;
    char* current;
    char* const end;
    __forceinline Slab(void* const memory, const size_t usable) : base((char*)memory), current(base), end(base + (usable & ~(size_t)63u)) {}
};

static tbb::enumerable_thread_specific<Slab*> tls([]() noexcept {
    void* memory = _aligned_malloc(16777216u, 64u);
    Slab* slab = (Slab*)_aligned_malloc(32u, 64u);
    new (slab) Slab(memory, 16777216u);
    char* p = slab->base;
    while (p < slab->end) { *p = 0; p += 4096u; }
    return slab;
});

__declspec(align(16)) struct Peano2DMap final {
    const int levels;
    const float a, b, c, d;
    const float lenx, leny;
    const float inv_lenx;
    const uint32_t scale;
    const uint8_t start;
    __forceinline Peano2DMap(int L, float _a, float _b, float _c, float _d, uint8_t st)
        : levels(L), a(_a), b(_b), c(_c), d(_d), lenx(_b - _a), leny(_d - _c), inv_lenx(1.0f / (_b - _a)), scale((uint32_t)1u << (L << 1)), start(st) {}
};

static Peano2DMap gActiveMap(0, 0, 0, 0, 0, 0);

__forceinline bool ComparePtr(const struct Interval* a, const struct Interval* b) noexcept;

__declspec(align(64)) struct Interval final {
    const float x1;
    const float x2;
    const float y1;
    const float y2;
    const float delta_y;
    const float ordinate_factor;
    const float N_factor;
    const float quadratic_term;
    const float M;
    float R;
    __forceinline void* operator new(size_t) noexcept { Slab* s = tls.local(); char* r = s->current; s->current += 64u; return r; }
    __forceinline Interval(float _x1, float _x2, float _y1, float _y2, float _N) noexcept
        : x1(_x1), x2(_x2), y1(_y1), y2(_y2),
          delta_y(_y2 - _y1),
          ordinate_factor(-(y1 + y2) * 2.0f),
          N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)),
          quadratic_term((1.0f / N_factor) * delta_y * delta_y),
          M((1.0f / N_factor) * fabsf(delta_y)) {}
    __forceinline void ChangeCharacteristic(float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};

__forceinline bool ComparePtr(const Interval* a, const Interval* b) noexcept { return a->R < b->R; }

__forceinline void RecomputeR_ConstM_AVX2(Interval* const* arr, size_t n, float m) {
    const __m256 vm = _mm256_set1_ps(m);
    __m256 vinvm = _mm256_rcp_ps(vm);
    vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
    size_t i = 0;
    const int limit = (int)(n & ~7u);
    if (n >= 8u) {
        while (i < (size_t)limit) {
            __declspec(align(32)) float q[8], nf[8], ord[8];
            for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; }
            const __m256 vq = _mm256_load_ps(q);
            const __m256 vnf = _mm256_load_ps(nf);
            const __m256 vod = _mm256_load_ps(ord);
            const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
            const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
            __declspec(align(32)) float out[8];
            _mm256_store_ps(out, res);
            for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
            i += 8u;
        }
    }
    while (i < n) { arr[i]->ChangeCharacteristic(m); ++i; }
}

__forceinline void RecomputeR_AffineM_AVX2(Interval* const* arr, size_t n, float GF, float alpha) {
    const __m256 vGF = _mm256_set1_ps(GF);
    const __m256 va = _mm256_set1_ps(alpha);
    size_t i = 0;
    const int limit = (int)(n & ~7u);
    if (n >= 8u) {
        while (i < (size_t)limit) {
            __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8];
            for (int k = 0; k < 8; ++k) { const Interval* p = arr[i + k]; len[k] = p->x2 - p->x1; Mv[k] = p->M; q[k] = p->quadratic_term; nf[k] = p->N_factor; ord[k] = p->ordinate_factor; }
            const __m256 vlen = _mm256_load_ps(len);
            const __m256 vM = _mm256_load_ps(Mv);
            const __m256 vq = _mm256_load_ps(q);
            const __m256 vnf = _mm256_load_ps(nf);
            const __m256 vod = _mm256_load_ps(ord);
            const __m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM));
            __m256 vinvm = _mm256_rcp_ps(vm);
            vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f)));
            const __m256 t = _mm256_fmadd_ps(vm, vnf, vod);
            const __m256 res = _mm256_fmadd_ps(vq, vinvm, t);
            __declspec(align(32)) float out[8];
            _mm256_store_ps(out, res);
            for (int k = 0; k < 8; ++k) arr[i + k]->R = out[k];
            i += 8u;
        }
    }
    while (i < n) {
        const Interval* p = arr[i];
        const float mi = fmaf(GF, (p->x2 - p->x1), p->M * alpha);
        arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
        ++i;
    }
}

__forceinline float fast_pow_int(float v, int n) {
    float r;
    switch (n) {
        case 1: r = v; break;
        case 2: r = v * v; break;
        case 3: { float v2 = v * v; r = v2 * v; } break;
        case 4: { float v2 = v * v; r = v2 * v2; } break;
        case 5: { float v2 = v * v; r = v2 * v2 * v; } break;
        case 6: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2; } break;
        case 7: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v2 * v; } break;
        case 8: { float v2 = v * v; float v4 = v2 * v2; r = v4 * v4; } break;
        case 9: { float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v3; } break;
        case 10:{ float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2; } break;
        case 11:{ float v2 = v * v; float v4 = v2 * v2; float v8 = v4 * v4; r = v8 * v2 * v; } break;
        case 12:{ float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6; } break;
        case 13:{ float v3 = v * v * v; float v6 = v3 * v3; r = v6 * v6 * v; } break;
        case 14:{ float v7 = v * v * v * v * v * v * v; r = v7 * v7; } break;
        default:{ float v7 = v * v * v * v * v * v * v; r = v7 * v7 * v; } break;
    }
    return r;
}

__forceinline float Shag(float _m, float x1, float x2, float y1, float y2, float _N, float _r) {
    const float diff = y2 - y1;
    const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(0x3F800000u | (((*((uint32_t*)&diff)) & 0x80000000u) ^ 0x80000000u))));
    const int Ni = (int)(_N + 0.5f);
    if (Ni == 1) return fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f;
    if (Ni == 2) return fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f;
    const float invmN = 1.0f / fast_pow_int(_m, Ni);
    const float dN = fast_pow_int(fabsf(diff), Ni);
    return fmaf(sign_mult * invmN, dN * _r, x1 + x2) * 0.5f;
}

struct MortonCachePerRank { std::vector<int> permCache; std::vector<uint64_t> invMaskCache; std::vector<uint64_t> levelMasks; uint32_t baseSeed; MortonCachePerRank() : baseSeed(0) {} };
static MortonCachePerRank g_mc;

struct MortonND final {
    int dim;
    int levels;
    uint64_t scale;
    std::vector<float> low, high;
    std::vector<float> step, baseOff;
    std::vector<int> perm;
    std::vector<uint64_t> invMask;
    std::vector<uint64_t> pextMask;
    float invScaleLevel;
    __forceinline MortonND(int D, int L, const float* lows, const float* highs, const MortonCachePerRank& mc)
        : dim(D), levels(L), scale(1ull), low(lows, lows + D), high(highs, highs + D),
          step(D, 0.0f), baseOff(D, 0.0f), perm(mc.permCache.begin(), mc.permCache.begin() + D),
          invMask(mc.invMaskCache.begin(), mc.invMaskCache.begin() + D), pextMask(D, 0ull), invScaleLevel(0.0f)
    {
        uint64_t level_mask = (L <= 12) ? ((1ull << L) - 1ull) : ((1ull << 12) - 1ull);
        for (int k = 0; k < D; ++k) invMask[k] &= level_mask;
        int maxbits = 63;
        int need = L * D;
        if (need > maxbits) { levels = maxbits / D; if (levels < 1) levels = 1; }
        scale = 1ull << (uint64_t)(levels * D);
        invScaleLevel = 1.0f / (float)((uint64_t)1 << levels);
        for (int d = 0; d < dim; ++d) {
            float rng = high[d] - low[d];
            float st = rng * invScaleLevel;
            step[d] = st;
            baseOff[d] = fmaf(0.5f, st, low[d]);
        }
        for (int d = 0; d < dim; ++d) {
            uint64_t m = 0ull;
            uint64_t bitpos = (uint64_t)d;
            for (int b = 0; b < levels; ++b) { m |= 1ull << bitpos; bitpos += (uint64_t)dim; }
            pextMask[d] = m;
        }
    }
    __forceinline void map01ToPoint(float t, float* __restrict out) const noexcept {
        if (t < 0.0f) t = 0.0f; else if (t >= 1.0f) t = nextafterf(1.0f, 0.0f);
        uint64_t idx = (uint64_t)((double)t * (double)scale);
        if (idx >= scale) idx = scale - 1;
        for (int d = 0; d < dim; ++d) {
            const int pd = perm[d];
            uint64_t bits = _pext_u64(idx, pextMask[d]);
            if (invMask[pd]) bits ^= invMask[pd];
            out[pd] = fmaf(step[pd], (float)bits, baseOff[pd]);
        }
    }
};

struct ManipCost final {
    int n;
    bool variableLen;
    float targetX, targetY;
    float minTheta;
    __forceinline ManipCost(int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta)
        : n(_n), variableLen(_variableLen), targetX(_targetX), targetY(_targetY), minTheta(_minTheta) {}
    __forceinline float operator()(const float* __restrict q, float& out_x, float& out_y) const noexcept {
        const float* th = q;
        const float* L = variableLen ? (q + n) : nullptr;
        __declspec(align(64)) float phi[16];
        __declspec(align(64)) float s_arr[16];
        __declspec(align(64)) float c_arr[16];
        float x = 0.0f, y = 0.0f, phi_acc = 0.0f, penC = 0.0f;
        for (int i = 0; i < n; ++i) { phi_acc += th[i]; phi[i] = phi_acc; }
        FABE13_SINCOS(phi, s_arr, c_arr, n);
        const float Lc = 1.0f;
        if (variableLen) {
            for (int i = 0; i < n; ++i) { float Li = L[i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); }
        } else {
            for (int i = 0; i < n; ++i) { x = fmaf(Lc, c_arr[i], x); y = fmaf(Lc, s_arr[i], y); }
        }
        for (int i = 0; i < n; ++i) { float ai = fabsf(th[i]); float v = minTheta - ai; float u = v > 0.0f ? v : 0.0f; penC = fmaf(u, u, penC); }
        float dx = x - targetX;
        float dy = y - targetY;
        float dist = sqrtf(fmaf(dx, dx, dy * dy));
        out_x = x; out_y = y;
        return dist + penC;
    }
};

__forceinline void HitTest2D_analytic(float x_param, float& out_x1, float& out_x2) {
    const float a = gActiveMap.a;
    const float inv_lenx = gActiveMap.inv_lenx;
    const uint32_t scale = gActiveMap.scale;
    const uint32_t scale_minus_1 = scale - 1u;
    const float lenx = gActiveMap.lenx;
    const float leny = gActiveMap.leny;
    const float c = gActiveMap.c;
    const uint8_t start = gActiveMap.start;
    const int levels = gActiveMap.levels;
    float norm = (x_param - a) * inv_lenx;
    norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f);
    uint32_t idx = (uint32_t)(norm * (float)scale);
    idx = idx > scale_minus_1 ? scale_minus_1 : idx;
    float sx = lenx, sy = leny;
    float x1 = a, x2 = c;
    uint8_t type = start;
    int l = levels - 1;
    while (l >= 0) {
        const uint32_t q = (idx >> (l * 2)) & 3u;
        const Step s = g_step_tbl[type][q];
        type = s.next;
        sx *= 0.5f; sy *= 0.5f;
        x1 += s.dx ? sx : 0.0f;
        x2 += s.dy ? sy : 0.0f;
        --l;
    }
    out_x1 = x1 + sx * 0.5f;
    out_x2 = x2 + sy * 0.5f;
}

__forceinline float FindX2D_analytic(float px, float py) {
    const float a = gActiveMap.a;
    const float b = gActiveMap.b;
    const float c = gActiveMap.c;
    const float d = gActiveMap.d;
    const float lenx = gActiveMap.lenx;
    const float leny = gActiveMap.leny;
    const uint32_t scale = gActiveMap.scale;
    const uint8_t start = gActiveMap.start;
    const int levels = gActiveMap.levels;
    const float clamped_px = fminf(fmaxf(px, a), b);
    const float clamped_py = fminf(fmaxf(py, c), d);
    float sx = lenx, sy = leny;
    float x0 = a, y0 = c;
    uint32_t idx = 0u;
    uint8_t type = start;
    int l = 0;
    while (l < levels) {
        sx *= 0.5f; sy *= 0.5f;
        const float mx = x0 + sx;
        const float my = y0 + sy;
        const uint32_t tr = (uint32_t)((clamped_px > mx) & (clamped_py > my));
        const uint32_t tl = (uint32_t)((clamped_px < mx) & (clamped_py > my));
        const uint32_t dl = (uint32_t)((clamped_px < mx) & (clamped_py < my));
        const uint32_t none = (uint32_t)(1u ^ (tr | tl | dl));
        const uint32_t dd = (tr << 1) | tr | tl | (none << 1);
        const InvStep inv = g_inv_tbl[type][dd];
        type = inv.next;
        idx = (idx << 2) | inv.q;
        const uint32_t dx = dd >> 1;
        const uint32_t dy = dd & 1u;
        x0 += dx ? sx : 0.0f;
        y0 += dy ? sy : 0.0f;
        ++l;
    }
    const float scale_recip = 1.0f / (float)scale;
    return fmaf((float)idx * scale_recip, lenx, a);
}

static __forceinline void agp_run_branch_mpi(const MortonND& map, const ManipCost& cost, int maxIter, float r, bool adaptive, float eps, unsigned seed, std::vector<Interval*>& H, std::vector<float>& bestQ, float& bestF, float& bestX, float& bestY) {
    const int dim = cost.n + (cost.variableLen ? cost.n : 0);
    __declspec(align(64)) float q_local[16];
    bestQ.reserve(dim);
    float x, y;
    int no_improve = 0;
    auto evalAt = [&](float t) __forceinline -> float {
        map.map01ToPoint(t, q_local);
        float f = cost(q_local, x, y);
        if (f < bestF) { bestF = f; bestQ.assign(q_local, q_local + dim); bestX = x; bestY = y; no_improve = 0; }
        else { no_improve++; }
        return f;
    };
    float a = 0.0f, b = 1.0f;
    float f_a = evalAt(a);
    float f_b = evalAt(b);
    const int K = (std::min)((std::max)(2 * dim, 8), 128);
    H.clear();
    H.reserve((size_t)maxIter + K + 8);
    float Mmax = 1e-3f;
    float prev_t = a;
    float prev_f = f_a;
    for (int k = 1; k <= K; ++k) {
        float t = a + (b - a) * ((float)k / (K + 1));
        float f = evalAt(t);
        Interval* I = new Interval(prev_t, t, prev_f, f, (float)dim);
        Mmax = (std::max)(Mmax, I->M);
        I->ChangeCharacteristic(r * Mmax);
        H.emplace_back(I);
        std::push_heap(H.begin(), H.end(), ComparePtr);
        prev_t = t; prev_f = f;
    }
    Interval* tail = new Interval(prev_t, b, prev_f, f_b, (float)dim);
    Mmax = (std::max)(Mmax, tail->M);
    float m = r * Mmax;
    tail->ChangeCharacteristic(m);
    H.emplace_back(tail);
    std::push_heap(H.begin(), H.end(), ComparePtr);
    float dmax = b - a;
    const float initial_len = dmax;
    const float thr03 = 0.3f * initial_len;
    const float inv_thr03 = 1.0f / thr03;
    int it = 0;
    const int rank = g_world->rank();
    const int world = g_world->size();
    while (it < maxIter) {
        const float p = fmaf(-1.0f / initial_len, dmax, 1.0f);
        bool stagnation = (no_improve > 100) && (it > 270);
        float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
        while (g_world->iprobe(boost::mpi::any_source, 0)) {
            CtrlMsg in;
            g_world->recv(boost::mpi::any_source, 0, in);
            if (in.kind) { if (!rank) break; else return; }
            else {
                float sx = in.xchg.s_x1, ex = in.xchg.e_x1;
                if (sx < 0.0f) sx = 0.0f;
                if (ex > 1.0f) ex = 1.0f;
                if (ex <= sx) continue;
                __declspec(align(64)) float tmp[16];
                float tmp_x, tmp_y;
                map.map01ToPoint(sx, tmp);
                float y1 = cost(tmp, tmp_x, tmp_y);
                map.map01ToPoint(ex, tmp);
                float y2 = cost(tmp, tmp_x, tmp_y);
                Interval* inj = new Interval(sx, ex, y1, y2, (float)dim);
                inj->ChangeCharacteristic(m);
                Interval* top = H.front();
                if (inj->R > 1.15f * top->R) {
                    inj->R = in.xchg.Rtop * k;
                    H.emplace_back(inj);
                    std::push_heap(H.begin(), H.end(), ComparePtr);
                }
            }
        }
        std::pop_heap(H.begin(), H.end(), ComparePtr);
        Interval* cur = H.back();
        H.pop_back();
        const float x1 = cur->x1, x2 = cur->x2, y1 = cur->y1, y2 = cur->y2;
        float tNew = Shag(m, x1, x2, y1, y2, (float)dim, r);
        tNew = fminf(fmaxf(tNew, a), b);
        float fNew = evalAt(tNew);
        Interval* left = new Interval(x1, tNew, y1, fNew, (float)dim);
        Interval* right = new Interval(tNew, x2, fNew, y2, (float)dim);
        float Mloc = (std::max)(left->M, right->M);
        if (adaptive) {
            float len1 = tNew - x1, len2 = x2 - tNew;
            if (len1 + len2 == dmax) {
                dmax = (std::max)(len1, len2);
                for (auto pI : H) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; }
            }
            if ((thr03 > dmax && !(it % 3)) || (10.0f * dmax < initial_len)) {
                if (Mloc > Mmax) { Mmax = Mloc; m = r * Mmax; }
                const float progress = fmaf(-dmax, inv_thr03, 1.0f);
                const float alpha = progress * progress;
                const float beta = fmaf(-alpha, 1.0f, 2.0f);
                const float MULT = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULT, r, -MULT);
                const float GF = fmaf(beta, global_coeff, 0.0f);
                left->ChangeCharacteristic(fmaf(GF, len1, left->M * alpha));
                right->ChangeCharacteristic(fmaf(GF, len2, right->M * alpha));
                size_t sz = H.size();
                RecomputeR_AffineM_AVX2(H.data(), sz, GF, alpha);
                std::make_heap(H.begin(), H.end(), ComparePtr);
            } else {
                if (Mloc > Mmax) {
                    if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); }
                    else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); }
                } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); }
            }
        } else {
            if (Mloc > Mmax) {
                if (Mloc < 1.15f * Mmax) { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); }
                else { Mmax = Mloc; m = r * Mmax; left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); size_t sz = H.size(); RecomputeR_ConstM_AVX2(H.data(), sz, m); std::make_heap(H.begin(), H.end(), ComparePtr); }
            } else { left->ChangeCharacteristic(m); right->ChangeCharacteristic(m); }
        }
        H.push_back(left); std::push_heap(H.begin(), H.end(), ComparePtr);
        H.push_back(right); std::push_heap(H.begin(), H.end(), ComparePtr);
        Interval* top = H.front();
        float interval_len = top->x2 - top->x1;
        const int T = (int)fmaf(-expm1f(p), 264.0f, 277.0f);
        bool want_term = (powf(interval_len, 1.0f / (float)dim) < eps) || (it == maxIter - 1);
        if (!(it % T) || want_term) {
            CtrlMsg out; out.kind = want_term;
            out.xchg.s_x1 = top->x1; out.xchg.e_x1 = top->x2; out.xchg.s_x2 = 0.0f; out.xchg.e_x2 = 0.0f; out.xchg.Rtop = top->R;
            for (int i = 0; i < world; ++i) { if (i != rank) g_world->isend(i, 0, out); }
            if (want_term) break;
        }
        ++it;
    }
}

struct BestPacket { float bestF; int dim; float bestX; float bestY; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar & bestF & dim & bestX & bestY; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_Manip2D(int nSegments, bool variableLengths, float minTheta,
    float targetX, float targetY, int peanoLevels, int maxIterPerBranch,
    float r, bool adaptiveMode, float epsilon, unsigned int seed,
    float** out_bestQ, size_t* out_bestQLen,
    float* out_bestX, float* out_bestY, float* out_bestF) {
    const int dim = nSegments + (variableLengths ? nSegments : 0);
    const float thetaMin = -3.14159265359f;
    const float thetaMax = 3.14159265359f;
    const float lenMin = 0.1f;
    const float lenMax = 2.0f;
    std::vector<float> low, high;
    low.reserve(dim); high.reserve(dim);
    for (int i = 0; i < nSegments; ++i) { low.push_back(thetaMin); high.push_back(thetaMax); }
    if (variableLengths) { for (int i = 0; i < nSegments; ++i) { low.push_back(lenMin); high.push_back(lenMax); } }
    ManipCost cost(nSegments, variableLengths, targetX, targetY, minTheta);
    const int rank = g_world->rank();
    const int world = g_world->size();
    MortonND map(dim, peanoLevels, low.data(), high.data(), g_mc);
    std::vector<Interval*> H;
    std::vector<float> bestQ;
    float bestF = FLT_MAX, bx = 0.0f, by = 0.0f;
    H.reserve((size_t)maxIterPerBranch + 256);
    bestQ.reserve(dim);
    agp_run_branch_mpi(map, cost, maxIterPerBranch, r, adaptiveMode, epsilon, seed, H, bestQ, bestF, bx, by);
    BestPacket me{ bestF, dim, bx, by };
    if (!rank) {
        std::vector<float> winnerQ = bestQ;
        float winF = bestF, wx = bx, wy = by;
        for (int i = 1; i < world; ++i) {
            BestPacket bp; g_world->recv(i, 2, bp);
            std::vector<float> qin; g_world->recv(i, 3, qin);
            if (bp.bestF < winF) { winF = bp.bestF; wx = bp.bestX; wy = bp.bestY; winnerQ = qin; }
        }
        *out_bestQLen = winnerQ.size();
        *out_bestQ = (float*)CoTaskMemAlloc(sizeof(float) * (*out_bestQLen));
        std::memcpy(*out_bestQ, winnerQ.data(), sizeof(float) * (*out_bestQLen));
        *out_bestX = wx; *out_bestY = wy; *out_bestF = winF;
    } else {
        g_world->send(0, 2, me);
        g_world->send(0, 3, bestQ);
    }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(int peanoLevel, float a, float b, float c, float d) {
    g_env = new boost::mpi::environment();
    g_world = new boost::mpi::communicator();
    const int rank = g_world->rank();
    const int world_size = g_world->size();
    if (world_size == 4) {
        switch (rank) {
            case 0:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Top);  break;
            case 1:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Down); break;
            case 2:  new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Left); break;
            default: new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, (uint8_t)Right);
        }
    }
    const int MaxDim = 16;
    g_mc.baseSeed = 0x9E3779B9u * (rank + 1);
    g_mc.permCache.resize(MaxDim);
    for (int i = 0; i < MaxDim; ++i) g_mc.permCache[i] = i;
    g_mc.invMaskCache.resize(MaxDim);
    for (int k = 0; k < MaxDim; ++k) g_mc.invMaskCache[k] = 0ull;
    g_mc.levelMasks.resize(13);
    for (int level = 0; level <= 12; ++level) g_mc.levelMasks[level] = (level == 0) ? 0ull : (1ull << level) - 1ull;
    return rank;
}

__forceinline float ShekelFunc(float x, float seed) {
    int i = 0; float st = seed, r1, r2, res = 0.0f;
    while (i < 10) {
        XOR_RAND(st, r1);
        float xp = fmaf(-r1, 10.0f, x);
        XOR_RAND(st, r1);
        XOR_RAND(st, r2);
        float d = fmaf(fmaf(r1, 20.0f, 5.0f), xp * xp, fmaf(r2, 0.2f, 1.0f));
        d = copysignf(fmaxf(fabsf(d), FLT_MIN), d);
        res -= 1.0f / d;
        ++i;
    }
    return res;
}

__forceinline float RastriginFunc(float x1, float x2) {
    const float t = fmaf(x1, x1, x2 * x2);
    float c1, c2;
    FABE13_COS(6.28318530717958647692f * x1, c1);
    FABE13_COS(6.28318530717958647692f * x2, c2);
    return (t - fmaf(c1 + c2, 10.0f, -14.6f)) * fmaf(-t, 0.25f, 18.42f);
}

__forceinline float HillFunc(float x, float seed) {
    int j = 0;
    __declspec(align(32)) float ang[14u];
    float st = 6.28318530717958647692f * x;
    while (j < 14) { ang[j] = st * (float)(j + 1); ++j; }
    __declspec(align(32)) float sv[14u], cv[14u];
    FABE13_SINCOS(ang, sv, cv, 14u);
    float state = seed, r1, r2;
    XOR_RAND(state, r1);
    float res = fmaf(r1, 2.0f, -1.1f);
    --j;
    while (j >= 0) {
        XOR_RAND(state, r1);
        XOR_RAND(state, r2);
        res += fmaf(fmaf(r1, 2.0f, -1.1f), sv[j], fmaf(r2, 2.0f, -1.1f) * cv[j]);
        --j;
    }
    return res;
}

__forceinline float GrishaginFunc(float x1, float x2, float seed) {
    int j = 0;
    __declspec(align(32)) float aj[8u], ak[8u];
    while (j < 8) { float pj = 3.14159265358979323846f * (float)(j + 1); aj[j] = pj * x1; ak[j] = pj * x2; ++j; }
    __declspec(align(32)) float sj[8u], cj[8u], sk[8u], ck[8u];
    FABE13_SINCOS(aj, sj, cj, 8u);
    FABE13_SINCOS(ak, sk, ck, 8u);
    --j;
    float p1 = 0.0f, p2 = 0.0f;
    float st = seed, r1, r2;
    while (j >= 0) {
        size_t k = 0u;
        while (k < 8u) {
            float s = sj[j] * sj[j];
            float c = ck[k] * ck[k];
            XOR_RAND_GRSH(st, r1);
            XOR_RAND_GRSH(st, r2);
            p1 = fmaf(r1, s, fmaf(r2, c, p1));
            XOR_RAND_GRSH(st, r1);
            XOR_RAND_GRSH(st, r2);
            p2 = fmaf(-r1, c, fmaf(r2, s, p2));
            ++k;
        }
        --j;
    }
    return -sqrtf(fmaf(p1, p1, p2 * p2));
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(float global_iterations, float a, float b, float r, bool mode, float epsilon, float seed, float** out_data, size_t* out_len) {
    Slab* slab = tls.local();
    slab->current = slab->base;
    int schetchick = 0;
    const float initial_length = b - a;
    float dmax = initial_length;
    const float threshold_03 = 0.3f * initial_length;
    const float inv_threshold_03 = 1.0f / threshold_03;
    const float start_val = ShekelFunc(a, seed);
    float best_f = ShekelFunc(b, seed);
    float x_Rmax_1 = a;
    float x_Rmax_2 = b;
    float y_Rmax_1 = start_val;
    float y_Rmax_2 = best_f;
    std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
    std::vector<Interval*, boost::alignment::aligned_allocator<Interval*, 64u>> R;
    Extr.reserve((size_t)global_iterations << 2u);
    R.reserve((size_t)global_iterations << 1u);
    R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));
    float Mmax = R.front()->M;
    float m = r * Mmax;
    while (true) {
        const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
        const float new_value = ShekelFunc(new_point, seed);
        if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); }
        std::pop_heap(R.begin(), R.end(), ComparePtr);
        const Interval* pro = R.back();
        const float new_x1 = pro->x1;
        const float new_x2 = pro->x2;
        const float len2 = new_x2 - new_point;
        const float len1 = new_point - new_x1;
        const float interval_len = len1 < len2 ? len1 : len2;
        if (++schetchick == (int)global_iterations || interval_len < epsilon) {
            Extr.emplace_back((float)schetchick);
            Extr.emplace_back(interval_len);
            *out_len = Extr.size();
            *out_data = (float*)CoTaskMemAlloc(sizeof(float) * (*out_len));
            memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
            return;
        }
        Interval* curr = new Interval(new_x1, new_point, pro->y1, new_value, 1.0f);
        Interval* curr1 = new Interval(new_point, new_x2, new_value, pro->y2, 1.0f);
        const float currM = curr->M > curr1->M ? curr->M : curr1->M;
        const size_t r_size = R.size();
        if (mode) {
            if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; for (auto p : R) { float L = p->x2 - p->x1; if (L > dmax) dmax = L; } }
            if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
                if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
                const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
                const float alpha = progress * progress;
                const float betta = 2.0f - alpha;
                const float MULT = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULT, r, -MULT);
                const float GF = betta * global_coeff;
                curr->ChangeCharacteristic(fmaf(GF, len1, curr->M * alpha));
                curr1->ChangeCharacteristic(fmaf(GF, len2, curr1->M * alpha));
                RecomputeR_AffineM_AVX2(R.data(), r_size, GF, alpha);
                std::make_heap(R.begin(), R.end(), ComparePtr);
            } else {
                if (currM > Mmax) {
                    if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
                    else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
                } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
            }
        } else {
            if (currM > Mmax) {
                if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
                else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
            } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
        }
        R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
        R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
        const Interval* top = R.front();
        x_Rmax_1 = top->x1; x_Rmax_2 = top->x2; y_Rmax_1 = top->y1; y_Rmax_2 = top->y2;
    }
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_2D(
    const float N,
    const float global_iterations,
    const float a,
    const float b,
    const float c,
    const float d,
    const float r,
    const bool mode,
    const float epsilon,
    const float seed,
    float** const __restrict out_data,
    size_t* const __restrict out_len) noexcept
{
    Slab* const __restrict slab = tls.local();
    slab->current = slab->base;
    int schetchick = 0;
    int no_improve = 0;
    const int rank = g_world->rank();
    const int world_size = g_world->size();
    while (g_world->iprobe(boost::mpi::any_source, 0)) { CtrlMsg dummy; g_world->recv(boost::mpi::any_source, 0, dummy); }
    const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1));
    const float x_addition = (b - a) * inv_divider;
    const float y_addition = (d - c) * inv_divider;
    const float true_start = a + x_addition;
    const float true_end = b - x_addition;
    float x_Rmax_1 = true_start;
    float x_Rmax_2 = true_end;
    const float initial_length = x_Rmax_2 - x_Rmax_1;
    float dmax = initial_length;
    const float threshold_03 = 0.3f * initial_length;
    const float inv_threshold_03 = 1.0f / threshold_03;
    const float start_val = rank % 3 ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition);
    float best_f = rank % 2 ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition);
    float y_Rmax_1 = start_val;
    float y_Rmax_2 = best_f;
    std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr;
    std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R;
    Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
    R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u);
    R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));
    const Interval* __restrict top_ptr;
    float Mmax = R.front()->M;
    float m = r * Mmax;
    while (true) {
        const float interval_len = x_Rmax_2 - x_Rmax_1;
        const bool stagnation = no_improve > 100 && schetchick > 270;
        const float p = fmaf(-1.0f / initial_length, dmax, 1.0f);
        while (g_world->iprobe(boost::mpi::any_source, 0)) {
            CtrlMsg in; g_world->recv(boost::mpi::any_source, 0, in);
            if (in.kind) {
                if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); }
                return;
            }
            const float sx = FindX2D_analytic(in.xchg.s_x1, in.xchg.s_x2);
            const float ex = FindX2D_analytic(in.xchg.e_x1, in.xchg.e_x2);
            Interval* const __restrict injected = new Interval(sx, ex, RastriginFunc(in.xchg.s_x1, in.xchg.s_x2), RastriginFunc(in.xchg.e_x1, in.xchg.e_x2), 2.0f);
            injected->ChangeCharacteristic(m);
            if (injected->R > 1.15f * top_ptr->R) {
                const float k = stagnation ? fmaf(0.5819767068693265f, expm1f(p), 0.3f) : fmaf(0.3491860241215959f, expm1f(p), 0.6f);
                injected->R = in.xchg.Rtop * k;
                R.emplace_back(injected);
                std::push_heap(R.begin(), R.end(), ComparePtr);
            }
        }
        const int T = static_cast<int>(fmaf(-expm1f(p), 264.0f, 277.0f));
        const bool want_term = interval_len < epsilon || schetchick == static_cast<int>(global_iterations);
        if (!(++schetchick % T) || stagnation || want_term) {
            CtrlMsg out; out.kind = want_term;
            if (!out.kind) {
                float s_x1, s_x2, e_x1, e_x2;
                HitTest2D_analytic(top_ptr->x1, s_x1, s_x2);
                HitTest2D_analytic(top_ptr->x2, e_x1, e_x2);
                out.xchg = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R };
            }
            for (int i = 0; i < world_size; ++i) { if (i != rank) g_world->isend(i, 0, out); }
            if (out.kind) {
                if (!rank) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); }
                return;
            }
        }
        const float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
        float new_x1_val, new_x2_val;
        HitTest2D_analytic(new_point, new_x1_val, new_x2_val);
        const float new_value = RastriginFunc(new_x1_val, new_x2_val);
        if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); no_improve = 0; } else { ++no_improve; }
        std::pop_heap(R.begin(), R.end(), ComparePtr);
        Interval* const __restrict promejutochny_otrezok = R.back();
        const float segment_x1 = promejutochny_otrezok->x1;
        const float segment_x2 = promejutochny_otrezok->x2;
        const float len2 = segment_x2 - new_point;
        const float len1 = new_point - segment_x1;
        Interval* const __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
        Interval* const __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
        const float currM = (std::max)(curr->M, curr1->M);
        const size_t r_size = R.size();
        if (mode) {
            if (len2 + len1 == dmax) { dmax = (std::max)(len1, len2); for (auto pI : R) { float L = pI->x2 - pI->x1; if (L > dmax) dmax = L; } }
            if (threshold_03 > dmax && !(schetchick % 3) || 10.0f * dmax < initial_length) {
                if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
                const float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
                const float alpha = progress * progress;
                const float betta = 2.0f - alpha;
                const float MULTIPLIER = (1.0f / dmax) * Mmax;
                const float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
                const float GLOBAL_FACTOR = betta * global_coeff;
                curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
                curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
                RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
                std::make_heap(R.begin(), R.end(), ComparePtr);
            } else {
                if (currM > Mmax) {
                    if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
                    else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
                } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
            }
        } else {
            if (currM > Mmax) {
                if (currM < 1.15f * Mmax) { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
                else { Mmax = currM; m = r * Mmax; curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); RecomputeR_ConstM_AVX2(R.data(), r_size, m); std::make_heap(R.begin(), R.end(), ComparePtr); }
            } else { curr->ChangeCharacteristic(m); curr1->ChangeCharacteristic(m); }
        }
        R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr);
        R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr);
        top_ptr = R.front();
        x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2;
    }
}

struct RunParams { int nSegments; uint8_t varLen; float minTheta; float tx, ty; int levels, maxIter; float r; uint8_t adaptive; float eps; unsigned int seed; template<typename Archive> void serialize(Archive& ar, const unsigned int) { ar & nSegments & varLen & minTheta & tx & ty & levels & maxIter & r & adaptive & eps & seed; } };

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartManipND(int nSegments, bool variableLengths, float minTheta, float targetX, float targetY, int peanoLevels, int maxIterPerBranch, float r, bool adaptiveMode, float epsilon, unsigned int seed) {
    RunParams p; p.nSegments = nSegments; p.varLen = (uint8_t)variableLengths; p.minTheta = minTheta; p.tx = targetX; p.ty = targetY; p.levels = peanoLevels; p.maxIter = maxIterPerBranch; p.r = r; p.adaptive = (uint8_t)adaptiveMode; p.eps = epsilon; p.seed = seed;
    const int rank = g_world->rank(); const int world = g_world->size();
    for (int i = 1; i < world; ++i) { if (i != rank) g_world->isend(i, 1, p); }
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() {
    while (true) {
        if (g_world->iprobe(0, 1)) {
            RunParams p; g_world->recv(0, 1, p);
            float* q = nullptr; size_t qlen = 0; float bx = 0, by = 0, bf = 0;
            AGP_Manip2D(p.nSegments, (bool)p.varLen, p.minTheta, p.tx, p.ty, p.levels, p.maxIter, p.r, (bool)p.adaptive, p.eps, p.seed, &q, &qlen, &bx, &by, &bf);
            if (q) CoTaskMemFree(q);
        }
        Sleep(0);
    }
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AgpStartWorkers() { int i = 1; const int world = g_world->size(); while (i < world) { g_world->isend(i, 1, 0); ++i; } }

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_Free(float* p) { CoTaskMemFree(p); }

#include "pch.h" #define XOR_RAND(state, result_v...

Вопрос

Мысль

Planning code modifications

Clarifying Shag parameters

Clarifying the Shag function details

Heap behavior and inlining thoughts

Optimizing data structures

Optimizing calculations

Ответ

Поделиться этим Q&A