#include "pch.h" #define XOR_RAND(state, result_v...

Question

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while (0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while (0)

#define FABE13_COS(x, result_var)
do {
float abs_val = fabsf(x);
float reduced = fmodf(abs_val, 6.28318530718f);
if (reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if (reduced < 1.57079632679f) {
float val2 = reduced * reduced;
result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
reduced = 3.14159265359f - reduced;
float val2 = reduced * reduced;
result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
} while (0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
int limit = n & ~7;
if (n >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
__m256 vx = _mm256_load_ps(&in[i]);
__m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
__m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
__m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
__m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
__m256 t2 = _mm256_mul_ps(r2, r2);
__m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);
__m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);
__m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
__m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ),
_mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip);
sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip);
_mm256_store_ps(&cos_out[i], cosv);
_mm256_store_ps(&sin_out[i], sinv);
i += 8;
}
}
while (i < n) {
float x = in[i];
float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
bool sflip = (r > 3.14159265359f);
if (sflip) r = 6.28318530718f - r;
bool cflip = (r > 1.57079632679f);
if (cflip) r = 3.14159265359f - r;
float t2 = r * r;
float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);
float s = fmaf(t2, -0.16666666f, 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)

static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
int i = 0;
float current_state = seed, current_res, current_res2, res = 0.0f;
while (i < 10) {
XOR_RAND(current_state, current_res);
const float x_part = fmaf(-current_res, 10.0f, x);
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f),
x_part * x_part,
fmaf(current_res2, 0.2f, 1.0f));
delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
res -= 1.0f / delimiter;
++i;
}
return res;
}

static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int j = 0;
static __declspec(align(32)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * (j + 1);
++j;
}
static __declspec(align(32)) float sin_vals[14u];
static __declspec(align(32)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);
float current_state = seed, current_res, current_res2;
XOR_RAND(current_state, current_res);
float res = fmaf(current_res, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
res += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j],
fmaf(current_res, 2.0f, -1.1f) * cos_vals[j]);
--j;
}
return res;
}

static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int j = 0;
static __declspec(align(32)) float angles_j[8u];
static __declspec(align(32)) float angles_k[8u];
#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * (j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}
static __declspec(align(32)) float sin_j[8u], cos_j[8u];
static __declspec(align(32)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);
--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed, current_res, current_res2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
const float sin_term = sin_j[j] * sin_j[j];
const float cos_term = cos_k[k] * cos_k[k];
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part1 = fmaf(current_res, sin_term,
fmaf(current_res2, cos_term, part1));
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part2 = fmaf(-current_res, cos_term,
fmaf(current_res2, sin_term, part2));
++k;
}
--j;
}
return -sqrtf(fmaf(part1, part1, part2 * part2));
}

static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const float _N, const float _r) noexcept
{
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(
0x3F800000 | ((((const int)&diff) & 0x80000000) ^ 0x80000000)
)));
return _N == 1.0f
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2.0f
? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f
: fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}

__declspec(align(64)) struct PreallocatedPeanoMemoryManager final {
__declspec(align(64)) struct MemoryBlock final {
char* __restrict data;
char* __restrict current;
const char* __restrict end;

text
	__declspec(noalias) __forceinline MemoryBlock() noexcept
		: data(nullptr), current(nullptr), end(nullptr)
	{
	}

	const __declspec(noalias) __forceinline void initialize(const size_t bytes) noexcept
	{
		const size_t aligned_need = (bytes + 1048638) & ~(1048575);
		void* p = VirtualAlloc(nullptr, aligned_need,
			MEM_RESERVE | MEM_COMMIT,
			PAGE_READWRITE);

		data = reinterpret_cast<char* __restrict>(p);
		const uintptr_t base = reinterpret_cast<uintptr_t>(data);
		const uintptr_t aligned = (base + 63) & ~63;
		current = reinterpret_cast<char* __restrict>(aligned);
		end = reinterpret_cast<char*>(current + bytes);
	}

	__declspec(noalias) __forceinline void* allocate_from_block(const size_t alloc_size) noexcept
	{
		const size_t n = (alloc_size + 63) & ~63;
		char* __restrict result = current;
		current += n;
		return result;
	}
};

tbb::enumerable_thread_specific<MemoryBlock> tls;

const __declspec(noalias) __forceinline size_t calculate_tree_size_bytes(const int max_depth) const noexcept
{
	static const size_t bytes[] = {
			1398080u, 5592384u, 22369600u, 89478464u, 357913920u, 1431655744u, 5726623040u
	};
	return bytes[max_depth - 7u];
}

const __declspec(noalias) __forceinline void preallocate_for_depth(const int max_depth) noexcept
{
	const size_t required_size = calculate_tree_size_bytes(max_depth);
	tls.clear();
	tls = tbb::enumerable_thread_specific<MemoryBlock>(
		[required_size]() {
			MemoryBlock b;
			b.initialize(required_size);
			return b;
		}
	);
}

__declspec(noalias) __forceinline void* allocate(const size_t size) noexcept
{
	return tls.local().allocate_from_block(size);
}

};

static PreallocatedPeanoMemoryManager peano_memory_manager;

__declspec(align(16)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float N_factor;
const float M;
float R;

text
__declspec(noalias) __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); }

__declspec(noalias) __forceinline Interval(const float _x1, const float _x2,
	const float _y1, const float _y2, const float _N) noexcept
	: x1(_x1), x2(_x2), y1(_y1), y2(_y2),
	delta_y(_y2 - _y1),
	N_factor(_N == 1.0f ? _x2 - _x1 :
		_N == 2.0f ? sqrtf(_x2 - _x1) :
		powf(_x2 - _x1, 1.0f / _N)),
	M(fabsf(delta_y)* (1.0f / N_factor))
{
}

__declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
{
	R = fmaf(
		-(y2 + y1),
		2.0f,
		fmaf(_m, N_factor, (delta_y * delta_y) * (1.0f / (_m * N_factor))));
}

};

static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
return a->R < b->R;
}

const enum List : uint8_t {
Top = 0b00,
Down = 0b01,
Left = 0b10,
Right = 0b11
};

__declspec(align(16)) struct PeanoCurve_2D final {
const List Type;
const int razvertka;
const float a;
const float b;
const float c;
const float d;
const float x1;
const float x2;
PeanoCurve_2D* __restrict DownLeft;
PeanoCurve_2D* __restrict TopLeft;
PeanoCurve_2D* __restrict TopRight;
PeanoCurve_2D* __restrict DownRight;

text
__declspec(noalias) __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); }

__declspec(noalias) __forceinline PeanoCurve_2D(
	const List _Type,
	int _razvertka,
	const float _a,
	const float _b,
	const float _c,
	const float _d) noexcept
	: Type(_Type),
	razvertka(_razvertka),
	a(_a),
	b(_b),
	c(_c),
	d(_d),
	x1(fmaf(0.5f, _a, 0.5f * _b)),
	x2(fmaf(0.5f, _c, 0.5f * _d))
{
	if (_razvertka-- != 0) {
		if (_razvertka > 8) {
			switch (_Type) {
			case Top:
				tbb::parallel_invoke(
					[&]() { DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2); },
					[&]() { TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d); },
					[&]() { TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d); },
					[&]() { DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2); }
				);
				break;
			case Down:
				tbb::parallel_invoke(
					[&]() { TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d); },
					[&]() { DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2); },
					[&]() { DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2); },
					[&]() { TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d); }
				);
				break;
			case Right:
				tbb::parallel_invoke(
					[&]() { DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2); },
					[&]() { DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2); },
					[&]() { TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d); },
					[&]() { TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d); }
				);
				break;
			default:
				tbb::parallel_invoke(
					[&]() { TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d); },
					[&]() { TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d); },
					[&]() { DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2); },
					[&]() { DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2); }
				);
			}
		}
		else {
			switch (_Type) {
			case Top:
				DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2);
				TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d);
				TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d);
				DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2);
				break;
			case Down:
				TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d);
				DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2);
				DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2);
				TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d);
				break;
			case Right:
				DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2);
				DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2);
				TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d);
				TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d);
				break;
			default:
				TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d);
				TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d);
				DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2);
				DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2);
			}
		}
	}
}

const __declspec(noalias) __forceinline PeanoCurve_2D* __restrict HitTest_2D(float x) const noexcept
{
	int i = 0;
	const int _razvertka = this->razvertka;
	int num;
	const float this_a = this->a;
	x -= this_a;
	const float b_minus_a = this->b - this_a;
	const float inv_b_minus_a = 1.0f / b_minus_a;
	const PeanoCurve_2D* __restrict Curr = this;
	while (i != _razvertka) {
		const int shift = 1 << ++i + i;
		num = shift * x * inv_b_minus_a;
		x = fmaf(-ldexp(1.0f, -(i << 1)) * num, b_minus_a, x);
		const List currType = Curr->Type;
		switch (num) {
		case 0:
			Curr = (currType == Top || currType == Right)
				? Curr->DownLeft
				: Curr->TopRight;
			break;
		case 1:
			Curr = (currType == Top || currType == Left)
				? Curr->TopLeft
				: Curr->DownRight;
			break;
		case 2:
			Curr = (currType == Top || currType == Right)
				? Curr->TopRight
				: Curr->DownLeft;
			break;
		default:
			Curr = (currType == Top || currType == Left)
				? Curr->DownRight
				: Curr->TopLeft;
		}
	}
	return const_cast<PeanoCurve_2D * __restrict>(Curr);
}

const __declspec(noalias) __forceinline float FindX_2D(const float target_x1, const float target_x2) const noexcept
{
	int _razvertka = this->razvertka;
	int _razvertka1 = _razvertka;
	float x1, x2, x = this->a;
	const float b_minus_a = this->b - x;
	const PeanoCurve_2D* __restrict Curr = this;
	while (_razvertka != 0) {
		const int exponent = _razvertka1 - _razvertka-- << 1;
		x1 = Curr->x1;
		x2 = Curr->x2;
		const List currType = Curr->Type;
		if (target_x1 > x1 && target_x2 > x2) {
			Curr = Curr->TopRight;
			if (currType == Top || currType == Right) {
				x = fmaf(ldexpf(1.0f, -(exponent + 1)), b_minus_a, x);
			}
		}
		else if (target_x1 < x1 && target_x2 > x2) {
			Curr = Curr->TopLeft;
			if (currType == Top || currType == Left) {
				x = fmaf(ldexpf(1.0f, -(exponent + 2)), b_minus_a, x);
			}
			else {
				x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
			}
		}
		else if (target_x1 < x1 && target_x2 < x2) {
			Curr = Curr->DownLeft;
			if (currType == Down || currType == Left) {
				x = fmaf(ldexpf(1.0f, -(exponent + 1)), b_minus_a, x);
			}
		}
		else {
			Curr = Curr->DownRight;
			if (currType == Top || currType == Left) {
				x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x);
			}
			else {
				x = fmaf(ldexpf(1.0f, -(exponent + 2)), b_minus_a, x);
			}
		}
	}
	return x;
}

};

static std::vector<float, tbb::scalable_allocator<float>> Extr;
static std::vector<Interval*, tbb::scalable_allocator<Interval*>> R;
static boost::mpi::environment* __restrict g_env;
static boost::mpi::communicator* __restrict g_world;
static boost::optionalboost::mpi::status probe_status;
static std::aligned_storage_t<sizeof(PeanoCurve_2D), alignof(PeanoCurve_2D)> curveStorage;
static PeanoCurve_2D& __restrict Curve = reinterpret_cast<PeanoCurve_2D & __restrict>(curveStorage);

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
peano_memory_manager.preallocate_for_depth(peanoLevel);
g_env = ::new boost::mpi::environment();
g_world = ::new boost::mpi::communicator();
int rank = g_world->rank();
if (rank) {
::new (&Curve) PeanoCurve_2D(List::Down, peanoLevel, a, b, c, d);
}
else {
::new (&Curve) PeanoCurve_2D(List::Top, peanoLevel, a, b, c, d);
}
return rank;
}

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& __restrict const ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations,
const float a, const float b, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;

text
Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
Extr.clear();
R.reserve(static_cast<size_t>(global_iterations) << 1u);
R.clear();
R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f));

float Mmax = R.front()->M;
float m = r * Mmax;

while (true) {
	float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r);
	float new_value = ShekelFunc(new_point, seed);

	if (new_value < best_f) {
		best_f = new_value;
		Extr.emplace_back(best_f);
		Extr.emplace_back(new_point);
	}

	std::pop_heap(R.begin(), R.end(), ComparePtr);
	Interval* __restrict promejutochny_otrezok = R.back();
	R.pop_back();

	float new_x1 = promejutochny_otrezok->x1;
	float new_x2 = promejutochny_otrezok->x2;
	float len2 = new_x2 - new_point;
	float len1 = new_point - new_x1;
	float interval_len = (len1 < len2) ? len1 : len2;

	if (interval_len < epsilon || ++schetchick == static_cast<int>(global_iterations)) {
		Extr.emplace_back(static_cast<float>(schetchick));
		Extr.emplace_back(interval_len);
		*out_len = Extr.size();
		*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
		memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
		return;
	}

	Interval* __restrict curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, 1.0f);
	Interval* __restrict curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, 1.0f);
	float currM = (curr->M > curr1->M) ? curr->M : curr1->M;
	size_t r_size = R.size();

	if (mode) {
		if ((len2 + len1) == dmax) {
			dmax = (len2 > len1) ? len2 : len1;
			size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
		if (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) {
			if (currM > Mmax) {
				Mmax = currM;
				m = r * Mmax;
			}
			float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
			float alpha = fmaf(progress, progress, 1.0f);
			float betta = 2.0f - alpha;
			float MULTIPLIER = (1.0f / dmax) * Mmax;
			float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
			float GLOBAL_FACTOR = betta * global_coeff;

			curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
			curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));

			size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
	R.emplace_back(curr);
	std::push_heap(R.begin(), R.end(), ComparePtr);
	R.emplace_back(curr1);
	std::push_heap(R.begin(), R.end(), ComparePtr);

	Interval* __restrict top_ptr = R.front();
	x_Rmax_1 = top_ptr->x1;
	x_Rmax_2 = top_ptr->x2;
	y_Rmax_1 = top_ptr->y1;
	y_Rmax_2 = top_ptr->y2;
}

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_ND(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;

text
if (N == 2.0f) {
	const float inv_divider = ldexpf(1.0f, -((Curve.razvertka << 1) + 1));
	const float x_addition = (b - a) * inv_divider;
	const float y_addition = (d - c) * inv_divider;
	const float true_start = a + x_addition;
	const float true_end = b - x_addition;

	float x_Rmax_1 = true_start;
	float x_Rmax_2 = true_end;
	const float initial_length = true_end - true_start;
	float dmax = initial_length;
	const float threshold_03 = 0.3f * initial_length;
	const float inv_threshold_03 = 1.0f / threshold_03;
	const float start_val = rank ? RastriginFunc(true_end, d - y_addition)
		: RastriginFunc(true_start, c + y_addition);
	float y_Rmax_1 = start_val;
	float best_f = rank ? RastriginFunc(true_start, d - y_addition)
		: RastriginFunc(true_end, c + y_addition);
	float y_Rmax_2 = best_f;

	Extr.reserve(static_cast<size_t>(global_iterations) << 2u);
	Extr.clear();
	R.reserve(static_cast<size_t>(global_iterations) << 1u);
	R.clear();
	R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f));

	float Mmax = R.front()->M;
	float m = r * Mmax;
	int schetchick = 0;
	float interval_len = 0.0f;

	while (true) {
		probe_status = g_world->iprobe(1, 0);
		if (probe_status) {
			Extr.emplace_back(static_cast<float>(schetchick));
			Extr.emplace_back(interval_len);
			*out_len = Extr.size();
			*out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
			memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
			int dummy;
			g_world->recv(1, 0, dummy);
			return;
		}

		float cooling = ldexpf(1.0f, -(1.0f / 138.63f) * ++schetchick);
		int T = static_cast<int>(fmaf(20.0f, cooling, 10.0f));
		float k = fmaf(0.2f, cooling, 0.7f);

		Interval* __restrict top_ptr = R.front();
		if (schetchick % T == 0) {
			const PeanoCurve_2D* __restrict p1L = Curve.HitTest_2D(top_ptr->x1);
			const PeanoCurve_2D* __restrict p2L = Curve.HitTest_2D(top_ptr->x2);
			CrossMsg outbound = CrossMsg{ p1L->x1, p1L->x2, p2L->x1, p2L->x2, top_ptr->R };
			CrossMsg inbound;
			g_world->sendrecv(partner, 0, outbound, partner, 0, inbound);
			Interval* __restrict injected = new Interval(Curve.FindX_2D(inbound.s_x1, inbound.s_x2), Curve.FindX_2D(inbound.e_x1, inbound.e_x2), RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), 2.0f);
			injected->R = inbound.Rtop * k;
			R.emplace_back(injected);
			std::push_heap(R.begin(), R.end(), ComparePtr);
		}

		float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r);
		const PeanoCurve_2D* __restrict pc = Curve.HitTest_2D(new_point);
		float new_x1_val = pc->x1;
		float new_x2_val = pc->x2;
		float new_value = RastriginFunc(new_x1_val, new_x2_val);

		if (new_value < best_f) {
			best_f = new_value;
			Extr.emplace_back(best_f);
			Extr.emplace_back(new_x1_val);
			Extr.emplace_back(new_x2_val);
		}

		std::pop_heap(R.begin(), R.end(), ComparePtr);
		Interval* __restrict promejutochny_otrezok = R.back();
		R.pop_back();

		float segment_x1 = promejutochny_otrezok->x1;
		float segment_x2 = promejutochny_otrezok->x2;
		float len2 = segment_x2 - new_point;
		float len1 = new_point - segment_x1;
		interval_len = (len1 < len2) ? len1 : len2;

		if (interval_len < epsilon || schetchick == static_cast<int>(global_iterations)) {
			g_world->send(0, 0, 0);
			return;
		}

		Interval* __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f);
		Interval* __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f);
		float currM = (curr->M > curr1->M) ? curr->M : curr1->M;
		size_t r_size = R.size();

		if (mode) {
			if ((len2 + len1) == dmax) {
				dmax = (len2 > len1) ? len2 : len1;
				size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
			if (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) {
				if (currM > Mmax) {
					Mmax = currM;
					m = r * Mmax;
				}
				float progress = fmaf(-inv_threshold_03, dmax, 1.0f);
				float alpha = fmaf(progress, progress, 1.0f);
				float betta = 2.0f - alpha;
				float MULTIPLIER = (1.0f / dmax) * Mmax;
				float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
				float GLOBAL_FACTOR = betta * global_coeff;

				curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
				curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));

				size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
		R.emplace_back(curr);
		std::push_heap(R.begin(), R.end(), ComparePtr);
		R.emplace_back(curr1);
		std::push_heap(R.begin(), R.end(), ComparePtr);

		top_ptr = R.front();
		x_Rmax_1 = top_ptr->x1;
		x_Rmax_2 = top_ptr->x2;
		y_Rmax_1 = top_ptr->y1;
		y_Rmax_2 = top_ptr->y2;
	}
}
else {
}

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int dummy;
float* __restrict buf;
size_t len;
while (true) {
probe_status = g_world->iprobe(0, 0);
if (probe_status) {
g_world->recv(0, 0, dummy);
AGP_ND(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
2.5f, false, 0.001f, GetTickCount(),
&buf, &len);
}
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->isend(1, 0, 0);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_ND_Free(float* __restrict p) noexcept
{
CoTaskMemFree(p);
} как я могу ещё сильнее увеличить производительность (в первую очередь скорость, требования по читаемости удобности и т д не имеют значения) в частности в двумерной ветке

Answer 1

Классный кусок кода — видно, что вы уже выжали многое (FMA, AVX2 sin/cos, bump-alloc, heap и т. д.). Ниже — приоритетный список именно на скорость, с акцентом на двумерную ветку AGP_ND (N==2). Я даю «большие выигрыши» сверху вниз и сразу показываю, что именно менять.

1) Убрать дерево Пеано из горячего пути

Сейчас HitTest_2D(new_point) идёт по указателям по глубине razvertka, постоянно прыгая по памяти → промахи кэша и ветвления. Для N==2 координаты (x₁, x₂) можно восстанавливать аналитически из параметра вдоль кривой, не строя дерево и не делая pointer-chase.

Идея: представить параметр как число в базе 4 (по 2 бита на уровень), на каждом уровне применять маленькую 2×2 трансформацию (ориентация Top/Down/Left/Right) и накапливать смещения. Это даёт полностью предсказуемый цикл без ветвлений и без кистонов по куче узлов.

Пример быстрой замены HitTest_2D (работает только для N==2; дерево не нужно вообще — хранить только razvertka, [a,b]×[c,d]):

cpp
struct Peano2DMap {
    int levels;
    float a,b,c,d;
};

// табличка: [type][quad] -> next_type и (dx,dy) в {0,1}
struct Step { uint8_t next; uint8_t dx; uint8_t dy; };

static inline Step step_tbl[4][4] = {
/*Top  */ {{Right,0,0},{Top,0,1},{Top,1,1},{Left,1,0}},
/*Down */ {{Left,1,1},{Down,1,0},{Down,0,0},{Right,0,1}},
/*Left */ {{Top,0,1},{Left,0,0},{Left,1,0},{Down,1,1}},
/*Right*/ {{Down,1,0},{Right,1,1},{Right,0,1},{Top,0,0}},
};

__forceinline void HitTest2D_analytic(const Peano2DMap& M, float x_param,
                                      float &out_x1, float &out_x2) noexcept
{
    // нормируем параметр в [0,1), квантируем в 4^L ячеек
    float lenx = M.b - M.a;
    float leny = M.d - M.c;
    // избегаем деления в цикле
    float norm = (x_param - M.a) * (1.0f / lenx);
    uint32_t idx = (uint32_t) (norm * (float)(1u << (M.levels*2)));

    // размер клетки на текущем уровне
    float sx = lenx;
    float sy = leny;
    float x1 = M.a;
    float x2 = M.c;

    uint8_t type = Top; // как в ранге 0

    // обходим уровни от старших к младшим 2-битным цифрам
    for (int l = M.levels-1; l >= 0; --l) {
        uint32_t q = (idx >> (l*2)) & 3u; // квартал 0..3
        const Step s = step_tbl[type][q];
        type = s.next;

        sx *= 0.5f;
        sy *= 0.5f;
        x1 += s.dx ? sx : 0.0f;
        x2 += s.dy ? sy : 0.0f;
    }
    // центр ячейки (как у вас с x_addition/y_addition)
    out_x1 = x1 + sx*0.5f;
    out_x2 = x2 + sy*0.5f;
}

Замените вызовы Curve.HitTest_2D(new_point) на это.
Плюсы: никаких аллокаций и рекурсий, нулевой branch-miss, линейный проход по регистрами → обычно самый большой буст в 2D-ветке.

Бонус: FindX_2D тоже можно выразить аналитически, инвертируя ту же таблицу (по (x₁,x₂) возвращать idx, затем обратно x в [a,b]). Тогда вообще не нужно хранить Curve.

2) Специализировать горячий цикл под N==2 и mode==false

В AGP_ND большинство условных веток не нужны в реальном запуске:

N всегда 2 → вынесите отдельный AGP_2D_loop_modeFalse(...) без if (N==2.0f) и без ветки mode==true.
Уберите % в schetchick % T == 0 — держите int next_x = schetchick + T; и сравнивайте schetchick == next_x, после чего пересчитывайте T и новый next_x. Это убирает дорогой модуль в горячем пути.
Shag(...) специализируйте для N==2 (и для N==1 если нужно в 1D):

cpp
__forceinline float Shag2(const float m, const float x1, const float x2,
                          const float y1, const float y2, const float r) noexcept
{
    const float diff = y2 - y1;
    const float sign_mult = std::copysignf(1.0f, -diff); // тот же смысл
    // N==2: (x1+x2)/2 + sign*(diff*diff*r)/(2*m*m)
    return 0.5f * (x1 + x2 + sign_mult * (diff*diff * r) / (m*m));
}

И дальше вызовите в 2D-цикле Shag2 вместо общего Shag.

3) Дешёвая переоценка характеристик (R) + векторизация

В ChangeCharacteristic у вас каждый раз:

R = -2*(y2+y1) + m*N_factor + (delta_y^2) / (m*N_factor)

Преобразуйте Interval так, чтобы хранить три константы на весь жизненный цикл:

A = -2*(y2+y1)
B = N_factor
C = (delta_y*delta_y)/N_factor

Тогда:

cpp
__forceinline void ChangeCharacteristic_fast(float m) noexcept {
    R = A + m*B + C*(1.0f/m); // две FMA + один rcp (см. ниже)
}

Дальше две вещи:

Ленивая пересчётка: когда меняется только m, вам не нужно пересчитывать R для каждого интервала сразу. Держите глобальный current_m и «эпоху». В компараторе используйте быструю формулу с текущим m, а полную переупаковку кучи (make_heap) делайте только при больших сдвигах m (вы и так её делаете). Это резко сокращает обход R[i] в циклах «подогрева».
SIMD-батчи: если хотите пересчитывать массово — переведите хранилище интервалов в SOA на время апдейта: отдельные массивы A[], B[], C[], len[], R[]. Тогда апдейт 8 штук за раз:

cpp
// m_v = _mm256_set1_ps(m); invm_v = _mm256_rcp_ps(m_v); (с /fp:fast)
R_v = A_v + m_v*B_v + C_v*invm_v;

Да, для кучи по-прежнему нужны указатели, но сами значения R вы получите векторно без циклов.

4) Снизить churn кучи

Сейчас вы делаете: pop_heap → добавили 2 интервала → push_heap дважды.
Опция быстрее: заменить «верх» на один из новых и вызвать std::push_heap один раз:

cpp
// pop top -> back
std::pop_heap(R.begin(), R.end(), ComparePtr);
// вместо erase: возьмите back как место для одного из новых
Interval* repl = R.back();
*repl = *curr;       // или просто R.back() = curr; если храните объекты
std::push_heap(R.begin(), R.end(), ComparePtr);

// второй новый — обычный push (один раз)
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);

Или написать свой «заменить вершину» (sift-down + один push) — это меньше движений элементов.

5) Коммуникации: убрать `iprobe` на каждом такте и перейти на persistent requests

iprobe перед каждым шагом — это системный вызов и барьер. Сделайте постоянные MPI_Irecv + MPI_Test раз в T итераций (вы и так синхронизируетесь на schetchick % T == 0 / новый next_x). А ещё лучше — если оба ранга на одном узле — просто заменить на 2 потока + MPMC очередь (boost lockfree / folly) и избавиться от MPI в горячем пути (MPI можно оставить только для межузлового режима).

6) Вычисления «соседа» дешевле

В блоке cross-обмена вы вычисляете HitTest, шлёте (x1,x2) и Rtop, а партнёр потом вызывает FindX_2D и пересчитывает Rastrigin. Если перейти на аналитический маппер из п.1, весь этот блок становится чистой арифметикой без дерева; дополнительно можно отправлять индекс ячейки idx (uint32), а не вещественные координаты — это уменьшит упаковку/распаковку и сделает FindX тривиальным.

7) Компилятор/линкер/CPU настройки (реальные проценты)

MSVC: /O2 /GL /Gw /Gy /Ot /Oi /Ob3 /fp:fast /arch:AVX2 (или AVX-512 если есть) + PGO (/GL + /LTCG:PGINSTRUMENT → /LTCG:PGOPTIMIZE).
Clang-cl часто даёт быстрее: -O3 -Ofast -ffast-math -fno-math-errno -freciprocal-math -march=skylake-avx512 (или -mavx2 -mfma).
Зафиксируйте частоты и энергосхему (High Performance), привяжите процессы/потоки к NUMA-нодам и ядрам (SetThreadAffinityMask/start /affinity).
Большие страницы: VirtualAlloc(MEM_LARGE_PAGES, …) (если есть привилегия) — снизит TLB-pressure для вашего большого prealloc.

8) Микро-моменты в 2D-ветке

cooling через ldexpf(…) с нецелым показателем на самом деле квантуется по ~138 шагам (второй аргумент — int). Это дёшево, но знайте поведение. Если хотите плавно — exp2f(-schetchick/138.63f); если хотите ещё быстрее — оставьте как есть (ступени), просто не вызывайте это каждый шаг, а только когда меняете T/k.
RastriginFunc: вы уже используете лёгкий cos; для двух аргументов можно объединить дом. редукцию и 2 полинома в один блок (микроэкономия вызова макроса дважды).
Статические локальные массивы static float angles[14] и пр. в других функциях лучше сделать thread_local (иначе ложное разделение кэша между ранками/потоками). В 2D-ветке это почти не используется, но общая гигиена.
Спины: если оставляете «ожидания» — _mm_pause() в коротких спинах.

9) Что даст больше всего в реальности

Аналитический маппинг Peano вместо дерева — обычно самый большой выигрыш (меньше случайных доступов, меньше веток).
Специализация N==2, mode==false и снятие % — чистый CPU-счёт, почти всегда даёт приятные проценты.
Быстрый пересчёт R (A+B*m+C/m) + SIMD батчи / ленивая стратегия — заметно ускоряет «фазы пересчёта».
MPI → persistent / потоки — убирает системные вызовы из tight-loop.

Если нужно, могу накидать вам drop-in патч: HitTest2D_analytic, Shag2, переписанный цикл AGP_2D_loop_modeFalse и «ленивый» апдейт R (A,B,C).

#include "pch.h" #define XOR_RAND(state, result_v...

Pregunta

Pensamiento

Optimizing performance in C++

Performance optimization recommendations

Optimizing heap operations and characteristic updates

Heap and ChangeCharacteristic optimization

Optimizing the code and hardware usage

Optimizing functions and memory

Optimizing 2D branch and memory

Respuesta

1) Убрать дерево Пеано из горячего пути

2) Специализировать горячий цикл под N==2 и mode==false

3) Дешёвая переоценка характеристик (R) + векторизация

4) Снизить churn кучи

5) Коммуникации: убрать `iprobe` на каждом такте и перейти на persistent requests

6) Вычисления «соседа» дешевле

7) Компилятор/линкер/CPU настройки (реальные проценты)

8) Микро-моменты в 2D-ветке

9) Что даст больше всего в реальности

Compartir este Q&A

Pregunta

Pensamiento

Optimizing performance in C++

Performance optimization recommendations

Optimizing heap operations and characteristic updates

Heap and ChangeCharacteristic optimization

Optimizing the code and hardware usage

Optimizing functions and memory

Optimizing 2D branch and memory

Respuesta

1) Убрать дерево Пеано из горячего пути

2) Специализировать горячий цикл под N==2 и mode==false

3) Дешёвая переоценка характеристик (R) + векторизация

4) Снизить churn кучи

5) Коммуникации: убрать iprobe на каждом такте и перейти на persistent requests

6) Вычисления «соседа» дешевле

7) Компилятор/линкер/CPU настройки (реальные проценты)

8) Микро-моменты в 2D-ветке

9) Что даст больше всего в реальности

Compartir este Q&A

5) Коммуникации: убрать `iprobe` на каждом такте и перейти на persistent requests