#include "pch.h" #define XOR_RAND(state, result_v...

Creado el: 16 de octubre de 2025

Respondido usando GPT-5 Thinking por Chat01

Pregunta

#include "pch.h"

#define XOR_RAND(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while(0)

#define XOR_RAND_GRSH(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while(0)

#define FABE13_COS(x, result_var)
do {
float abs_val = fabsf(x);
float reduced = fmodf(abs_val, 6.28318530718f);
if(reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if(reduced < 1.57079632679f) {
float val2 = reduced * reduced;
result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
reduced = 3.14159265359f - reduced;
float val2 = reduced * reduced;
result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
} while(0)

#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
int limit = n & ~7;
if(n >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while(i < limit) {
__m256 vx = _mm256_load_ps(&in[i]);
__m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
__m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
__m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
__m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
__m256 t2 = _mm256_mul_ps(r2, r2);
__m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);
__m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);
__m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
__m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip);
sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip);
_mm256_store_ps(&cos_out[i], cosv);
_mm256_store_ps(&sin_out[i], sinv);
i += 8;
}
}
while(i < n) {
float x = in[i];
float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
bool sflip = r > 3.14159265359f;
if(sflip) r = 6.28318530718f - r;
bool cflip = r > 1.57079632679f;
if(cflip) r = 3.14159265359f - r;
float t2 = r * r;
float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);
float s = fmaf(t2, -0.16666666f, 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while(0)

static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
int i = 0;
float current_state = seed, current_res, current_res2, res = 0.0f;
while (i < 10) {
XOR_RAND(current_state, current_res);
const float x_part = fmaf(-current_res, 10.0f, x);
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f), x_part * x_part, fmaf(current_res2, 0.2f, 1.0f));
delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
res -= 1.0f / delimiter;
++i;
}
return res;
}

static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}

static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * static_cast<float>(j + 1);
++j;
}
__declspec(align(32)) float sin_vals[14u];
__declspec(align(32)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);
float current_state = seed, current_res, current_res2;
XOR_RAND(current_state, current_res);
float res = fmaf(current_res, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
res += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j], fmaf(current_res, 2.0f, -1.1f) * cos_vals[j]);
--j;
}
return res;
}

static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles_j[8u];
__declspec(align(32)) float angles_k[8u];
#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * static_cast<float>(j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}
__declspec(align(32)) float sin_j[8u], cos_j[8u];
__declspec(align(32)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);
--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed, current_res, current_res2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
const float sin_term = sin_j[j] * sin_j[j];
const float cos_term = cos_k[k] * cos_k[k];
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part1 = fmaf(current_res, sin_term, fmaf(current_res2, cos_term, part1));
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part2 = fmaf(-current_res, cos_term, fmaf(current_res2, sin_term, part2));
++k;
}
--j;
}
return -sqrtf(fmaf(part1, part1, part2 * part2));
}

static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const float _N, const float _r) noexcept
{
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(
0x3F800000u | ((reinterpret_cast<const uint32_t>(&diff) & 0x80000000u) ^ 0x80000000u))));
return _N == 1.0f
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2.0f
? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f
: fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}

__declspec(align(16)) struct Slab final {
char* __restrict base;
char* __restrict current;
char* __restrict end;

text
__declspec(noalias) __forceinline Slab(void* __restrict const memory, const size_t usable_size) noexcept : base(static_cast<char* __restrict>(memory)) , current(base) , end(base + (usable_size & ~static_cast<size_t>(63u))) { }

};

static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* __restrict const memory = _aligned_malloc(16777216u, 16u);
Slab* __restrict const slab = static_cast<Slab*>(_aligned_malloc(32u, 16u));
*slab = Slab(memory, 16777216u);

char* __restrict p = slab->base;

#pragma loop(ivdep)
while (p < slab->end) {
*p = 0u;
p += 4096u;
}
return slab;
}());

__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;

text
__declspec(noalias) __forceinline void* operator new(const size_t) noexcept { Slab* __restrict s = tls.local(); char* __restrict result = s->current; s->current += 64u; return result; } __declspec(noalias) __forceinline Interval(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2) , delta_y(_y2 - _y1) , ordinate_factor(-(y1 + y2) * 2.0f) , N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)) , quadratic_term((1.0f / N_factor)* delta_y* delta_y) , M((1.0f / N_factor)* fabsf(delta_y)) { } __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }

};

static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
return a->R < b->R;
}

const enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };

__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;

text
__declspec(noalias) __forceinline Peano2DMap( const int L, const float _a, const float _b, const float _c, const float _d, const uint8_t startType ) noexcept : levels(L) , a(_a), b(_b), c(_c), d(_d) , lenx(_b - _a) , leny(_d - _c) , inv_lenx(1.0f / (_b - _a)) , scale(static_cast<uint32_t>(1u) << (L << 1)) , start(startType) { }

};

__declspec(align(4)) struct Step final {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};

__declspec(align(4)) struct InvStep final {
const uint8_t q;
const uint8_t next;
};

__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right,0u,0u}, {Top,0u,1u}, {Top,1u,1u}, {Left,1u,0u} },
{ {Left,1u,1u}, {Down,1u,0u}, {Down,0u,0u}, {Right,0u,1u} },
{ {Down,1u,1u}, {Left,0u,1u}, {Left,0u,0u}, {Top,1u,0u} },
{ {Top,0u,0u}, {Right,1u,0u}, {Right,1u,1u}, {Down,0u,1u} }
};

__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u,Right}, {1u,Top}, {3u,Left}, {2u,Top} },
{ {2u,Down}, {3u,Right}, {1u,Down}, {0u,Left} },
{ {2u,Left}, {1u,Left}, {3u,Top}, {0u,Down} },
{ {0u,Top}, {3u,Down}, {1u,Right}, {2u,Right} }
};

static Peano2DMap gActiveMap(0, 0.0f, 0.0f, 0.0f, 0.0f, 0b00u);

__declspec(noalias) __forceinline void HitTest2D_analytic(const float x_param, float& out_x1, float& out_x2) noexcept
{
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
float norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = static_cast<uint32_t>(norm * static_cast<float>(scale)); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1;

#pragma loop(ivdep)
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}

__declspec(noalias) __forceinline float FindX2D_analytic(const float px, const float py) noexcept
{
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;

text
const float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0;

#pragma loop(ivdep)
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;

text
const uint32_t tr = static_cast<uint32_t>((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py < my)); const uint32_t none = static_cast<uint32_t>(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_reciprocal = 1.0f / static_cast<float>(scale); return fmaf(static_cast<float>(idx) * scale_reciprocal, lenx, a);

}

static boost::mpi::environment* __restrict g_env;
static boost::mpi::communicator* __restrict g_world;
static boost::mpi::request recv_req;

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
const int rank = g_world->rank();

text
new(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank ? static_cast<uint8_t>(Down) : static_cast<uint8_t>(Top)); return rank;

}

__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& __restrict const ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};

static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2(Interval* const* __restrict arr, const size_t n, const float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);

text
__m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); size_t i = 0; const int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float q[8], nf[8], ord[8]; int k = 0;

#pragma loop(ivdep)
while (k < 8) {
const Interval* const __restrict p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
__m256 vq = _mm256_load_ps(q);
__m256 vnf = _mm256_load_ps(nf);
__m256 vod = _mm256_load_ps(ord);

text
__m256 t = _mm256_fmadd_ps(vm, vnf, vod); __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;

#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}

static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2(Interval* const* __restrict arr, const size_t n, const float GLOBAL_FACTOR, const float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GLOBAL_FACTOR);
const __m256 va = _mm256_set1_ps(alpha);

text
size_t i = 0; int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; int k = 0;

#pragma loop(ivdep)
while (k < 8) {
const Interval* p = arr[i + k];
len[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
__m256 vlen = _mm256_load_ps(len);
__m256 vM = _mm256_load_ps(Mv);
__m256 vq = _mm256_load_ps(q);
__m256 vnf = _mm256_load_ps(nf);
__m256 vod = _mm256_load_ps(ord);

text
__m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); __m256 t = _mm256_fmadd_ps(vm, vnf, vod); __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;

#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
const Interval* const __restrict p = arr[i];
const float mi = fmaf(GLOBAL_FACTOR, p->x2 - p->x1, p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations,
const float a, const float b, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;

text
std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict promejutochny_otrezok = R.back(); float new_x1 = promejutochny_otrezok->x1; float new_x2 = promejutochny_otrezok->x2; float len2 = new_x2 - new_point; float len1 = new_point - new_x1; float interval_len = len1 < len2 ? len1 : len2; if (interval_len < epsilon || ++schetchick == static_cast<int>(global_iterations)) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* __restrict curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, 1.0f); Interval* __restrict curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, 1.0f); float currM = curr->M > curr1->M ? curr->M : curr1->M; size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
if (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } float progress = fmaf(-inv_threshold_03, dmax, 1.0f); float alpha = fmaf(progress, progress, 1.0f); float betta = 2.0f - alpha; float MULTIPLIER = (1.0f / dmax) * Mmax; float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }

}

extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_ND(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;
int dummy;
if (rank) recv_req = g_world->irecv(1, 0, dummy);

text
if (N == 2.0f) { const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = true_end - true_start; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float y_Rmax_1 = start_val; float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); float Mmax = R.front()->M; float m = r * Mmax; int schetchick = 0; float interval_len = 0.0f; while (true) { if (g_world->iprobe(1, 0)) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); recv_req.wait(); recv_req = g_world->irecv(1, 0, dummy); return; } float cooling = ldexpf(1.0f, -(1.0f / 138.63f) * ++schetchick); int T = static_cast<int>(fmaf(20.0f, cooling, 10.0f)); float k = fmaf(0.2f, cooling, 0.7f); Interval* __restrict top_ptr = R.front(); if (schetchick % T == 0) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); CrossMsg outbound = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; CrossMsg inbound; g_world->sendrecv(partner, 0, outbound, partner, 0, inbound); const float sx = FindX2D_analytic(inbound.s_x1, inbound.s_x2); const float ex = FindX2D_analytic(inbound.e_x1, inbound.e_x2); Interval* __restrict injected = new Interval(sx, ex, RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), 2.0f); injected->R = inbound.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict promejutochny_otrezok = R.back(); float segment_x1 = promejutochny_otrezok->x1; float segment_x2 = promejutochny_otrezok->x2; float len2 = segment_x2 - new_point; float len1 = new_point - segment_x1; interval_len = len1 < len2 ? len1 : len2; if (interval_len < epsilon || schetchick == static_cast<int>(global_iterations)) { g_world->send(0, 0, 0); return; } Interval* __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); float currM = curr->M > curr1->M ? curr->M : curr1->M; size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}

text
if (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } float progress = fmaf(-inv_threshold_03, dmax, 1.0f); float alpha = fmaf(progress, progress, 1.0f); float betta = 2.0f - alpha; float MULTIPLIER = (1.0f / dmax) * Mmax; float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}

text
R.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; } } else { }

}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int dummy;
recv_req = g_world->irecv(0, 0, dummy);
float* __restrict buf;
size_t len;
while (true) {
if (recv_req.test()) {
AGP_ND(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &buf, &len);
recv_req = g_world->irecv(0, 0, dummy);
}
}
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->isend(1, 0, 0);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_ND_Free(float* __restrict p) noexcept
{
CoTaskMemFree(p);
} зацени код, я его написал

Pensamiento

Reviewing user code

The user has shared a complex C++ code for a random number generator, simulating optimization algorithms and using advanced techniques like AVX2, MPI, and custom memory allocation. I’ll focus on several areas: correctness, performance, safety, and readability. Notably, using signed ints introduces undefined behavior in left shifts; I’d recommend using uint32_t. I’ll provide feedback on other points too, like state variable handling and potential performance optimizations.

Reviewing macro usage and potential issues

There’s a macro that uses a local variable 's' assigned from 'state', then updates 'state'. Since the macro parameter is an expression, it might evaluate multiple times, causing side effects. One issue involves passing a float seed to an integer-only random generator – this could break logic. Additionally, left shifts with signed integers can result in undefined behavior. We should ensure the type consistency across functions.

Identifying the bug with XOR_RAND

The XOR_RAND macro has a serious bug. It implicitly casts the float 'state' to an int for operations, but this truncates the float value and loses precision. Later, it stores this integer back into 'state' as a float, which corrupts the randomness. The Shekel function uses this broken approach, making the RNG deterministic but flawed.

Reviewing random number generation issues

The RastriginFunc doesn't use RNG, while HillFunc and GrishaginFunc both use a float current_state. The XOR_RAND_GRSH macro causes issues since it manipulates state as both a float and an int using bitwise operations. This creates a problem: state is first set to an int value, then converted back into a float, which results in unexpected behavior due to truncation.

Analyzing function performance and potential flaws

The RNG has a flaw, changing bits during an integer shift, which can lead to poor results. Using a uint32_t for state rather than a float helps improve reliability. The SIMD operations and constants used in trig functions may cause issues with strict aliasing and potential memory mismanagement with Slab allocation. The use of __forceinline with exported functions is unnecessary across translations.

Identifying bugs and improving stability

In the code, potential issues include uninitialized outputs returning from certain functions, division by zero for certain calculations, and improper type handling in functions like ldexpf. There are also concerns with memory allocation, concurrency, and the need for compiler-specific optimizations. Improvements could guard against zero values and avoid type casting issues, ensuring robustness across platforms.

Ensuring thread safety and improving efficiency

The code has some thread-safety concerns, like the global gActiveMap being accessed by multiple functions. It also has potential bugs, such as incorrect handling of pop_heap and float equality checks. Improvements could include using inline functions instead of macros, refining RNG handling, and fixing calculation and cooling issues. Additionally, using constexpr constants and proper alignment for arrays would enhance stability.

Reviewing the code

The code shows strong low-level optimization, but there are some potential issues. One issue is the incorrect use of MPI endpoints, which could cause communication bugs between ranks. There's also a risk of undefined behavior (UB) with the unaligned memory access. Fixing these issues involves adjusting the use of the partner variable in MPI communication and ensuring proper memory alignment.

Thought for 2m 23s

Respuesta

Круто! Видно, что ты любишь низкоуровневые фокусы: AVX2, FMA, собственный slab-аллокатор, MPI-инъекции интервалов — прям “жирная” оптимизация. Ниже — самое важное, что бросилось в глаза. Я держал фокус на корректности/детерминизме, численных нюансах и переносимости.

⚠️ Потенциальные баги / тонкие места

  1. ГПСЧ в макросах (XOR_RAND*) использует int и даже float как state

    • В ShekelFunc, HillFunc, GrishaginFunc current_statefloat, а макрос внутри делает int s = state; s ^= s<<13; ...; state = s;
      Это приводит к неоднозначным преобразованиям float→int (усечение), затем обратно int→float; плюс сдвиг знаковых int — UB в C++.
    • Рекомендация: заменить на uint32_t и инлайн-функции вместо макросов.
      cpp
      struct Xor32 { uint32_t s; inline float u01() noexcept { // [0,1) s ^= s<<13; s ^= s>>17; s ^= s<<5; return (float)s * 0x1p-32f; } inline float u11() noexcept { // [-1,1) s ^= s<<13; s ^= s>>17; s ^= s<<5; return fmaf((float)s, 0x1p-31f, -1.0f); } }; // и использовать Xor32 rng{seed_as_uint};
    • Плюс: уберёшь множественное вычисление аргументов макросами и избежишь случайных сайд-эффектов.
  2. HillFunc: потерян current_res2
    Здесь оба коэффициента берутся из current_res, а current_res2 не используется:

    cpp
    res += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j], fmaf(current_res, 2.0f, -1.1f) * cos_vals[j]);

    Скорее всего, второе слагаемое должно использовать current_res2.

  3. ldexpf с нецелым показателем

    cpp
    float cooling = ldexpf(1.0f, -(1.0f / 138.63f) * ++schetchick); // второй арг. должен быть int

    Это не то, что ты хочешь. Возьми exp2f/scalbnf:

    cpp
    float cooling = exp2f(-(++schetchick) / 138.63f); // гладкое экспоненциальное охлаждение
  4. MPI: источники/приёмники захардкожены 1 вместо partner
    В AGP_ND инициализируешь:

    cpp
    const int partner = rank ^ 1; if (rank) recv_req = g_world->irecv(1, 0, dummy); // <- надо partner // ... if (g_world->iprobe(1, 0)) { ... } // <- тоже partner

    Аналогично в AgpWaitStartAndRun: irecv(0,0,...) у rank=0 слушает сам себя. Поменять на partner везде, где обмен между ранками.

  5. AGP_ND: ранние выходы без заполнения out_data/out_len
    В нескольких ветках делаешь return после send, но не заполняешь буфер результата — API становится недетерминированным для вызывающей стороны. Прежде чем выйти, всегда формируй Extr и выставляй *out_data/*out_len.

  6. Потенциальное деление на ноль в характеристике
    m = r * Mmax; и далее 1.0f / mChangeCharacteristic). Если delta_y≈0, Mmax может обнулиться. Лучше зажать:

    cpp
    constexpr float M_EPS = 1e-12f; m = std::max(r * Mmax, M_EPS);
  7. AVX2: _mm256_load_ps требует 32-байтового выравнивания входа
    В FABE13_SINCOS грузишь in[i] через _mm256_load_ps. Сейчас ты передаёшь туда свои локальные align(32) массивы — ок, но макрос соблазнительно использовать и с невыравненными буферами. Либо явно требуй выравнивание (через assume_aligned/assert), либо используй _mm256_loadu_ps.

  8. Сравнение dmax по ==
    Условие if (len2 + len1 == dmax) хрупко из-за ошибок округления. Лучше сравнение с допуском или другой триггер обновления.

  9. Strict aliasing / type-punning
    В Shag делаешь побитовый трюк с reinterpret_cast<const uint32_t*>. Это UB в строгом смысле. Возьми std::bit_cast<uint32_t>(diff) (C++20) или через memcpy.

  10. Инлайнинг экспортируемых функций
    __forceinline на extern "C" __declspec(dllexport) почти бессмысленен. Экспортируемые символы как правило не инлайнится за границы TU.

Численные моменты

  • Редукция угла через “bias trick” у тебя корректная (для |x| < 2^23) и дублирована в SIMD/скаляре — всё норм.
  • Косинус полиномом 4-й степени (1 - x²/2 + x⁴/24) после редукции на [0, π/2] даст MAPE ~1e-3…1e-2 (в зависимости от редукции и FMA). Если критично — можно добавить терм -x⁶/720 для улучшения края.

Переносимость и чистота

  • Код сильно привязан к MSVC/Windows (__declspec, _aligned_malloc, CoTaskMemAlloc). Подружи это через тонкую абстракцию:
    cpp
    #if MSVC # define ALIGN(n) __declspec(align(n)) #else # define ALIGN(n) alignas(n) #endif
    и заменить явные вызовы на кросс-платформенные аналоги (или обёртки).
  • Для выравнивания локальных массивов лучше alignas(32) вместо __declspec(align(32)).
  • Освобождение TLS-slab’а отсутствует (утечка на жизнь процесса). Возможно ок, но стоит продумать деструктор enumerable_thread_specific.

Микро-улучшения / стиль

  • Вынеси константы π/2/2π и коэффициенты полиномов в constexpr/std::numbers::pi_v<float>.
  • Поясни формат Extr (сейчас в 1D — пары {best, x}, в конце — {iters, len}; в 2D — тройки {best, x, y}). Лучше сделать маленький POD-заголовок или отдельные выходные структуры.
  • В RecomputeR_* уже делаешь Ньютоновскую итерацию к _rcp_ps — nice. Можно заодно добавить #pragma clang loop vectorize(enable) там, где AVX выключен.

Если хочешь, могу быстро накидать патч со следующим набором правок: (1) замена RNG-макросов на Xor32, (2) фикс HillFunc, (3) exp2f вместо ldexpf, (4) правка MPI источников на partner, (5) гарантированное заполнение out_* в AGP_ND, (6) m с эпсилоном, (7) loadu_ps в FABE13_SINCOS.

Compartir este Q&A