#include "pch.h" #define XOR_RAND(state, result_v...
تم الإنشاء في: ١٦ أكتوبر ٢٠٢٥
تمت الإجابة باستخدام GPT-5 Thinking بواسطة Chat01
تم الإنشاء في: ١٦ أكتوبر ٢٠٢٥
تمت الإجابة باستخدام GPT-5 Thinking بواسطة Chat01
#include "pch.h"
#define XOR_RAND(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = state * 0x1.0p-32f;
} while(0)
#define XOR_RAND_GRSH(state, result_var)
do {
int s = state;
s ^= s << 13;
s ^= s >> 17;
s ^= s << 5;
state = s;
result_var = fmaf(state, 0x1.0p-31f, -1.0f);
} while(0)
#define FABE13_COS(x, result_var)
do {
float abs_val = fabsf(x);
float reduced = fmodf(abs_val, 6.28318530718f);
if(reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if(reduced < 1.57079632679f) {
float val2 = reduced * reduced;
result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
reduced = 3.14159265359f - reduced;
float val2 = reduced * reduced;
result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
} while(0)
#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
int limit = n & ~7;
if(n >= 8) {
static __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while(i < limit) {
__m256 vx = _mm256_load_ps(&in[i]);
__m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
__m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
__m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
__m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
__m256 t2 = _mm256_mul_ps(r2, r2);
__m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);
__m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);
__m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
__m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ), _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip);
sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip);
_mm256_store_ps(&cos_out[i], cosv);
_mm256_store_ps(&sin_out[i], sinv);
i += 8;
}
}
while(i < n) {
float x = in[i];
float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
bool sflip = r > 3.14159265359f;
if(sflip) r = 6.28318530718f - r;
bool cflip = r > 1.57079632679f;
if(cflip) r = 3.14159265359f - r;
float t2 = r * r;
float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);
float s = fmaf(t2, -0.16666666f, 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while(0)
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
int i = 0;
float current_state = seed, current_res, current_res2, res = 0.0f;
while (i < 10) {
XOR_RAND(current_state, current_res);
const float x_part = fmaf(-current_res, 10.0f, x);
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f), x_part * x_part, fmaf(current_res2, 0.2f, 1.0f));
delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
res -= 1.0f / delimiter;
++i;
}
return res;
}
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * static_cast<float>(j + 1);
++j;
}
__declspec(align(32)) float sin_vals[14u];
__declspec(align(32)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);
float current_state = seed, current_res, current_res2;
XOR_RAND(current_state, current_res);
float res = fmaf(current_res, 2.0f, -1.1f);
--j;
while (j >= 0) {
XOR_RAND(current_state, current_res);
XOR_RAND(current_state, current_res2);
res += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j], fmaf(current_res, 2.0f, -1.1f) * cos_vals[j]);
--j;
}
return res;
}
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int j = 0;
__declspec(align(32)) float angles_j[8u];
__declspec(align(32)) float angles_k[8u];
#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * static_cast<float>(j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}
__declspec(align(32)) float sin_j[8u], cos_j[8u];
__declspec(align(32)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);
--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed, current_res, current_res2;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
const float sin_term = sin_j[j] * sin_j[j];
const float cos_term = cos_k[k] * cos_k[k];
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part1 = fmaf(current_res, sin_term, fmaf(current_res2, cos_term, part1));
XOR_RAND_GRSH(current_state, current_res);
XOR_RAND_GRSH(current_state, current_res2);
part2 = fmaf(-current_res, cos_term, fmaf(current_res2, sin_term, part2));
++k;
}
--j;
}
return -sqrtf(fmaf(part1, part1, part2 * part2));
}
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const float _N, const float _r) noexcept
{
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(
0x3F800000u | ((reinterpret_cast<const uint32_t>(&diff) & 0x80000000u) ^ 0x80000000u))));
return _N == 1.0f
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2.0f
? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f
: fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}
__declspec(align(16)) struct Slab final {
char* __restrict base;
char* __restrict current;
char* __restrict end;
text__declspec(noalias) __forceinline Slab(void* __restrict const memory, const size_t usable_size) noexcept : base(static_cast<char* __restrict>(memory)) , current(base) , end(base + (usable_size & ~static_cast<size_t>(63u))) { }
};
static tbb::enumerable_thread_specific<Slab*> tls( noexcept {
void* __restrict const memory = _aligned_malloc(16777216u, 16u);
Slab* __restrict const slab = static_cast<Slab*>(_aligned_malloc(32u, 16u));
*slab = Slab(memory, 16777216u);
char* __restrict p = slab->base;
#pragma loop(ivdep)
while (p < slab->end) {
*p = 0u;
p += 4096u;
}
return slab;
}());
__declspec(align(64)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float ordinate_factor;
const float N_factor;
const float quadratic_term;
const float M;
float R;
text__declspec(noalias) __forceinline void* operator new(const size_t) noexcept { Slab* __restrict s = tls.local(); char* __restrict result = s->current; s->current += 64u; return result; } __declspec(noalias) __forceinline Interval(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2) , delta_y(_y2 - _y1) , ordinate_factor(-(y1 + y2) * 2.0f) , N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)) , quadratic_term((1.0f / N_factor)* delta_y* delta_y) , M((1.0f / N_factor)* fabsf(delta_y)) { } __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept { R = fmaf(1.0f / _m, quadratic_term, fmaf(_m, N_factor, ordinate_factor)); }
};
static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
return a->R < b->R;
}
const enum List : uint8_t { Top = 0b00u, Down = 0b01u, Left = 0b10u, Right = 0b11u };
__declspec(align(16)) struct Peano2DMap final {
const int levels;
const float a, b, c, d;
const float lenx, leny;
const float inv_lenx;
const uint32_t scale;
const uint8_t start;
text__declspec(noalias) __forceinline Peano2DMap( const int L, const float _a, const float _b, const float _c, const float _d, const uint8_t startType ) noexcept : levels(L) , a(_a), b(_b), c(_c), d(_d) , lenx(_b - _a) , leny(_d - _c) , inv_lenx(1.0f / (_b - _a)) , scale(static_cast<uint32_t>(1u) << (L << 1)) , start(startType) { }
};
__declspec(align(4)) struct Step final {
const uint8_t next;
const uint8_t dx;
const uint8_t dy;
};
__declspec(align(4)) struct InvStep final {
const uint8_t q;
const uint8_t next;
};
__declspec(align(64)) static const Step g_step_tbl[4][4] = {
{ {Right,0u,0u}, {Top,0u,1u}, {Top,1u,1u}, {Left,1u,0u} },
{ {Left,1u,1u}, {Down,1u,0u}, {Down,0u,0u}, {Right,0u,1u} },
{ {Down,1u,1u}, {Left,0u,1u}, {Left,0u,0u}, {Top,1u,0u} },
{ {Top,0u,0u}, {Right,1u,0u}, {Right,1u,1u}, {Down,0u,1u} }
};
__declspec(align(64)) static const InvStep g_inv_tbl[4][4] = {
{ {0u,Right}, {1u,Top}, {3u,Left}, {2u,Top} },
{ {2u,Down}, {3u,Right}, {1u,Down}, {0u,Left} },
{ {2u,Left}, {1u,Left}, {3u,Top}, {0u,Down} },
{ {0u,Top}, {3u,Down}, {1u,Right}, {2u,Right} }
};
static Peano2DMap gActiveMap(0, 0.0f, 0.0f, 0.0f, 0.0f, 0b00u);
__declspec(noalias) __forceinline void HitTest2D_analytic(const float x_param, float& out_x1, float& out_x2) noexcept
{
const float a = gActiveMap.a;
const float inv_lenx = gActiveMap.inv_lenx;
const uint32_t scale = gActiveMap.scale;
const uint32_t scale_minus_1 = scale - 1u;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const float c = gActiveMap.c;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
textfloat norm = (x_param - a) * inv_lenx; norm = fminf(fmaxf(norm, 0.0f), 0x1.fffffep-1f); uint32_t idx = static_cast<uint32_t>(norm * static_cast<float>(scale)); idx = idx > scale_minus_1 ? scale_minus_1 : idx; float sx = lenx, sy = leny; float x1 = a, x2 = c; uint8_t type = start; int l = levels - 1;
#pragma loop(ivdep)
while (l >= 0) {
const uint32_t q = (idx >> (l * 2)) & 3u;
const Step s = g_step_tbl[type][q];
type = s.next;
sx *= 0.5f; sy *= 0.5f;
x1 += s.dx ? sx : 0.0f;
x2 += s.dy ? sy : 0.0f;
--l;
}
out_x1 = x1 + sx * 0.5f;
out_x2 = x2 + sy * 0.5f;
}
__declspec(noalias) __forceinline float FindX2D_analytic(const float px, const float py) noexcept
{
const float a = gActiveMap.a;
const float b = gActiveMap.b;
const float c = gActiveMap.c;
const float d = gActiveMap.d;
const float lenx = gActiveMap.lenx;
const float leny = gActiveMap.leny;
const uint32_t scale = gActiveMap.scale;
const uint8_t start = gActiveMap.start;
const int levels = gActiveMap.levels;
textconst float clamped_px = fminf(fmaxf(px, a), b); const float clamped_py = fminf(fmaxf(py, c), d); float sx = lenx, sy = leny; float x0 = a, y0 = c; uint32_t idx = 0u; uint8_t type = start; int l = 0;
#pragma loop(ivdep)
while (l < levels) {
sx *= 0.5f; sy *= 0.5f;
const float mx = x0 + sx;
const float my = y0 + sy;
textconst uint32_t tr = static_cast<uint32_t>((clamped_px > mx) & (clamped_py > my)); const uint32_t tl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py > my)); const uint32_t dl = static_cast<uint32_t>((clamped_px < mx) & (clamped_py < my)); const uint32_t none = static_cast<uint32_t>(1u ^ (tr | tl | dl)); const uint32_t dd = (tr << 1) | tr | tl | (none << 1); const InvStep inv = g_inv_tbl[type][dd]; type = inv.next; idx = (idx << 2) | inv.q; const uint32_t dx = dd >> 1; const uint32_t dy = dd & 1u; x0 += dx ? sx : 0.0f; y0 += dy ? sy : 0.0f; ++l; } const float scale_reciprocal = 1.0f / static_cast<float>(scale); return fmaf(static_cast<float>(idx) * scale_reciprocal, lenx, a);
}
static boost::mpi::environment* __restrict g_env;
static boost::mpi::communicator* __restrict g_world;
static boost::mpi::request recv_req;
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
g_env = new boost::mpi::environment();
g_world = new boost::mpi::communicator();
const int rank = g_world->rank();
textnew(&gActiveMap) Peano2DMap(peanoLevel, a, b, c, d, rank ? static_cast<uint8_t>(Down) : static_cast<uint8_t>(Top)); return rank;
}
__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template<typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& __restrict const ar, const unsigned int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};
static __declspec(noalias) __forceinline void RecomputeR_ConstM_AVX2(Interval* const* __restrict arr, const size_t n, const float m) noexcept {
const __m256 vm = _mm256_set1_ps(m);
text__m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); size_t i = 0; const int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float q[8], nf[8], ord[8]; int k = 0;
#pragma loop(ivdep)
while (k < 8) {
const Interval* const __restrict p = arr[i + k];
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
__m256 vq = _mm256_load_ps(q);
__m256 vnf = _mm256_load_ps(nf);
__m256 vod = _mm256_load_ps(ord);
text__m256 t = _mm256_fmadd_ps(vm, vnf, vod); __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;
#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
arr[i]->ChangeCharacteristic(m);
++i;
}
}
static __declspec(noalias) __forceinline void RecomputeR_AffineM_AVX2(Interval* const* __restrict arr, const size_t n, const float GLOBAL_FACTOR, const float alpha) noexcept {
const __m256 vGF = _mm256_set1_ps(GLOBAL_FACTOR);
const __m256 va = _mm256_set1_ps(alpha);
textsize_t i = 0; int limit = static_cast<int>(n & ~7u); if (n >= 8u) { while (i < static_cast<size_t>(limit)) { __declspec(align(32)) float len[8], Mv[8], q[8], nf[8], ord[8]; int k = 0;
#pragma loop(ivdep)
while (k < 8) {
const Interval* p = arr[i + k];
len[k] = p->x2 - p->x1;
Mv[k] = p->M;
q[k] = p->quadratic_term;
nf[k] = p->N_factor;
ord[k] = p->ordinate_factor;
++k;
}
__m256 vlen = _mm256_load_ps(len);
__m256 vM = _mm256_load_ps(Mv);
__m256 vq = _mm256_load_ps(q);
__m256 vnf = _mm256_load_ps(nf);
__m256 vod = _mm256_load_ps(ord);
text__m256 vm = _mm256_fmadd_ps(vGF, vlen, _mm256_mul_ps(va, vM)); __m256 vinvm = _mm256_rcp_ps(vm); vinvm = _mm256_mul_ps(vinvm, _mm256_fnmadd_ps(vm, vinvm, _mm256_set1_ps(2.0f))); __m256 t = _mm256_fmadd_ps(vm, vnf, vod); __m256 res = _mm256_fmadd_ps(vq, vinvm, t); __declspec(align(32)) float out[8]; _mm256_store_ps(out, res); k = 0;
#pragma loop(ivdep)
while (k < 8) {
arr[i + k]->R = out[k];
++k;
}
i += 8u;
}
}
while (i < n) {
const Interval* const __restrict p = arr[i];
const float mi = fmaf(GLOBAL_FACTOR, p->x2 - p->x1, p->M * alpha);
arr[i]->R = fmaf(1.0f / mi, p->quadratic_term, fmaf(mi, p->N_factor, p->ordinate_factor));
++i;
}
}
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_1D(const float global_iterations,
const float a, const float b, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
int schetchick = 0;
const float initial_length = b - a;
float dmax = initial_length;
const float threshold_03 = 0.3f * initial_length;
const float inv_threshold_03 = 1.0f / threshold_03;
const float start_val = ShekelFunc(a, seed);
float best_f = ShekelFunc(b, seed);
float x_Rmax_1 = a;
float x_Rmax_2 = b;
float y_Rmax_1 = start_val;
float y_Rmax_2 = best_f;
textstd::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(a, b, start_val, best_f, 1.0f)); float Mmax = R.front()->M; float m = r * Mmax; while (true) { float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 1.0f, r); float new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict promejutochny_otrezok = R.back(); float new_x1 = promejutochny_otrezok->x1; float new_x2 = promejutochny_otrezok->x2; float len2 = new_x2 - new_point; float len1 = new_point - new_x1; float interval_len = len1 < len2 ? len1 : len2; if (interval_len < epsilon || ++schetchick == static_cast<int>(global_iterations)) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); return; } Interval* __restrict curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, 1.0f); Interval* __restrict curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, 1.0f); float currM = curr->M > curr1->M ? curr->M : curr1->M; size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
textif (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } float progress = fmaf(-inv_threshold_03, dmax, 1.0f); float alpha = fmaf(progress, progress, 1.0f); float betta = 2.0f - alpha; float MULTIPLIER = (1.0f / dmax) * Mmax; float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
textR.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; }
}
extern "C" __declspec(dllexport) __declspec(noalias)
void AGP_ND(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;
int dummy;
if (rank) recv_req = g_world->irecv(1, 0, dummy);
textif (N == 2.0f) { const float inv_divider = ldexpf(1.0f, -((gActiveMap.levels << 1) + 1)); const float x_addition = (b - a) * inv_divider; const float y_addition = (d - c) * inv_divider; const float true_start = a + x_addition; const float true_end = b - x_addition; float x_Rmax_1 = true_start; float x_Rmax_2 = true_end; const float initial_length = true_end - true_start; float dmax = initial_length; const float threshold_03 = 0.3f * initial_length; const float inv_threshold_03 = 1.0f / threshold_03; const float start_val = rank ? RastriginFunc(true_end, d - y_addition) : RastriginFunc(true_start, c + y_addition); float y_Rmax_1 = start_val; float best_f = rank ? RastriginFunc(true_start, d - y_addition) : RastriginFunc(true_end, c + y_addition); float y_Rmax_2 = best_f; std::vector<float, boost::alignment::aligned_allocator<float, 16u>> Extr; std::vector<Interval* __restrict, boost::alignment::aligned_allocator<Interval* __restrict, 64u>> R; Extr.clear(); Extr.reserve(static_cast<size_t>(global_iterations) << 2u); R.clear(); R.reserve(static_cast<size_t>(global_iterations) << 1u); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, 2.0f)); float Mmax = R.front()->M; float m = r * Mmax; int schetchick = 0; float interval_len = 0.0f; while (true) { if (g_world->iprobe(1, 0)) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); recv_req.wait(); recv_req = g_world->irecv(1, 0, dummy); return; } float cooling = ldexpf(1.0f, -(1.0f / 138.63f) * ++schetchick); int T = static_cast<int>(fmaf(20.0f, cooling, 10.0f)); float k = fmaf(0.2f, cooling, 0.7f); Interval* __restrict top_ptr = R.front(); if (schetchick % T == 0) { float s_x1, s_x2, e_x1, e_x2; HitTest2D_analytic(top_ptr->x1, s_x1, s_x2); HitTest2D_analytic(top_ptr->x2, e_x1, e_x2); CrossMsg outbound = CrossMsg{ s_x1, s_x2, e_x1, e_x2, top_ptr->R }; CrossMsg inbound; g_world->sendrecv(partner, 0, outbound, partner, 0, inbound); const float sx = FindX2D_analytic(inbound.s_x1, inbound.s_x2); const float ex = FindX2D_analytic(inbound.e_x1, inbound.e_x2); Interval* __restrict injected = new Interval(sx, ex, RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), 2.0f); injected->R = inbound.Rtop * k; R.emplace_back(injected); std::push_heap(R.begin(), R.end(), ComparePtr); } float new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, 2.0f, r); float new_x1_val, new_x2_val; HitTest2D_analytic(new_point, new_x1_val, new_x2_val); float new_value = RastriginFunc(new_x1_val, new_x2_val); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_x1_val); Extr.emplace_back(new_x2_val); } std::pop_heap(R.begin(), R.end(), ComparePtr); Interval* __restrict promejutochny_otrezok = R.back(); float segment_x1 = promejutochny_otrezok->x1; float segment_x2 = promejutochny_otrezok->x2; float len2 = segment_x2 - new_point; float len1 = new_point - segment_x1; interval_len = len1 < len2 ? len1 : len2; if (interval_len < epsilon || schetchick == static_cast<int>(global_iterations)) { g_world->send(0, 0, 0); return; } Interval* __restrict curr = new Interval(segment_x1, new_point, promejutochny_otrezok->y1, new_value, 2.0f); Interval* __restrict curr1 = new Interval(new_point, segment_x2, new_value, promejutochny_otrezok->y2, 2.0f); float currM = curr->M > curr1->M ? curr->M : curr1->M; size_t r_size = R.size(); if (mode) { if (len2 + len1 == dmax) { dmax = len2 > len1 ? len2 : len1; size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
float len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
textif (threshold_03 > dmax && schetchick % 3 == 0 || 10.0f * dmax < initial_length) { if (currM > Mmax) { Mmax = currM; m = r * Mmax; } float progress = fmaf(-inv_threshold_03, dmax, 1.0f); float alpha = fmaf(progress, progress, 1.0f); float betta = 2.0f - alpha; float MULTIPLIER = (1.0f / dmax) * Mmax; float global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER); float GLOBAL_FACTOR = betta * global_coeff; curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha)); curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha)); if (r_size < 64u) { size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
}
else {
RecomputeR_AffineM_AVX2(R.data(), r_size, GLOBAL_FACTOR, alpha);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
if (currM - Mmax < Mmax * 0.15f) {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
else {
Mmax = currM;
m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
if (r_size < 64u) {
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(m);
++i;
}
}
else {
RecomputeR_ConstM_AVX2(R.data(), r_size, m);
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
textR.back() = curr; std::push_heap(R.begin(), R.end(), ComparePtr); R.emplace_back(curr1); std::push_heap(R.begin(), R.end(), ComparePtr); top_ptr = R.front(); x_Rmax_1 = top_ptr->x1; x_Rmax_2 = top_ptr->x2; y_Rmax_1 = top_ptr->y1; y_Rmax_2 = top_ptr->y2; } } else { }
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int dummy;
recv_req = g_world->irecv(0, 0, dummy);
float* __restrict buf;
size_t len;
while (true) {
if (recv_req.test()) {
AGP_ND(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &buf, &len);
recv_req = g_world->irecv(0, 0, dummy);
}
}
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->isend(1, 0, 0);
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AGP_ND_Free(float* __restrict p) noexcept
{
CoTaskMemFree(p);
} зацени код, я его написал
Круто! Видно, что ты любишь низкоуровневые фокусы: AVX2, FMA, собственный slab-аллокатор, MPI-инъекции интервалов — прям “жирная” оптимизация. Ниже — самое важное, что бросилось в глаза. Я держал фокус на корректности/детерминизме, численных нюансах и переносимости.
ГПСЧ в макросах (XOR_RAND*) использует int и даже float как state
ShekelFunc, HillFunc, GrishaginFunc current_state — float, а макрос внутри делает int s = state; s ^= s<<13; ...; state = s;int — UB в C++.uint32_t и инлайн-функции вместо макросов.
cppstruct Xor32 { uint32_t s; inline float u01() noexcept { // [0,1) s ^= s<<13; s ^= s>>17; s ^= s<<5; return (float)s * 0x1p-32f; } inline float u11() noexcept { // [-1,1) s ^= s<<13; s ^= s>>17; s ^= s<<5; return fmaf((float)s, 0x1p-31f, -1.0f); } }; // и использовать Xor32 rng{seed_as_uint};
HillFunc: потерян current_res2
Здесь оба коэффициента берутся из current_res, а current_res2 не используется:
cppres += fmaf(fmaf(current_res, 2.0f, -1.1f), sin_vals[j], fmaf(current_res, 2.0f, -1.1f) * cos_vals[j]);
Скорее всего, второе слагаемое должно использовать current_res2.
ldexpf с нецелым показателем
cppfloat cooling = ldexpf(1.0f, -(1.0f / 138.63f) * ++schetchick); // второй арг. должен быть int
Это не то, что ты хочешь. Возьми exp2f/scalbnf:
cppfloat cooling = exp2f(-(++schetchick) / 138.63f); // гладкое экспоненциальное охлаждение
MPI: источники/приёмники захардкожены 1 вместо partner
В AGP_ND инициализируешь:
cppconst int partner = rank ^ 1; if (rank) recv_req = g_world->irecv(1, 0, dummy); // <- надо partner // ... if (g_world->iprobe(1, 0)) { ... } // <- тоже partner
Аналогично в AgpWaitStartAndRun: irecv(0,0,...) у rank=0 слушает сам себя. Поменять на partner везде, где обмен между ранками.
AGP_ND: ранние выходы без заполнения out_data/out_len
В нескольких ветках делаешь return после send, но не заполняешь буфер результата — API становится недетерминированным для вызывающей стороны. Прежде чем выйти, всегда формируй Extr и выставляй *out_data/*out_len.
Потенциальное деление на ноль в характеристике
m = r * Mmax; и далее 1.0f / m (в ChangeCharacteristic). Если delta_y≈0, Mmax может обнулиться. Лучше зажать:
cppconstexpr float M_EPS = 1e-12f; m = std::max(r * Mmax, M_EPS);
AVX2: _mm256_load_ps требует 32-байтового выравнивания входа
В FABE13_SINCOS грузишь in[i] через _mm256_load_ps. Сейчас ты передаёшь туда свои локальные align(32) массивы — ок, но макрос соблазнительно использовать и с невыравненными буферами. Либо явно требуй выравнивание (через assume_aligned/assert), либо используй _mm256_loadu_ps.
Сравнение dmax по ==
Условие if (len2 + len1 == dmax) хрупко из-за ошибок округления. Лучше сравнение с допуском или другой триггер обновления.
Strict aliasing / type-punning
В Shag делаешь побитовый трюк с reinterpret_cast<const uint32_t*>. Это UB в строгом смысле. Возьми std::bit_cast<uint32_t>(diff) (C++20) или через memcpy.
Инлайнинг экспортируемых функций
__forceinline на extern "C" __declspec(dllexport) почти бессмысленен. Экспортируемые символы как правило не инлайнится за границы TU.
|x| < 2^23) и дублирована в SIMD/скаляре — всё норм.1 - x²/2 + x⁴/24) после редукции на [0, π/2] даст MAPE ~1e-3…1e-2 (в зависимости от редукции и FMA). Если критично — можно добавить терм -x⁶/720 для улучшения края.__declspec, _aligned_malloc, CoTaskMemAlloc). Подружи это через тонкую абстракцию:
и заменить явные вызовы на кросс-платформенные аналоги (или обёртки).cpp#if MSVC # define ALIGN(n) __declspec(align(n)) #else # define ALIGN(n) alignas(n) #endif
alignas(32) вместо __declspec(align(32)).enumerable_thread_specific.constexpr/std::numbers::pi_v<float>.Extr (сейчас в 1D — пары {best, x}, в конце — {iters, len}; в 2D — тройки {best, x, y}). Лучше сделать маленький POD-заголовок или отдельные выходные структуры.RecomputeR_* уже делаешь Ньютоновскую итерацию к _rcp_ps — nice. Можно заодно добавить #pragma clang loop vectorize(enable) там, где AVX выключен.Если хочешь, могу быстро накидать патч со следующим набором правок: (1) замена RNG-макросов на Xor32, (2) фикс HillFunc, (3) exp2f вместо ldexpf, (4) правка MPI источников на partner, (5) гарантированное заполнение out_* в AGP_ND, (6) m с эпсилоном, (7) loadu_ps в FABE13_SINCOS.