#include "pch.h" #define LCG_RAND(state) ...
생성일: 2025년 10월 7일
답변에 사용된 모델 GPT-5 Thinking by Chat01
생성일: 2025년 10월 7일
답변에 사용된 모델 GPT-5 Thinking by Chat01
#include "pch.h"
#define LCG_RAND(state)
do {
float reduced = fmodf((state), 1.618033988749895f);
reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f);
reduced -= (float)(int)(reduced);
reduced = fmaf(reduced, reduced + 1.0f, 0.0f);
reduced -= (float)(int)(reduced);
(state) = reduced;
} while (0)
#define LCG_RAND_GRSH(state)
do {
float reduced = fmodf((state), 1.618033988749895f);
reduced = fmaf(reduced, 1.618033988749895f, reduced * 0.5772156649f);
reduced -= (float)(int)(reduced);
reduced = fmaf(reduced, reduced + 1.0f, 0.0f);
reduced -= (float)(int)(reduced);
(state) = fmaf(reduced, 2.0f, -1.0f);
} while (0)
#define FABE13_COS(x, result_var)
do {
float abs_val = fabsf((x));
float reduced = fmodf(abs_val, 6.28318530718f);
if (reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if (reduced < 1.57079632679f) {
float val2 = reduced * reduced;
result_var = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
reduced = 3.14159265359f - reduced;
float val2 = reduced * reduced;
result_var = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
} while (0)
#define FABE13_SINCOS(in, sin_out, cos_out, n)
do {
int i = 0;
int limit = (n) & ~7;
if ((n) >= 8) {
static thread_local __declspec(align(32)) const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);
static thread_local __declspec(align(32)) const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);
static thread_local __declspec(align(32)) const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);
static thread_local __declspec(align(32)) const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);
static thread_local __declspec(align(32)) const __m256 BIAS = _mm256_set1_ps(12582912.0f);
static thread_local __declspec(align(32)) const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);
static thread_local __declspec(align(32)) const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);
static thread_local __declspec(align(32)) const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);
static thread_local __declspec(align(32)) const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);
static thread_local __declspec(align(32)) const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);
static thread_local __declspec(align(32)) const __m256 VEC_ZERO = _mm256_setzero_ps();
while (i < limit) {
__m256 vx = _mm256_load_ps(&(in)[i]);
__m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);
__m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);
q = _mm256_sub_ps(q, BIAS);
__m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);
__m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));
__m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));
__m256 t2 = _mm256_mul_ps(r2, r2);
__m256 cosv = _mm256_fmadd_ps(t2, _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);
__m256 sinv = _mm256_mul_ps(_mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);
__m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);
__m256 sflip = _mm256_xor_ps(_mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ),
_mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));
cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip);
sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip);
_mm256_store_ps(&(cos_out)[i], cosv);
_mm256_store_ps(&(sin_out)[i], sinv);
i += 8;
}
}
while (i < (n)) {
float x = (in)[i];
float ax = fabsf(x);
float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);
q -= 12582912.0f;
float r = fmaf(-6.28318530718f, q, ax);
bool sflip = (r > 3.14159265359f);
if (sflip) r = 6.28318530718f - r;
bool cflip = (r > 1.57079632679f);
if (cflip) r = 3.14159265359f - r;
float t2 = r * r;
float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);
float s = fmaf(t2, -0.16666666f, 1.0f) * r;
cos_out[i] = cflip ? -c : c;
sin_out[i] = ((x < 0.0f) ^ sflip) ? -s : s;
++i;
}
} while (0)
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
int i = 0;
float current_state = seed, current_res, res = 0.0f;
while (i < 10) {
LCG_RAND(current_state);
const float x_part = fmaf(-current_state, 10.0f, x);
LCG_RAND(current_state);
current_res = current_state;
LCG_RAND(current_state);
float delimiter = fmaf(fmaf(current_res, 20.0f, 5.0f),
x_part * x_part,
fmaf(current_state, 0.2f, 1.0f));
delimiter = copysignf(fmaxf(fabsf(delimiter), FLT_MIN), delimiter);
res -= 1.0f / delimiter;
++i;
}
return res;
}
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
const float term1 = fmaf(x1, x1, x2 * x2);
float cos1, cos2;
FABE13_COS(6.28318530717958647692f * x1, cos1);
FABE13_COS(6.28318530717958647692f * x2, cos2);
return (term1 - fmaf(cos1 + cos2, 10.0f, -14.6f)) * fmaf(-term1, 0.25f, 18.42f);
}
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int j = 0;
static thread_local __declspec(align(64)) float angles[14u];
const float start_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
while (j < 14) {
angles[j] = start_angle * (j + 1);
++j;
}
static thread_local __declspec(align(64)) float sin_vals[14u];
static thread_local __declspec(align(64)) float cos_vals[14u];
FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);
float current_state = seed;
LCG_RAND(current_state);
float res = fmaf(current_state, 2.0f, -1.1f);
--j;
while (j >= 0) {
LCG_RAND(current_state);
float tmp_state = current_state;
LCG_RAND(current_state);
res += fmaf(fmaf(tmp_state, 2.0f, -1.1f), sin_vals[j],
fmaf(current_state, 2.0f, -1.1f) * cos_vals[j]);
--j;
}
return res;
}
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
int j = 0;
static thread_local __declspec(align(64)) float angles_j[8u];
static thread_local __declspec(align(64)) float angles_k[8u];
#pragma loop(ivdep)
while (j < 8) {
const float pj_mult = 3.14159265358979323846f * (j + 1);
angles_j[j] = pj_mult * x1;
angles_k[j] = pj_mult * x2;
++j;
}
static thread_local __declspec(align(64)) float sin_j[8u], cos_j[8u];
static thread_local __declspec(align(64)) float sin_k[8u], cos_k[8u];
FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);
--j;
float part1 = 0.0f;
float part2 = 0.0f;
float current_state = seed;
while (j >= 0) {
size_t k = 0u;
while (k < 8u) {
const float sin_term = sin_j[j] * sin_j[j];
const float cos_term = cos_k[k] * cos_k[k];
LCG_RAND_GRSH(current_state);
float tmp_state = current_state;
LCG_RAND_GRSH(current_state);
part1 = fmaf(tmp_state, sin_term,
fmaf(current_state, cos_term, part1));
LCG_RAND_GRSH(tmp_state);
LCG_RAND_GRSH(current_state);
part2 = fmaf(-tmp_state, cos_term,
fmaf(current_state, sin_term, part2));
++k;
}
--j;
}
return -sqrtf(fmaf(part1, part1, part2 * part2));
}
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
const float y2, const float _N, const float _r) noexcept
{
const float diff = y2 - y1;
const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_set1_epi32(
0x3F800000 | ((((const int)&diff) & 0x80000000) ^ 0x80000000)
)));
return _N == 1.0f
? fmaf(-(1.0f / _m), diff, x1 + x2) * 0.5f
: _N == 2.0f
? fmaf(sign_mult / (_m * _m), diff * diff * _r, x1 + x2) * 0.5f
: fmaf(sign_mult / powf(_m, _N), powf(diff, _N) * _r, x1 + x2) * 0.5f;
}
__declspec(align(64)) struct PreallocatedPeanoMemoryManager final {
__declspec(align(64)) struct MemoryBlock final {
char* __restrict data;
char* __restrict current;
const char* __restrict end;
text__declspec(noalias) __forceinline MemoryBlock() noexcept : data(nullptr), current(nullptr), end(nullptr) { } const __declspec(noalias) __forceinline void initialize(const size_t bytes) noexcept { const size_t aligned_need = (bytes + 1048638) & ~(1048575); void* p = VirtualAlloc(nullptr, aligned_need, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); data = reinterpret_cast<char* __restrict>(p); const uintptr_t base = reinterpret_cast<uintptr_t>(data); const uintptr_t aligned = (base + 63) & ~63; current = reinterpret_cast<char* __restrict>(aligned); end = reinterpret_cast<char*>(current + bytes); } __declspec(noalias) __forceinline void* allocate_from_block(const size_t alloc_size) noexcept { const size_t n = (alloc_size + 63) & ~63; char* __restrict result = current; current += n; return result; } }; tbb::enumerable_thread_specific<MemoryBlock> tls; const __declspec(noalias) __forceinline size_t calculate_tree_size_bytes(const int max_depth) const noexcept { static const size_t bytes[] = { 1398080u, 5592384u, 22369600u, 89478464u, 357913920u, 1431655744u, 5726623040u }; return bytes[max_depth - 7u]; } const __declspec(noalias) __forceinline void preallocate_for_depth(const int max_depth) noexcept { const size_t required_size = calculate_tree_size_bytes(max_depth); tls.clear(); tls = tbb::enumerable_thread_specific<MemoryBlock>( [required_size]() { MemoryBlock b; b.initialize(required_size); return b; } ); } __declspec(noalias) __forceinline void* allocate(const size_t size) noexcept { return tls.local().allocate_from_block(size); }
};
static PreallocatedPeanoMemoryManager peano_memory_manager;
__declspec(align(16)) struct Interval final {
const float x1;
const float x2;
const float y1;
const float y2;
const float delta_y;
const float N_factor;
const float M;
float R;
text__declspec(noalias) __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); } __declspec(noalias) __forceinline Interval(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept : x1(_x1), x2(_x2), y1(_y1), y2(_y2), delta_y(_y2 - _y1), N_factor(_N == 1.0f ? _x2 - _x1 : _N == 2.0f ? sqrtf(_x2 - _x1) : powf(_x2 - _x1, 1.0f / _N)), M(fabsf(delta_y)* (1.0f / N_factor)) { } __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept { R = fmaf( -(y2 + y1), 2.0f, fmaf(_m, N_factor, (delta_y * delta_y) * (1.0f / (_m * N_factor)))); }
};
static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
return a->R < b->R;
}
const enum List : uint8_t {
Top = 0b00,
Down = 0b01,
Left = 0b10,
Right = 0b11
};
__declspec(align(16)) struct PeanoCurve_2D final {
const List Type;
const int razvertka;
const float a;
const float b;
const float c;
const float d;
const float x1;
const float x2;
PeanoCurve_2D* __restrict DownLeft;
PeanoCurve_2D* __restrict TopLeft;
PeanoCurve_2D* __restrict TopRight;
PeanoCurve_2D* __restrict DownRight;
text__declspec(noalias) __forceinline void* operator new(size_t sz) noexcept { return peano_memory_manager.allocate(sz); } __declspec(noalias) __forceinline PeanoCurve_2D( const List _Type, int _razvertka, const float _a, const float _b, const float _c, const float _d) noexcept : Type(_Type), razvertka(_razvertka), a(_a), b(_b), c(_c), d(_d), x1(fmaf(0.5f, _a, 0.5f * _b)), x2(fmaf(0.5f, _c, 0.5f * _d)), DownLeft(nullptr), TopLeft(nullptr), TopRight(nullptr), DownRight(nullptr) { if (_razvertka-- != 0) { if (_razvertka > 8) { switch (_Type) { case Top: tbb::parallel_invoke( [&]() { DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2); }, [&]() { TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d); }, [&]() { TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d); }, [&]() { DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2); } ); break; case Down: tbb::parallel_invoke( [&]() { TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d); }, [&]() { DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2); }, [&]() { DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2); }, [&]() { TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d); } ); break; case Right: tbb::parallel_invoke( [&]() { DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2); }, [&]() { DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2); }, [&]() { TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d); }, [&]() { TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d); } ); break; default: tbb::parallel_invoke( [&]() { TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d); }, [&]() { TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d); }, [&]() { DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2); }, [&]() { DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2); } ); } } else { switch (_Type) { case Top: DownLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, _c, x2); TopLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, x2, _d); TopRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, x2, _d); DownRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, _c, x2); break; case Down: TopRight = new PeanoCurve_2D(Left, _razvertka, x1, _b, x2, _d); DownRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, _c, x2); DownLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, _c, x2); TopLeft = new PeanoCurve_2D(Right, _razvertka, _a, x1, x2, _d); break; case Right: DownLeft = new PeanoCurve_2D(Top, _razvertka, _a, x1, _c, x2); DownRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, _c, x2); TopRight = new PeanoCurve_2D(Right, _razvertka, x1, _b, x2, _d); TopLeft = new PeanoCurve_2D(Down, _razvertka, _a, x1, x2, _d); break; default: TopRight = new PeanoCurve_2D(Down, _razvertka, x1, _b, x2, _d); TopLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, x2, _d); DownLeft = new PeanoCurve_2D(Left, _razvertka, _a, x1, _c, x2); DownRight = new PeanoCurve_2D(Top, _razvertka, x1, _b, _c, x2); } } } } const __declspec(noalias) __forceinline PeanoCurve_2D* __restrict HitTest_2D(float x) const noexcept { int i = 0; const int _razvertka = this->razvertka; int num; const float this_a = this->a; x -= this_a; const float b_minus_a = this->b - this_a; const float inv_b_minus_a = 1.0f / b_minus_a; const PeanoCurve_2D* __restrict Curr = this; while (i != _razvertka) { const int shift = 1 << ++i + i; num = shift * x * inv_b_minus_a; x = fmaf(-ldexp(1.0f, -(i << 1)) * num, b_minus_a, x); const List currType = Curr->Type; switch (num) { case 0: Curr = (currType == Top || currType == Right) ? Curr->DownLeft : Curr->TopRight; break; case 1: Curr = (currType == Top || currType == Left) ? Curr->TopLeft : Curr->DownRight; break; case 2: Curr = (currType == Top || currType == Right) ? Curr->TopRight : Curr->DownLeft; break; default: Curr = (currType == Top || currType == Left) ? Curr->DownRight : Curr->TopLeft; } } return const_cast<PeanoCurve_2D * __restrict>(Curr); } const __declspec(noalias) __forceinline float FindX_2D(const float target_x1, const float target_x2) const noexcept { int _razvertka = this->razvertka; int _razvertka1 = _razvertka; float x1, x2, x = this->a; const float b_minus_a = this->b - x; const PeanoCurve_2D* __restrict Curr = this; while (_razvertka != 0) { const int exponent = _razvertka1 - _razvertka-- << 1; x1 = Curr->x1; x2 = Curr->x2; const List currType = Curr->Type; if (target_x1 > x1 && target_x2 > x2) { Curr = Curr->TopRight; if (currType == Top || currType == Right) { x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x); } } else if (target_x1 < x1 && target_x2 > x2) { Curr = Curr->TopLeft; if (currType == Top || currType == Left) { x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x); } else { x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x); } } else if (target_x1 < x1 && target_x2 < x2) { Curr = Curr->DownLeft; if (currType == Down || currType == Left) { x = fmaf(ldexpf(1.0f, -exponent) * 0.5f, b_minus_a, x); } } else { Curr = Curr->DownRight; if (currType == Top || currType == Left) { x = fmaf(ldexpf(1.0f, -exponent) * 0.75f, b_minus_a, x); } else { x = fmaf(ldexpf(1.0f, -exponent) * 0.25f, b_minus_a, x); } } } return x; }
};
static thread_local std::vector<float, tbb::scalable_allocator<float>> Extr;
static thread_local std::vector<Interval*, tbb::scalable_allocator<Interval*>> R;
static boost::mpi::environment* g_env;
static boost::mpi::communicator* g_world;
static std::aligned_storage_t<sizeof(PeanoCurve_2D), alignof(PeanoCurve_2D)> curveStorage;
static PeanoCurve_2D& Curve = reinterpret_cast<PeanoCurve_2D&>(curveStorage);
static boost::optionalboost::mpi::status probe_status;
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
peano_memory_manager.preallocate_for_depth(peanoLevel);
g_env = ::new boost::mpi::environment();
g_world = ::new boost::mpi::communicator();
int rank = g_world->rank();
if (rank) {
::new (&Curve) PeanoCurve_2D(List::Down, peanoLevel, a, b, c, d);
}
else {
::new (&Curve) PeanoCurve_2D(List::Top, peanoLevel, a, b, c, d);
}
return rank;
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpFinalize() noexcept
{
delete g_world;
delete g_env;
}
__declspec(align(16)) struct CrossMsg final {
float s_x1, s_x2;
float e_x1, e_x2;
float Rtop;
template <typename Archive>
__declspec(noalias) __forceinline void serialize(Archive& __restrict const ar, const int) noexcept { ar& s_x1& s_x2& e_x1& e_x2& Rtop; }
};
extern "C" __declspec(dllexport) __declspec(noalias)
void Base_LNA_1_2_Mer_AGP(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;
float divider = ldexpf(1.0f, (Curve.razvertka << 1) + 1);
float inv_divider = 1.0f / divider;
float x_addition = (b - a) * inv_divider;
float y_addition = (d - c) * inv_divider;
float true_start = a + x_addition;
float true_end = b - x_addition;
float x_Rmax_1, x_Rmax_2;
float initial_length, dmax, threshold_03, inv_threshold_03;
float start_val, best_f, y_Rmax_1, y_Rmax_2;
float Mmax, m;
int schetchick = 0, mcQueenSpeed = 1;
float new_point, new_value;
const PeanoCurve_2D* __restrict pc;
float new_x1, new_x2;
Interval* __restrict promejutochny_otrezok;
Interval* __restrict curr;
Interval* __restrict curr1;
float currM, len1, len2, len_item;
size_t r_size;
float progress, alpha, betta, MULTIPLIER, global_coeff, GLOBAL_FACTOR;
Interval* __restrict top_ptr;
float interval_len;
int dummy;
const PeanoCurve_2D* __restrict p1L;
const PeanoCurve_2D* __restrict p2L;
CrossMsg outbound, inbound{};
Interval* __restrict injected;
float cooling;
int T;
float k;
int tag;
textif (N == 1.0f) { initial_length = b - a; dmax = initial_length; threshold_03 = 0.3f * initial_length; inv_threshold_03 = 1.0f / threshold_03; start_val = ShekelFunc(a, seed); best_f = ShekelFunc(b, seed); x_Rmax_1 = a; x_Rmax_2 = b; y_Rmax_1 = start_val; y_Rmax_2 = best_f; Extr.reserve(static_cast<size_t>(global_iterations) * 4u); Extr.clear(); R.reserve(static_cast<size_t>(global_iterations) * 2u); R.clear(); R.emplace_back(new Interval(a, b, start_val, best_f, N)); std::push_heap(R.begin(), R.end(), ComparePtr); Mmax = R.front()->M; m = r * Mmax; while (true) { new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r); new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); promejutochny_otrezok = R.back(); R.pop_back(); new_x1 = promejutochny_otrezok->x1; new_x2 = promejutochny_otrezok->x2; len2 = new_x2 - new_point; len1 = new_point - new_x1; interval_len = (len1 < len2) ? len1 : len2; if (++schetchick == static_cast<int>(global_iterations) || interval_len < epsilon) { if (rank == 0) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, N); curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, N); currM = (curr->M > curr1->M) ? curr->M : curr1->M; r_size = R.size(); if ((len2 + len1) == dmax) { dmax = (len2 > len1) ? len2 : len1; size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
else {
x_Rmax_1 = true_start;
x_Rmax_2 = true_end;
initial_length = true_end - true_start;
dmax = initial_length;
threshold_03 = 0.3f * initial_length;
inv_threshold_03 = 1.0f / threshold_03;
start_val = rank ? RastriginFunc(true_end, d - y_addition)
: RastriginFunc(true_start, c + y_addition);
y_Rmax_1 = start_val;
best_f = rank ? RastriginFunc(true_start, d - y_addition)
: RastriginFunc(true_end, c + y_addition);
y_Rmax_2 = best_f;
Extr.reserve(static_cast<size_t>(global_iterations) * 4u);
Extr.clear();
R.reserve(static_cast<size_t>(global_iterations) * 2u);
R.clear();
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N));
Mmax = R.front()->M;
m = r * Mmax;
while (true) {
if (++schetchick == static_cast<int>(global_iterations)) {
g_world->send(partner, 2, 0);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
cooling = ldexpf(1.0f, -(1.0f / 138.63f) * schetchick);
T = static_cast<int>(fmaf(20.0f, cooling, 10.0f));
k = fmaf(0.2f, cooling, 0.7f);
if (schetchick % (T - 1) == 0) {
probe_status = g_world->iprobe(partner, boost::mpi::any_tag);
if (probe_status) {
tag = probe_status->tag();
if (tag == 0) {
g_world->recv(partner, 0, mcQueenSpeed);
}
else if (tag == 2) {
g_world->recv(partner, 2, dummy);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
}
}
if (!(schetchick % T) == mcQueenSpeed) {
top_ptr = R.front();
p1L = Curve.HitTest_2D(top_ptr->x1);
p2L = Curve.HitTest_2D(top_ptr->x2);
outbound = CrossMsg{ p1L->x1, p1L->x2, p2L->x1, p2L->x2, top_ptr->R };
if (mcQueenSpeed) {
g_world->send(partner, 0, 0);
}
else {
mcQueenSpeed = 1;
}
g_world->send(partner, 1, outbound);
g_world->recv(partner, 1, inbound);
injected = new Interval(Curve.FindX_2D(inbound.s_x1, inbound.s_x2), Curve.FindX_2D(inbound.e_x1, inbound.e_x2), RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), N);
injected->R = inbound.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);
pc = Curve.HitTest_2D(new_point);
new_x1 = pc->x1;
new_x2 = pc->x2;
new_value = RastriginFunc(new_x1, new_x2);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_x1);
Extr.emplace_back(new_x2);
}
std::pop_heap(R.begin(), R.end(), ComparePtr);
promejutochny_otrezok = R.back();
R.pop_back();
curr = new Interval(promejutochny_otrezok->x1, new_point, promejutochny_otrezok->y1, new_value, N);
curr1 = new Interval(new_point, promejutochny_otrezok->x2, new_value, promejutochny_otrezok->y2, N);
currM = (curr->M > curr1->M) ? curr->M : curr1->M;
len2 = promejutochny_otrezok->x2 - new_point;
len1 = new_point - promejutochny_otrezok->x1;
r_size = R.size();
if ((len2 + len1) == dmax) {
dmax = (len2 > len1) ? len2 : len1;
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
interval_len = top_ptr->x2 - top_ptr->x1;
if (interval_len < epsilon) {
g_world->send(partner, 2, 0);
if (partner) {
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void Base_LNA_1_2_Mer_AGP_Free(float* p) noexcept
{
CoTaskMemFree(p);
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->send(1, 0, 1);
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStopWorkers() noexcept
{
g_world->send(1, 0, 2);
}
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int command = 0;
while (true) {
probe_status = g_world->iprobe(0, 0);
if (probe_status) {
g_world->recv(0, 0, command);
}
if (command == 1) {
float* __restrict dummy = nullptr;
size_t dummy_len = 0u;
Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
2.5f, false, 0.001f, GetTickCount(),
&dummy, &dummy_len);
}
else if (command == 2) {
break;
}
}
return;
} в этом коде у меня сейчас проблема не с mpi коммуникацией а в том что при первом нажатии на кнопку всё срабатывает нормально но затем происходит что-то странное и значения кажутся некорректными - мусорорм - думаю это из-за того что нужно обнулять указатели статические после использования - возвращать состояние к корректному как при первом запуске, напоминаю код в MyForm.cpp #include "MyForm.h"
#include <float.h>
using namespace System;
using namespace System::Windows::Forms;
typedef int(__cdecl* PInit)(int, float, float, float, float);
typedef void(__cdecl* PFin)();
typedef void(__cdecl* PStartWorkers)();
[STAThread]
int main() {
HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
auto AgpFinalize = (PFin)GetProcAddress(h, "AgpFinalize");
auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
textconst int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f); if (rank == 0) { Application::EnableVisualStyles(); Application::SetCompatibleTextRenderingDefault(false); Application::Run(gcnew TESTAGP::MyForm(h)); } else { AgpWaitStartAndRun(); } AgpFinalize(); FreeLibrary(h); return 0;
} код в MyForm.h ref class MyForm : public System::Windows::Forms::Form {
public:
MyForm(HMODULE hLib) : hLib(hLib) { // Передаем дескриптор из main
InitializeComponent();
// Загружаем функции из DLL
f = (agp_c)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP");
pStart = (start_workers)GetProcAddress(hLib, "AgpStartWorkers");
pFree = (free_agp)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP_Free");
pStop = (stop_workers)GetProcAddress(hLib, "AgpStopWorkers");
}
textprotected: ~MyForm() { pStop(); delete components; } private: HINSTANCE hLib = nullptr; typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*); typedef void(__cdecl* start_workers)(); typedef void(__cdecl* free_agp)(float*); typedef void(__cdecl* stop_workers)(); agp_c f = nullptr; start_workers pStart = nullptr; free_agp pFree = nullptr; stop_workers pStop = nullptr; System::Windows::Forms::Button^ button1; System::Windows::Forms::TextBox^ textBox2; System::Windows::Forms::DataVisualization::Charting::Chart^ chart2; System::Windows::Forms::Label^ label2; System::Windows::Forms::Label^ label3; System::Windows::Forms::TextBox^ textBox1; System::Windows::Forms::TextBox^ textBox3; System::Windows::Forms::TextBox^ textBox4; System::Windows::Forms::TextBox^ textBox5; System::Windows::Forms::TextBox^ textBox6; System::Windows::Forms::Label^ label6; System::Windows::Forms::Label^ label7; System::Windows::Forms::Label^ label8; System::Windows::Forms::Label^ label9; System::Windows::Forms::Label^ label10; System::Windows::Forms::Label^ label1; System::Windows::Forms::TextBox^ textBox7; System::Windows::Forms::TextBox^ textBox8; System::ComponentModel::Container^ components; System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) { chart2->Series[0]->Points->Clear(); chart2->Series[1]->Points->Clear(); chart2->Series[2]->Points->Clear(); chart2->Series[3]->Points->Clear(); static LARGE_INTEGER start, end; QueryPerformanceCounter(&start); float* buf = nullptr; size_t len = 0u; pStart(); // разбудили rank != 0 f(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &buf, &len); if (buf != nullptr) { std::vector<float> Extr_2D(buf, buf + len); pFree(buf); // Для 2D ветки: извлекаем schetchick и interval_len textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick Extr_2D.pop_back(); textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len Extr_2D.pop_back(); // Извлекаем последнюю тройку (x2, x1, f) float x2 = Extr_2D.back(); Extr_2D.pop_back(); float x1 = Extr_2D.back(); Extr_2D.pop_back(); float f_val = Extr_2D.back(); Extr_2D.pop_back(); // Выводим координаты и значение функции textBox4->Text = Convert::ToString(x1); textBox3->Text = Convert::ToString(x2); textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции // Отображаем точку на графике chart2->Series[2]->Points->AddXY(x1, x2); // Обрабатываем остальные точки (в порядке: f, x1, x2) while (!Extr_2D.empty()) { float x2_point = Extr_2D.back(); Extr_2D.pop_back(); float x1_point = Extr_2D.back(); Extr_2D.pop_back(); float f_point = Extr_2D.back(); Extr_2D.pop_back(); // Добавляем точку на график (только координаты) chart2->Series[0]->Points->AddXY(x1_point, x2_point); } } QueryPerformanceCounter(&end); LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); const int multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL; textBox5->Text = Convert::ToString( ((end.QuadPart - start.QuadPart) * multiplier) >> 32 ) + " microseconds";
Похоже, «мусор» после второго нажатия прилетает не из-за MPI, а из-за вашего собственного аллокатора и общего глобального состояния между запусками.
Коротко, что происходит:
peano_memory_manager
и для построения дерева Пеано (PeanoCurve_2D
), и для рабочих интервалов (Interval
).allocate_from_block()
только сдвигает current
, границ не проверяет и никогда не откатывается.AgpInit()
и больше не трогается. На первом запуске вы израсходовали кусок на дерево и интервалы, на втором запуске начинаете выделять дальше (над областью, где уже ничего не гарантируется). Отсюда – странные/рандомные значения.dummy
, но его не освобождаете — на каждое «Start» течёт память. Это быстро не «ломает» логику, но усугубляет поведение.Ниже — минимальные правки, которые возвращают состояние «как при первом запуске», но не трогают память с деревом.
Идея: запоминаем high-water-mark у каждого MemoryBlock
после строительства дерева; перед каждым Base_LNA_1_2_Mer_AGP()
откатываем current
к этой метке (то есть заново используем область для интервалов), не трогая память с узлами дерева.
cpp// В вашем MemoryBlock: struct MemoryBlock final { char* data; char* current; const char* end; char* mark_after_tree; // <- добавили __forceinline void initialize(size_t bytes) noexcept { const size_t aligned_need = (bytes + 1048638) & ~(1048575); void* p = VirtualAlloc(nullptr, aligned_need, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); data = reinterpret_cast<char*>(p); const uintptr_t base = reinterpret_cast<uintptr_t>(data); const uintptr_t aligned = (base + 63) & ~63; current = reinterpret_cast<char*>(aligned); end = current + bytes; mark_after_tree = current; // изначально метка = начало } __forceinline void mark_tree_done() noexcept { mark_after_tree = current; } __forceinline void reset_to_mark() noexcept { current = mark_after_tree; } __forceinline void* allocate_from_block(size_t alloc_size) noexcept { const size_t n = (alloc_size + 63) & ~63; char* result = current; current += n; // Заодно добавим проверку границы в Debug (в релизе можно убрать/ассертить). #if _DEBUG if (current > end) __debugbreak(); #endif return result; } };
В менеджере добавим удобные обёртки:
cppstruct PreallocatedPeanoMemoryManager { tbb::enumerable_thread_specific<MemoryBlock> tls; void mark_tree_done_for_current_thread() noexcept { tls.local().mark_tree_done(); } void reset_for_current_thread() noexcept { tls.local().reset_to_mark(); } };
Где вызывать:
AgpInit(...)
— после ::new (&Curve) PeanoCurve_2D(...)
:cpp// Дерево построено -> для текущего (главного) потока фиксируем метку. peano_memory_manager.mark_tree_done_for_current_thread();
Не переживайте насчёт TBB: узлы дерева действительно создавались в разных потоках, но интервалы вы создаёте в том же потоке, где зовёте
Base_LNA_1_2_Mer_AGP
(ранг 0). Нам важно откатывать именно локальныйMemoryBlock
этого потока, чтобы итерировать область «под интервалы». Память других TLS-блоков, на которых строились части дерева, мы не трогаем.
Base_LNA_1_2_Mer_AGP(...)
:cpp// Перед каждым запуском алгоритма очищаем «рабочую» арену интервалов: peano_memory_manager.reset_for_current_thread(); // А эти вектора у вас уже очищаются .clear(), что нормально: Extr.clear(); R.clear();
Этого достаточно, чтобы каждый запуск начинался в «чистой» зоне для Interval
, не повреждая дерево.
Альтернатива (тоже рабочая): дать
Interval
свой отдельный аллокатор (второй глобальный менеджер или просто обычныйnew
). Это изолирует «временные» объекты от дерева вообще. Если захотите — подскажу, как быстро вынестиInterval::operator new
на второй пул/наtbb::scalable_allocator
.
В AgpWaitStartAndRun()
вы не освобождаете буфер, возвращаемый из Base_LNA_1_2_Mer_AGP
. Нужно сразу звать экспорт Base_LNA_1_2_Mer_AGP_Free
:
cppelse if (command == 1) { float* __restrict dummy = nullptr; size_t dummy_len = 0; Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &dummy, &dummy_len); if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy); // <-- обязательно }
Без этого при каждом старте ранга ≠ 0 утекает кусок памяти.
В 2D-ветке после первого R.emplace_back(...)
хорошо бы сразу привести R
к куче (для единообразия):
cppR.clear(); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N)); std::push_heap(R.begin(), R.end(), ComparePtr); // можно и так; с одним элементом безопасно
(С одним элементом поведение и так корректное, но это уберёт риск, если кто-то позже вставит код между emplace_back
и первым pop_heap
.)
В условии обменов есть неочевидный приоритет !
:
cpp// Было: if (!(schetchick % T) == mcQueenSpeed) { // Обычно хотят: if ((schetchick % T) != mcQueenSpeed) { }
Если логика у вас именно «раз в T шагов при несоответствии скорости», перепишите как выше. С нынешним вариантом легко попасть в «дребезг» условий.
Shag()
: вы читаете биты diff
через *(const int*)&diff
— это UB по strict-aliasing. Безопаснее:
cppconst uint32_t sign = _mm_cvtss_si32(_mm_set_ss(diff)) & 0x80000000u; const uint32_t bits = 0x3F800000u | (sign ^ 0x80000000u); const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(bits)));
Вы правильно обнуляете график и освобождаете буфер через pFree(buf)
после копирования в std::vector
. В 2D-ветке порядок у вас f, x1, x2
— вы читаете задом-наперед, это корректно.
Если хотите самый простой (и радикально безопасный) вариант — уберите перегруженный operator new
у Interval
и дайте интервалам обычные new/delete
(или tbb::scalable_allocator
). Тогда никаких сбросов вообще не нужно — пул останется только под дерево, и он неизменяем между запусками. Но показанный выше «mark/reset» даёт те же гарантии с минимальными правками.