#include "pch.h" #define LCG_RAND(state) ...

创建于:2025年10月7日

使用 GPT-5 Thinking 回答 Chat01

问题

#include "pch.h"

/*
 * LCG_RAND(state) - advance a float pseudo-random state in place.
 *
 * `state` must be a modifiable float lvalue; it is read once and written once.
 * The value is folded with fmodf, scaled, reduced to its fractional part,
 * pushed through r * (r + 1) and reduced to its fractional part again.
 * For a non-negative input the new state lies in [0, 1).
 *
 * Every line of the body ends with a continuation backslash: without them the
 * #define is empty and the do/while block is left at file scope.
 * The temporary has a macro-private name so that it cannot capture a caller
 * variable passed as `state`.
 */
#define LCG_RAND(state)                                                        \
    do {                                                                       \
        float lcg_r_ = fmodf((state), 1.618033988749895f);                     \
        lcg_r_ = fmaf(lcg_r_, 1.618033988749895f, lcg_r_ * 0.5772156649f);     \
        lcg_r_ -= (float)(int)(lcg_r_);                                        \
        lcg_r_ = fmaf(lcg_r_, lcg_r_ + 1.0f, 0.0f);                            \
        lcg_r_ -= (float)(int)(lcg_r_);                                        \
        (state) = lcg_r_;                                                      \
    } while (0)

/*
 * LCG_RAND_GRSH(state) - same mixing steps as LCG_RAND, but the result is
 * finally remapped with 2 * r - 1, so a non-negative input yields a new state
 * in [-1, 1).
 *
 * `state` must be a modifiable float lvalue. The continuation backslashes are
 * required; without them the macro body is not part of the #define.
 */
#define LCG_RAND_GRSH(state)                                                   \
    do {                                                                       \
        float lcg_r_ = fmodf((state), 1.618033988749895f);                     \
        lcg_r_ = fmaf(lcg_r_, 1.618033988749895f, lcg_r_ * 0.5772156649f);     \
        lcg_r_ -= (float)(int)(lcg_r_);                                        \
        lcg_r_ = fmaf(lcg_r_, lcg_r_ + 1.0f, 0.0f);                            \
        lcg_r_ -= (float)(int)(lcg_r_);                                        \
        (state) = fmaf(lcg_r_, 2.0f, -1.0f);                                   \
    } while (0)

/*
 * FABE13_COS(x, result_var) - low-order polynomial approximation of cos(x).
 *
 * |x| is reduced modulo 2*pi with fmodf, mirrored into [0, pi], and then
 * evaluated with 1 - t/2 + t*t/24 (t = angle squared) either directly
 * (angle < pi/2) or on the supplementary angle with the sign negated.
 * `result_var` must be a modifiable float lvalue; `x` is evaluated once.
 *
 * The continuation backslashes are required; without them the macro body is
 * not part of the #define.
 */
#define FABE13_COS(x, result_var)                                              \
    do {                                                                       \
        float fabe_r_ = fmodf(fabsf((x)), 6.28318530718f);                     \
        if (fabe_r_ > 3.14159265359f) {                                        \
            fabe_r_ = 6.28318530718f - fabe_r_;                                \
        }                                                                      \
        if (fabe_r_ < 1.57079632679f) {                                        \
            const float fabe_t2_ = fabe_r_ * fabe_r_;                          \
            (result_var) = fmaf(fabe_t2_,                                      \
                                fmaf(fabe_t2_, 0.0416666667f, -0.5f), 1.0f);   \
        } else {                                                               \
            fabe_r_ = 3.14159265359f - fabe_r_;                                \
            const float fabe_t2_ = fabe_r_ * fabe_r_;                          \
            (result_var) = -fmaf(fabe_t2_,                                     \
                                 fmaf(fabe_t2_, 0.0416666667f, -0.5f), 1.0f);  \
        }                                                                      \
    } while (0)

/*
 * FABE13_SINCOS(in, sin_out, cos_out, n) - approximate sin and cos of the
 * first n floats of `in`, writing them to `sin_out` and `cos_out`.
 *
 * Full groups of eight elements go through an AVX2/FMA path that uses aligned
 * loads and stores (_mm256_load_ps / _mm256_store_ps), so all three arrays
 * must be 32-byte aligned when n >= 8. The remaining n % 8 elements use a
 * scalar path with the same arithmetic.
 *
 * Range reduction: q = |x| / (2*pi) is rounded to an integer by adding and
 * subtracting 1.5 * 2^23. That rounds to NEAREST, so r = |x| - 2*pi*q lies in
 * [-pi, pi]. The folding below (r > pi, 2*pi - r, ...) expects [0, 2*pi], so a
 * negative residual is first shifted up by 2*pi. Without that step the sine
 * came out with the wrong sign and magnitude for half of every period.
 *
 * Polynomials: cos ~ 1 - t/2 + t*t/24 and sin ~ r * (1 - t/6), t = r*r, on
 * r in [0, pi/2]; signs are restored from the quadrant and the sign of x.
 *
 * The vector constants are plain locals; the original per-expansion
 * `static thread_local` objects held the same values.
 * The continuation backslashes are required; without them the macro body is
 * not part of the #define.
 */
#define FABE13_SINCOS(in, sin_out, cos_out, n)                                          \
    do {                                                                                \
        int i = 0;                                                                      \
        int limit = (n) & ~7;                                                           \
        if ((n) >= 8) {                                                                 \
            const __m256 VEC_TWOPI = _mm256_set1_ps(6.28318530718f);                    \
            const __m256 VEC_PI = _mm256_set1_ps(3.14159265359f);                       \
            const __m256 VEC_PI_2 = _mm256_set1_ps(1.57079632679f);                     \
            const __m256 INV_TWOPI = _mm256_set1_ps(0.15915494309189535f);              \
            const __m256 BIAS = _mm256_set1_ps(12582912.0f);                            \
            const __m256 VEC_COS_P3 = _mm256_set1_ps(0.0416666667f);                    \
            const __m256 VEC_COS_P1 = _mm256_set1_ps(-0.5f);                            \
            const __m256 VEC_COS_P0 = _mm256_set1_ps(1.0f);                             \
            const __m256 VEC_SIN_P1 = _mm256_set1_ps(-0.16666666f);                     \
            const __m256 VEC_SIN_P0 = _mm256_set1_ps(1.0f);                             \
            const __m256 VEC_ZERO = _mm256_setzero_ps();                                \
            while (i < limit) {                                                         \
                __m256 vx = _mm256_load_ps(&(in)[i]);                                   \
                __m256 vax = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), vx);               \
                __m256 q = _mm256_fmadd_ps(vax, INV_TWOPI, BIAS);                       \
                q = _mm256_sub_ps(q, BIAS);                                             \
                __m256 r = _mm256_fnmadd_ps(VEC_TWOPI, q, vax);                         \
                r = _mm256_add_ps(r, _mm256_and_ps(VEC_TWOPI,                           \
                        _mm256_cmp_ps(r, VEC_ZERO, _CMP_LT_OQ)));                       \
                __m256 r1 = _mm256_min_ps(r, _mm256_sub_ps(VEC_TWOPI, r));              \
                __m256 r2 = _mm256_min_ps(r1, _mm256_sub_ps(VEC_PI, r1));               \
                __m256 t2 = _mm256_mul_ps(r2, r2);                                      \
                __m256 cosv = _mm256_fmadd_ps(t2,                                       \
                        _mm256_fmadd_ps(t2, VEC_COS_P3, VEC_COS_P1), VEC_COS_P0);       \
                __m256 sinv = _mm256_mul_ps(                                            \
                        _mm256_fmadd_ps(t2, VEC_SIN_P1, VEC_SIN_P0), r2);               \
                __m256 cflip = _mm256_cmp_ps(r1, VEC_PI_2, _CMP_GT_OQ);                 \
                __m256 sflip = _mm256_xor_ps(                                           \
                        _mm256_cmp_ps(vx, VEC_ZERO, _CMP_LT_OQ),                        \
                        _mm256_cmp_ps(r, VEC_PI, _CMP_GT_OQ));                          \
                cosv = _mm256_blendv_ps(cosv, _mm256_sub_ps(VEC_ZERO, cosv), cflip);    \
                sinv = _mm256_blendv_ps(sinv, _mm256_sub_ps(VEC_ZERO, sinv), sflip);    \
                _mm256_store_ps(&(cos_out)[i], cosv);                                   \
                _mm256_store_ps(&(sin_out)[i], sinv);                                   \
                i += 8;                                                                 \
            }                                                                           \
        }                                                                               \
        while (i < (n)) {                                                               \
            float x = (in)[i];                                                          \
            float ax = fabsf(x);                                                        \
            float q = fmaf(ax, 0.15915494309189535f, 12582912.0f);                      \
            q -= 12582912.0f;                                                           \
            float r = fmaf(-6.28318530718f, q, ax);                                     \
            if (r < 0.0f) r += 6.28318530718f;                                          \
            bool sflip = (r > 3.14159265359f);                                          \
            if (sflip) r = 6.28318530718f - r;                                          \
            bool cflip = (r > 1.57079632679f);                                          \
            if (cflip) r = 3.14159265359f - r;                                          \
            float t2 = r * r;                                                           \
            float c = fmaf(t2, fmaf(t2, 0.0416666667f, -0.5f), 1.0f);                   \
            float s = fmaf(t2, -0.16666666f, 1.0f) * r;                                 \
            (cos_out)[i] = cflip ? -c : c;                                              \
            (sin_out)[i] = ((x < 0.0f) ^ sflip) ? -s : s;                               \
            ++i;                                                                        \
        }                                                                               \
    } while (0)

// One-dimensional test objective: a sum of ten inverted rational peaks whose
// centres and shapes are drawn from the LCG_RAND stream started at `seed`.
// Three generator steps are consumed per peak, in this order: the centre
// (scaled by 10), the curvature weight and the height offset.
// Returns minus the sum of the reciprocals of the ten denominators.
static __declspec(noalias) __forceinline float ShekelFunc(const float x, const float seed) noexcept
{
    float rng = seed;
    float total = 0.0f;
    for (int peak = 0; peak < 10; ++peak) {
        LCG_RAND(rng);
        const float offset = fmaf(-rng, 10.0f, x);   // x - 10 * centre
        LCG_RAND(rng);
        const float weight = rng;
        LCG_RAND(rng);
        float denom = fmaf(fmaf(weight, 20.0f, 5.0f),
                           offset * offset,
                           fmaf(rng, 0.2f, 1.0f));
        // Keep the magnitude at or above FLT_MIN so the division stays finite.
        denom = copysignf(fmaxf(fabsf(denom), FLT_MIN), denom);
        total -= 1.0f / denom;
    }
    return total;
}

// Two-dimensional Rastrigin-style test objective built on FABE13_COS.
// With s = x1^2 + x2^2 it returns
//   (s - (10 * (cos(2*pi*x1) + cos(2*pi*x2)) - 14.6)) * (18.42 - 0.25 * s).
static __declspec(noalias) __forceinline float RastriginFunc(const float x1, const float x2) noexcept
{
    float cos_x1, cos_x2;
    FABE13_COS(6.28318530717958647692f * x1, cos_x1);
    FABE13_COS(6.28318530717958647692f * x2, cos_x2);

    const float radius_sq = fmaf(x1, x1, x2 * x2);
    const float ripple = fmaf(cos_x1 + cos_x2, 10.0f, -14.6f);
    const float envelope = fmaf(-radius_sq, 0.25f, 18.42f);
    return (radius_sq - ripple) * envelope;
}

// One-dimensional test objective: a 14-harmonic trigonometric sum whose
// coefficients come from the LCG_RAND stream started at `seed`.
// Harmonic h (1..14) uses the angle 2*pi*x*h. One generator step produces the
// constant term; two further steps per harmonic produce its sine and cosine
// coefficients, each mapped with 2*u - 1.1. Harmonics are accumulated from
// the highest index down to the lowest.
static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
    static thread_local __declspec(align(64)) float angles[14u];
    static thread_local __declspec(align(64)) float sin_vals[14u];
    static thread_local __declspec(align(64)) float cos_vals[14u];

    const float base_angle = 6.28318530717958647692f * x;
#pragma loop(ivdep)
    for (int h = 0; h < 14; ++h) {
        angles[h] = base_angle * (h + 1);
    }
    FABE13_SINCOS(angles, sin_vals, cos_vals, 14u);

    float rng = seed;
    LCG_RAND(rng);
    float total = fmaf(rng, 2.0f, -1.1f);
    for (int h = 13; h >= 0; --h) {
        LCG_RAND(rng);
        const float sin_draw = rng;
        LCG_RAND(rng);
        total += fmaf(fmaf(sin_draw, 2.0f, -1.1f), sin_vals[h],
                      fmaf(rng, 2.0f, -1.1f) * cos_vals[h]);
    }
    return total;
}

// Two-dimensional test objective built from an 8 x 8 grid of trigonometric
// terms with coefficients taken from the LCG_RAND_GRSH stream started at
// `seed`. For every (j, k) pair it combines sin^2(pi*(j+1)*x1) with
// cos^2(pi*(k+1)*x2) into two running sums and finally returns
// -sqrt(part1^2 + part2^2). Rows are visited from j = 7 down to 0 and, inside
// a row, k runs from 0 to 7; four generator steps are consumed per pair.
static __declspec(noalias) __forceinline float GrishaginFunc(const float x1, const float x2, const float seed) noexcept
{
    static thread_local __declspec(align(64)) float angles_j[8u];
    static thread_local __declspec(align(64)) float angles_k[8u];
    static thread_local __declspec(align(64)) float sin_j[8u], cos_j[8u];
    static thread_local __declspec(align(64)) float sin_k[8u], cos_k[8u];

#pragma loop(ivdep)
    for (int h = 0; h < 8; ++h) {
        const float pj_mult = 3.14159265358979323846f * (h + 1);
        angles_j[h] = pj_mult * x1;
        angles_k[h] = pj_mult * x2;
    }
    FABE13_SINCOS(angles_j, sin_j, cos_j, 8u);
    FABE13_SINCOS(angles_k, sin_k, cos_k, 8u);

    float part1 = 0.0f;
    float part2 = 0.0f;
    float rng = seed;
    for (int row = 7; row >= 0; --row) {
        const float sin_term = sin_j[row] * sin_j[row];
        for (size_t col = 0u; col < 8u; ++col) {
            const float cos_term = cos_k[col] * cos_k[col];
            LCG_RAND_GRSH(rng);
            float side_state = rng;
            LCG_RAND_GRSH(rng);
            part1 = fmaf(side_state, sin_term,
                         fmaf(rng, cos_term, part1));
            // The side stream and the main stream each advance once more.
            LCG_RAND_GRSH(side_state);
            LCG_RAND_GRSH(rng);
            part2 = fmaf(-side_state, cos_term,
                         fmaf(rng, sin_term, part2));
        }
    }
    return -sqrtf(fmaf(part1, part1, part2 * part2));
}

// Computes the next trial point inside the interval [x1, x2].
//
//   _m      current slope estimate used to scale the step
//   x1, x2  interval end points on the search axis
//   y1, y2  objective values at x1 and x2
//   _N      dimension exponent (1 and 2 have dedicated fast paths)
//   _r      reliability parameter, applied as a factor when _N != 1
//
// Result: (x1 + x2) / 2 shifted against the sign of (y2 - y1):
//   _N == 1 : 0.5 * (x1 + x2 - (y2 - y1) / _m)
//   else    : 0.5 * (x1 + x2 - sign(y2 - y1) * _r * |y2 - y1|^_N / _m^_N)
//
// The sign factor is built with copysignf instead of reinterpreting the bits
// of `diff` through an int pointer. It is -1 for diff >= +0 and +1 for
// diff <= -0, exactly like the original bit trick. The general branch raises
// |diff| to the power _N: powf of a negative base is NaN for non-integer
// exponents and, for odd integers, would cancel the sign factor that the
// _N == 2 branch keeps.
static __declspec(noalias) __forceinline float Shag(const float _m, const float x1, const float x2, const float y1,
    const float y2, const float _N, const float _r) noexcept
{
    const float diff = y2 - y1;
    const float sign_mult = copysignf(1.0f, -diff);
    const float ends_sum = x1 + x2;

    if (_N == 1.0f) {
        return fmaf(-(1.0f / _m), diff, ends_sum) * 0.5f;
    }
    if (_N == 2.0f) {
        return fmaf(sign_mult / (_m * _m), diff * diff * _r, ends_sum) * 0.5f;
    }
    return fmaf(sign_mult / powf(_m, _N), powf(fabsf(diff), _N) * _r, ends_sum) * 0.5f;
}

// Per-thread bump allocator backing `operator new` of the curve nodes and the
// search intervals. Every thread that allocates gets its own region from
// VirtualAlloc, sized from a depth-indexed table. Memory handed out is never
// returned individually: each allocation permanently advances the thread's
// cursor by the request rounded up to 64 bytes.
__declspec(align(64)) struct PreallocatedPeanoMemoryManager final {
    // One committed region plus the bump cursor into it.
    __declspec(align(64)) struct MemoryBlock final {
        char* __restrict data;            // base address returned by VirtualAlloc
        char* __restrict current;         // next free byte (64-byte aligned)
        const char* __restrict end;       // current + requested bytes at initialisation
        const char* __restrict hard_end;  // one past the last committed byte

        __declspec(noalias) __forceinline MemoryBlock() noexcept
            : data(nullptr), current(nullptr), end(nullptr), hard_end(nullptr) { }

        // Commits a region large enough for `bytes` (rounded up to a 1 MiB
        // multiple with some slack). On failure the block stays empty and
        // every later allocation from it returns nullptr.
        const __declspec(noalias) __forceinline void initialize(const size_t bytes) noexcept
        {
            const size_t aligned_need = (bytes + 1048638) & ~static_cast<size_t>(1048575);
            void* p = VirtualAlloc(nullptr, aligned_need, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
            if (p == nullptr) {
                data = nullptr;
                current = nullptr;
                end = nullptr;
                hard_end = nullptr;
                return;
            }
            data = static_cast<char*>(p);
            const uintptr_t base = reinterpret_cast<uintptr_t>(data);
            const uintptr_t aligned = (base + 63) & ~static_cast<uintptr_t>(63);
            current = reinterpret_cast<char*>(aligned);
            end = current + bytes;
            hard_end = data + aligned_need;
        }

        // Returns the region to the OS and empties the block.
        __declspec(noalias) __forceinline void release() noexcept
        {
            if (data != nullptr) {
                VirtualFree(data, 0, MEM_RELEASE);
            }
            data = nullptr;
            current = nullptr;
            end = nullptr;
            hard_end = nullptr;
        }

        // Bumps the cursor by alloc_size rounded up to 64 bytes. Returns
        // nullptr instead of handing out memory beyond the committed region.
        __declspec(noalias) __forceinline void* allocate_from_block(const size_t alloc_size) noexcept
        {
            const size_t n = (alloc_size + 63) & ~static_cast<size_t>(63);
            if (current == nullptr || n > static_cast<size_t>(hard_end - current)) {
                return nullptr;
            }
            char* __restrict result = current;
            current += n;
            return result;
        }
    };

    tbb::enumerable_thread_specific<MemoryBlock> tls;

    // Bytes reserved per thread for a tree of the given depth. The table
    // covers depths 7..13; depths outside that range are clamped to the
    // nearest entry instead of indexing out of bounds.
    const __declspec(noalias) __forceinline size_t calculate_tree_size_bytes(const int max_depth) const noexcept
    {
        static const size_t bytes[] = {
            1398080u, 5592384u, 22369600u, 89478464u, 357913920u, 1431655744u, 5726623040u
        };
        const int last = static_cast<int>(sizeof(bytes) / sizeof(bytes[0])) - 1;
        int index = max_depth - 7;
        if (index < 0) index = 0;
        if (index > last) index = last;
        return bytes[index];
    }

    // Releases the regions created so far and installs a factory that lazily
    // creates a fresh region for every thread that allocates afterwards.
    // Everything previously allocated through this manager becomes invalid.
    const __declspec(noalias) __forceinline void preallocate_for_depth(const int max_depth) noexcept
    {
        const size_t required_size = calculate_tree_size_bytes(max_depth);
        for (MemoryBlock& block : tls) {
            block.release();
        }
        tls.clear();
        tls = tbb::enumerable_thread_specific<MemoryBlock>(
            [required_size]() {
                MemoryBlock b;
                b.initialize(required_size);
                return b;
            }
        );
    }

    // Allocates from the calling thread's region; nullptr when it is exhausted.
    __declspec(noalias) __forceinline void* allocate(const size_t size) noexcept
    {
        return tls.local().allocate_from_block(size);
    }
};

// File-wide allocator instance used by the class-specific operator new of
// Interval and PeanoCurve_2D; sized by AgpInit via preallocate_for_depth.
static PreallocatedPeanoMemoryManager peano_memory_manager;

// One search interval [x1, x2] on the 1-D search axis together with the
// objective values at its ends and the quantities derived from them.
// Instances are allocated from peano_memory_manager and never freed
// individually.
__declspec(align(16)) struct Interval final {
    const float x1;        // left end
    const float x2;        // right end
    const float y1;        // objective value at x1
    const float y2;        // objective value at x2
    const float delta_y;   // y2 - y1
    const float N_factor;  // (x2 - x1)^(1/N)
    const float M;         // |delta_y| / N_factor
    float R;               // characteristic; the heap in this file is ordered by it

    __declspec(noalias) __forceinline void* operator new(size_t sz) noexcept
    {
        return peano_memory_manager.allocate(sz);
    }

    // `_N` selects the root applied to the interval length; 1 and 2 avoid
    // the powf call. R starts at 0 until ChangeCharacteristic is called.
    __declspec(noalias) __forceinline Interval(const float _x1, const float _x2, const float _y1, const float _y2, const float _N) noexcept
        : x1(_x1), x2(_x2), y1(_y1), y2(_y2),
          delta_y(_y2 - _y1),
          N_factor(_N == 1.0f ? _x2 - _x1
                   : _N == 2.0f ? sqrtf(_x2 - _x1)
                   : powf(_x2 - _x1, 1.0f / _N)),
          M(fabsf(delta_y) * (1.0f / N_factor)),
          R(0.0f)
    { }

    // Recomputes R for the slope estimate _m:
    //   R = _m * N_factor + delta_y^2 / (_m * N_factor) - 2 * (y1 + y2)
    __declspec(noalias) __forceinline void ChangeCharacteristic(const float _m) noexcept
    {
        R = fmaf(
            -(y2 + y1), 2.0f,
            fmaf(_m, N_factor, (delta_y * delta_y) * (1.0f / (_m * N_factor))));
    }
};

// Heap ordering predicate: true when interval `a` has a smaller characteristic
// R than `b`, so the std heap algorithms keep the largest R at the front.
static __declspec(noalias) __forceinline bool ComparePtr(const Interval* __restrict a, const Interval* __restrict b) noexcept
{
    const float lhs = a->R;
    const float rhs = b->R;
    return lhs < rhs;
}

// Orientation tag of a curve cell; it selects the order in which the four
// quadrants are visited. The leading `const` of the original declaration was
// dropped: a cv-qualifier on an enum definition without a declarator
// qualifies nothing and is rejected by conforming compilers.
enum List : uint8_t {
    Top = 0b00,
    Down = 0b01,
    Left = 0b10,
    Right = 0b11
};

// Node of a quadtree that realises a space-filling curve over the rectangle
// [a, b] x [c, d]. Each node stores its cell centre (x1, x2), its orientation
// and four children; the constructor builds the whole subtree recursively.
// Nodes are allocated from peano_memory_manager and never freed individually.
__declspec(align(16)) struct PeanoCurve_2D final {
    const List Type;       // orientation of this cell
    const int razvertka;   // remaining subdivision depth below this node
    const float a;         // first-axis lower bound
    const float b;         // first-axis upper bound
    const float c;         // second-axis lower bound
    const float d;         // second-axis upper bound
    const float x1;        // cell centre, first axis
    const float x2;        // cell centre, second axis
    PeanoCurve_2D* __restrict DownLeft;
    PeanoCurve_2D* __restrict TopLeft;
    PeanoCurve_2D* __restrict TopRight;
    PeanoCurve_2D* __restrict DownRight;

    __declspec(noalias) __forceinline void* operator new(size_t sz) noexcept
    {
        return peano_memory_manager.allocate(sz);
    }

    // Builds this node and, while depth remains, its four children. The
    // children are created in curve order for the node's orientation. When
    // more than eight levels remain below the children they are built
    // through tbb::parallel_invoke, otherwise sequentially in that order.
    __declspec(noalias) __forceinline PeanoCurve_2D(
        const List _Type, int _razvertka,
        const float _a, const float _b, const float _c, const float _d) noexcept
        : Type(_Type), razvertka(_razvertka), a(_a), b(_b), c(_c), d(_d),
          x1(fmaf(0.5f, _a, 0.5f * _b)), x2(fmaf(0.5f, _c, 0.5f * _d)),
          DownLeft(nullptr), TopLeft(nullptr), TopRight(nullptr), DownRight(nullptr)
    {
        if (_razvertka-- == 0) {
            return;
        }
        const bool in_parallel = _razvertka > 8;

        // Runs the four builders either concurrently or one after another.
        const auto build = [in_parallel](const auto& q0, const auto& q1, const auto& q2, const auto& q3) {
            if (in_parallel) {
                tbb::parallel_invoke(q0, q1, q2, q3);
            }
            else {
                q0(); q1(); q2(); q3();
            }
        };
        // Each factory returns a task that creates one quadrant with orientation t.
        const auto down_left = [&](const List t) {
            return [&, t]() { DownLeft = new PeanoCurve_2D(t, _razvertka, _a, x1, _c, x2); };
        };
        const auto top_left = [&](const List t) {
            return [&, t]() { TopLeft = new PeanoCurve_2D(t, _razvertka, _a, x1, x2, _d); };
        };
        const auto top_right = [&](const List t) {
            return [&, t]() { TopRight = new PeanoCurve_2D(t, _razvertka, x1, _b, x2, _d); };
        };
        const auto down_right = [&](const List t) {
            return [&, t]() { DownRight = new PeanoCurve_2D(t, _razvertka, x1, _b, _c, x2); };
        };

        switch (_Type) {
        case Top:
            build(down_left(Right), top_left(Top), top_right(Top), down_right(Left));
            break;
        case Down:
            build(top_right(Left), down_right(Down), down_left(Down), top_left(Right));
            break;
        case Right:
            build(down_left(Top), down_right(Right), top_right(Right), top_left(Down));
            break;
        default:
            build(top_right(Down), top_left(Left), down_left(Left), down_right(Top));
        }
    }

    // Maps a 1-D curve parameter x in [a, b] to the leaf cell that contains
    // it. At level i the offset inside the current cell selects one of four
    // quadrants (an index of 3 or anything else falls into the last case) and
    // the offset is reduced by that quadrant's start.
    const __declspec(noalias) __forceinline PeanoCurve_2D* __restrict HitTest_2D(float x) const noexcept
    {
        const int depth = this->razvertka;
        const float this_a = this->a;
        const float b_minus_a = this->b - this_a;
        const float inv_b_minus_a = 1.0f / b_minus_a;
        x -= this_a;
        const PeanoCurve_2D* __restrict Curr = this;
        for (int level = 1; level <= depth; ++level) {
            // 4^level; the original spelled this `1 << ++i + i`, which reads
            // and modifies i without sequencing.
            const int shift = 1 << (level << 1);
            const int num = static_cast<int>(shift * x * inv_b_minus_a);
            x = fmaf(-ldexpf(1.0f, -(level << 1)) * num, b_minus_a, x);
            const List currType = Curr->Type;
            switch (num) {
            case 0:
                Curr = (currType == Top || currType == Right) ? Curr->DownLeft : Curr->TopRight;
                break;
            case 1:
                Curr = (currType == Top || currType == Left) ? Curr->TopLeft : Curr->DownRight;
                break;
            case 2:
                Curr = (currType == Top || currType == Right) ? Curr->TopRight : Curr->DownLeft;
                break;
            default:
                Curr = (currType == Top || currType == Left) ? Curr->DownRight : Curr->TopLeft;
            }
        }
        return const_cast<PeanoCurve_2D* __restrict>(Curr);
    }

    // Inverse lookup: descends towards the cell containing
    // (target_x1, target_x2) and accumulates the 1-D parameter of the start of
    // the chosen quadrant at every level. Returns the resulting parameter.
    const __declspec(noalias) __forceinline float FindX_2D(const float target_x1, const float target_x2) const noexcept
    {
        const int depth = this->razvertka;
        float x = this->a;
        const float b_minus_a = this->b - x;
        const PeanoCurve_2D* __restrict Curr = this;
        for (int level = 0; level < depth; ++level) {
            const float cell = ldexpf(1.0f, -(level << 1));   // 4^-level
            const float mid_x1 = Curr->x1;
            const float mid_x2 = Curr->x2;
            const List currType = Curr->Type;
            if (target_x1 > mid_x1 && target_x2 > mid_x2) {
                Curr = Curr->TopRight;
                if (currType == Top || currType == Right) {
                    x = fmaf(cell * 0.5f, b_minus_a, x);
                }
            }
            else if (target_x1 < mid_x1 && target_x2 > mid_x2) {
                Curr = Curr->TopLeft;
                if (currType == Top || currType == Left) {
                    x = fmaf(cell * 0.25f, b_minus_a, x);
                }
                else {
                    x = fmaf(cell * 0.75f, b_minus_a, x);
                }
            }
            else if (target_x1 < mid_x1 && target_x2 < mid_x2) {
                Curr = Curr->DownLeft;
                if (currType == Down || currType == Left) {
                    x = fmaf(cell * 0.5f, b_minus_a, x);
                }
            }
            else {
                Curr = Curr->DownRight;
                if (currType == Top || currType == Left) {
                    x = fmaf(cell * 0.75f, b_minus_a, x);
                }
                else {
                    x = fmaf(cell * 0.25f, b_minus_a, x);
                }
            }
        }
        return x;
    }
};

// Per-thread result buffer filled by Base_LNA_1_2_Mer_AGP.
static thread_local std::vector<float, tbb::scalable_allocator<float>> Extr;
// Per-thread heap of intervals ordered by ComparePtr.
static thread_local std::vector<Interval*, tbb::scalable_allocator<Interval*>> R;
// MPI environment and communicator; created in AgpInit, destroyed in AgpFinalize.
static boost::mpi::environment* g_env;
static boost::mpi::communicator* g_world;
// Raw storage for the root curve node; AgpInit constructs it in place.
static std::aligned_storage_t<sizeof(PeanoCurve_2D), alignof(PeanoCurve_2D)> curveStorage;
static PeanoCurve_2D& Curve = reinterpret_cast<PeanoCurve_2D&>(curveStorage);
// Result of the most recent iprobe; empty when no message was pending.
// The template argument list was lost in the original text.
static boost::optional<boost::mpi::status> probe_status;

// Initialises the library: sizes the allocator for `peanoLevel`, creates the
// MPI environment and communicator, and constructs the root curve node over
// [a, b] x [c, d] in place. Rank 0 builds a Top-oriented curve, every other
// rank a Down-oriented one. Returns the MPI rank of the calling process.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline int AgpInit(const int peanoLevel, const float a, const float b, const float c, const float d) noexcept
{
    peano_memory_manager.preallocate_for_depth(peanoLevel);

    g_env = ::new boost::mpi::environment();
    g_world = ::new boost::mpi::communicator();

    const int rank = g_world->rank();
    const List root_orientation = rank ? List::Down : List::Top;
    ::new (&Curve) PeanoCurve_2D(root_orientation, peanoLevel, a, b, c, d);
    return rank;
}

// Destroys the MPI communicator and environment created by AgpInit, in that
// order. The globals are reset to nullptr afterwards so that a repeated call
// is harmless and no dangling pointer is left behind.
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpFinalize() noexcept
{
    delete g_world;
    g_world = nullptr;
    delete g_env;
    g_env = nullptr;
}

// Message exchanged between the two ranks: the 2-D images of the end points
// of the sender's best interval and that interval's characteristic.
__declspec(align(16)) struct CrossMsg final {
    float s_x1, s_x2;   // 2-D point for the interval start
    float e_x1, e_x2;   // 2-D point for the interval end
    float Rtop;         // characteristic R of the sender's top interval

    // Serialization hook: streams the five fields in declaration order.
    // The archive is taken by plain reference; the original `const` applied
    // to the reference itself is not valid on a reference declarator.
    template <typename Archive>
    __declspec(noalias) __forceinline void serialize(Archive& __restrict ar, const int) noexcept
    {
        ar & s_x1 & s_x2 & e_x1 & e_x2 & Rtop;
    }
};

extern "C" __declspec(dllexport) __declspec(noalias)
void Base_LNA_1_2_Mer_AGP(
const float N, const float global_iterations,
const float a, const float b, const float c, const float d, const float r,
const bool mode, const float epsilon, const float seed,
float** __restrict out_data, size_t* __restrict out_len) noexcept
{
const int rank = g_world->rank();
const int partner = rank ^ 1;
float divider = ldexpf(1.0f, (Curve.razvertka << 1) + 1);
float inv_divider = 1.0f / divider;
float x_addition = (b - a) * inv_divider;
float y_addition = (d - c) * inv_divider;
float true_start = a + x_addition;
float true_end = b - x_addition;
float x_Rmax_1, x_Rmax_2;
float initial_length, dmax, threshold_03, inv_threshold_03;
float start_val, best_f, y_Rmax_1, y_Rmax_2;
float Mmax, m;
int schetchick = 0, mcQueenSpeed = 1;
float new_point, new_value;
const PeanoCurve_2D* __restrict pc;
float new_x1, new_x2;
Interval* __restrict promejutochny_otrezok;
Interval* __restrict curr;
Interval* __restrict curr1;
float currM, len1, len2, len_item;
size_t r_size;
float progress, alpha, betta, MULTIPLIER, global_coeff, GLOBAL_FACTOR;
Interval* __restrict top_ptr;
float interval_len;
int dummy;
const PeanoCurve_2D* __restrict p1L;
const PeanoCurve_2D* __restrict p2L;
CrossMsg outbound, inbound{};
Interval* __restrict injected;
float cooling;
int T;
float k;
int tag;

text
if (N == 1.0f) { initial_length = b - a; dmax = initial_length; threshold_03 = 0.3f * initial_length; inv_threshold_03 = 1.0f / threshold_03; start_val = ShekelFunc(a, seed); best_f = ShekelFunc(b, seed); x_Rmax_1 = a; x_Rmax_2 = b; y_Rmax_1 = start_val; y_Rmax_2 = best_f; Extr.reserve(static_cast<size_t>(global_iterations) * 4u); Extr.clear(); R.reserve(static_cast<size_t>(global_iterations) * 2u); R.clear(); R.emplace_back(new Interval(a, b, start_val, best_f, N)); std::push_heap(R.begin(), R.end(), ComparePtr); Mmax = R.front()->M; m = r * Mmax; while (true) { new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r); new_value = ShekelFunc(new_point, seed); if (new_value < best_f) { best_f = new_value; Extr.emplace_back(best_f); Extr.emplace_back(new_point); } std::pop_heap(R.begin(), R.end(), ComparePtr); promejutochny_otrezok = R.back(); R.pop_back(); new_x1 = promejutochny_otrezok->x1; new_x2 = promejutochny_otrezok->x2; len2 = new_x2 - new_point; len1 = new_point - new_x1; interval_len = (len1 < len2) ? len1 : len2; if (++schetchick == static_cast<int>(global_iterations) || interval_len < epsilon) { if (rank == 0) { Extr.emplace_back(static_cast<float>(schetchick)); Extr.emplace_back(interval_len); *out_len = Extr.size(); *out_data = reinterpret_cast<float* __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len))); memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len)); } return; } curr = new Interval(new_x1, new_point, promejutochny_otrezok->y1, new_value, N); curr1 = new Interval(new_point, new_x2, new_value, promejutochny_otrezok->y2, N); currM = (curr->M > curr1->M) ? curr->M : curr1->M; r_size = R.size(); if ((len2 + len1) == dmax) { dmax = (len2 > len1) ? len2 : len1; size_t i = 0u;

#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
else {
x_Rmax_1 = true_start;
x_Rmax_2 = true_end;
initial_length = true_end - true_start;
dmax = initial_length;
threshold_03 = 0.3f * initial_length;
inv_threshold_03 = 1.0f / threshold_03;
start_val = rank ? RastriginFunc(true_end, d - y_addition)
: RastriginFunc(true_start, c + y_addition);
y_Rmax_1 = start_val;
best_f = rank ? RastriginFunc(true_start, d - y_addition)
: RastriginFunc(true_end, c + y_addition);
y_Rmax_2 = best_f;
Extr.reserve(static_cast<size_t>(global_iterations) * 4u);
Extr.clear();
R.reserve(static_cast<size_t>(global_iterations) * 2u);
R.clear();
R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N));
Mmax = R.front()->M;
m = r * Mmax;
while (true) {
if (++schetchick == static_cast<int>(global_iterations)) {
g_world->send(partner, 2, 0);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
cooling = ldexpf(1.0f, -(1.0f / 138.63f) * schetchick);
T = static_cast<int>(fmaf(20.0f, cooling, 10.0f));
k = fmaf(0.2f, cooling, 0.7f);
if (schetchick % (T - 1) == 0) {
probe_status = g_world->iprobe(partner, boost::mpi::any_tag);
if (probe_status) {
tag = probe_status->tag();
if (tag == 0) {
g_world->recv(partner, 0, mcQueenSpeed);
}
else if (tag == 2) {
g_world->recv(partner, 2, dummy);
if (partner) {
top_ptr = R.front();
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(top_ptr->x2 - top_ptr->x1);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
}
}
if (!(schetchick % T) == mcQueenSpeed) {
top_ptr = R.front();
p1L = Curve.HitTest_2D(top_ptr->x1);
p2L = Curve.HitTest_2D(top_ptr->x2);
outbound = CrossMsg{ p1L->x1, p1L->x2, p2L->x1, p2L->x2, top_ptr->R };
if (mcQueenSpeed) {
g_world->send(partner, 0, 0);
}
else {
mcQueenSpeed = 1;
}
g_world->send(partner, 1, outbound);
g_world->recv(partner, 1, inbound);
injected = new Interval(Curve.FindX_2D(inbound.s_x1, inbound.s_x2), Curve.FindX_2D(inbound.e_x1, inbound.e_x2), RastriginFunc(inbound.s_x1, inbound.s_x2), RastriginFunc(inbound.e_x1, inbound.e_x2), N);
injected->R = inbound.Rtop * k;
R.emplace_back(injected);
std::push_heap(R.begin(), R.end(), ComparePtr);
}
new_point = Shag(m, x_Rmax_1, x_Rmax_2, y_Rmax_1, y_Rmax_2, N, r);
pc = Curve.HitTest_2D(new_point);
new_x1 = pc->x1;
new_x2 = pc->x2;
new_value = RastriginFunc(new_x1, new_x2);
if (new_value < best_f) {
best_f = new_value;
Extr.emplace_back(best_f);
Extr.emplace_back(new_x1);
Extr.emplace_back(new_x2);
}
std::pop_heap(R.begin(), R.end(), ComparePtr);
promejutochny_otrezok = R.back();
R.pop_back();
curr = new Interval(promejutochny_otrezok->x1, new_point, promejutochny_otrezok->y1, new_value, N);
curr1 = new Interval(new_point, promejutochny_otrezok->x2, new_value, promejutochny_otrezok->y2, N);
currM = (curr->M > curr1->M) ? curr->M : curr1->M;
len2 = promejutochny_otrezok->x2 - new_point;
len1 = new_point - promejutochny_otrezok->x1;
r_size = R.size();
if ((len2 + len1) == dmax) {
dmax = (len2 > len1) ? len2 : len1;
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
len_item = R[i]->x2 - R[i]->x1;
if (len_item > dmax) dmax = len_item;
++i;
}
}
if (mode) {
if ((threshold_03 > dmax && fmodf(static_cast<float>(schetchick), 3.0f) == 0.0f) || 10.0f * dmax < initial_length) {
if (currM > Mmax) { Mmax = currM; m = r * Mmax; }
progress = fmaf(-inv_threshold_03, dmax, 1.0f);
alpha = fmaf(progress, progress, 1.0f);
betta = 2.0f - alpha;
MULTIPLIER = (1.0f / dmax) * Mmax;
global_coeff = fmaf(MULTIPLIER, r, -MULTIPLIER);
GLOBAL_FACTOR = betta * global_coeff;
curr->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len1, curr->M * alpha));
curr1->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, len2, curr1->M * alpha));
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) {
R[i]->ChangeCharacteristic(fmaf(GLOBAL_FACTOR, R[i]->x2 - R[i]->x1, R[i]->M * alpha));
++i;
}
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
}
else {
if (currM > Mmax) {
Mmax = currM; m = r * Mmax;
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
size_t i = 0u;
#pragma loop(ivdep)
while (i < r_size) { R[i]->ChangeCharacteristic(m); ++i; }
std::make_heap(R.begin(), R.end(), ComparePtr);
}
else {
curr->ChangeCharacteristic(m);
curr1->ChangeCharacteristic(m);
}
}
R.emplace_back(curr);
std::push_heap(R.begin(), R.end(), ComparePtr);
R.emplace_back(curr1);
std::push_heap(R.begin(), R.end(), ComparePtr);
top_ptr = R.front();
interval_len = top_ptr->x2 - top_ptr->x1;
if (interval_len < epsilon) {
g_world->send(partner, 2, 0);
if (partner) {
Extr.emplace_back(static_cast<float>(schetchick));
Extr.emplace_back(interval_len);
*out_len = Extr.size();
out_data = reinterpret_cast<float __restrict>(CoTaskMemAlloc(sizeof(float) * (*out_len)));
memcpy(*out_data, Extr.data(), sizeof(float) * (*out_len));
}
return;
}
x_Rmax_1 = top_ptr->x1;
x_Rmax_2 = top_ptr->x2;
y_Rmax_1 = top_ptr->y1;
y_Rmax_2 = top_ptr->y2;
}
}
}

// Releases a result buffer returned through out_data by Base_LNA_1_2_Mer_AGP
// (that buffer is obtained from CoTaskMemAlloc).
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void Base_LNA_1_2_Mer_AGP_Free(float* p) noexcept
{
CoTaskMemFree(p);
}

// Sends the integer command 1 to rank 1 on tag 0; AgpWaitStartAndRun on the
// receiving side treats 1 as "run the solver".
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStartWorkers() noexcept
{
g_world->send(1, 0, 1);
}

// Sends the integer command 2 to rank 1 on tag 0; AgpWaitStartAndRun on the
// receiving side treats 2 as "leave the wait loop".
extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpStopWorkers() noexcept
{
g_world->send(1, 0, 2);
}

extern "C" __declspec(dllexport) __declspec(noalias) __forceinline void AgpWaitStartAndRun() noexcept
{
int command = 0;
while (true) {
probe_status = g_world->iprobe(0, 0);
if (probe_status) {
g_world->recv(0, 0, command);
}
if (command == 1) {
float* __restrict dummy = nullptr;
size_t dummy_len = 0u;
Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f,
2.5f, false, 0.001f, GetTickCount(),
&dummy, &dummy_len);
}
else if (command == 2) {
break;
}
}
return;
} в этом коде у меня сейчас проблема не с mpi коммуникацией а в том что при первом нажатии на кнопку всё срабатывает нормально но затем происходит что-то странное и значения кажутся некорректными - мусорорм - думаю это из-за того что нужно обнулять указатели статические после использования - возвращать состояние к корректному как при первом запуске, напоминаю код в MyForm.cpp #include "MyForm.h"
#include <float.h>

using namespace System;
using namespace System::Windows::Forms;

// Function-pointer types for the entry points that main() resolves from the
// DLL with GetProcAddress.
// PInit: used for "AgpInit"; its return value is used as the rank in main().
typedef int(__cdecl* PInit)(int, float, float, float, float);
// PFin: used for "AgpFinalize".
typedef void(__cdecl* PFin)();
// PStartWorkers: used in main() for "AgpWaitStartAndRun".
typedef void(__cdecl* PStartWorkers)();

// Process entry point. Loads TEST_FUNC.dll, initialises the solver library and
// then either shows the form (rank 0) or enters the worker loop (other ranks).
// Returns 1 when the DLL or one of its entry points cannot be resolved,
// otherwise 0.
[STAThread]
int main() {
    HMODULE h = LoadLibraryW(L"TEST_FUNC.dll");
    if (h == nullptr) {
        return 1;
    }
    auto AgpInit = (PInit)GetProcAddress(h, "AgpInit");
    auto AgpFinalize = (PFin)GetProcAddress(h, "AgpFinalize");
    auto AgpWaitStartAndRun = (PStartWorkers)GetProcAddress(h, "AgpWaitStartAndRun");
    if (AgpInit == nullptr || AgpFinalize == nullptr || AgpWaitStartAndRun == nullptr) {
        FreeLibrary(h);
        return 1;
    }

    const int rank = AgpInit(12, -2.2f, 1.8f, -2.2f, 1.8f);
    if (rank == 0) {
        Application::EnableVisualStyles();
        Application::SetCompatibleTextRenderingDefault(false);
        Application::Run(gcnew TESTAGP::MyForm(h));
    }
    else {
        AgpWaitStartAndRun();
    }
    AgpFinalize();
    FreeLibrary(h);
    return 0;
}
код в MyForm.h
ref class MyForm : public System::Windows::Forms::Form {
public:
MyForm(HMODULE hLib) : hLib(hLib) { // Передаем дескриптор из main
InitializeComponent();
// Загружаем функции из DLL
f = (agp_c)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP");
pStart = (start_workers)GetProcAddress(hLib, "AgpStartWorkers");
pFree = (free_agp)GetProcAddress(hLib, "Base_LNA_1_2_Mer_AGP_Free");
pStop = (stop_workers)GetProcAddress(hLib, "AgpStopWorkers");
}

text
protected: ~MyForm() { pStop(); delete components; } private: HINSTANCE hLib = nullptr; typedef void(__cdecl* agp_c)(float, float, float, float, float, float, float, bool, float, float, float**, size_t*); typedef void(__cdecl* start_workers)(); typedef void(__cdecl* free_agp)(float*); typedef void(__cdecl* stop_workers)(); agp_c f = nullptr; start_workers pStart = nullptr; free_agp pFree = nullptr; stop_workers pStop = nullptr; System::Windows::Forms::Button^ button1; System::Windows::Forms::TextBox^ textBox2; System::Windows::Forms::DataVisualization::Charting::Chart^ chart2; System::Windows::Forms::Label^ label2; System::Windows::Forms::Label^ label3; System::Windows::Forms::TextBox^ textBox1; System::Windows::Forms::TextBox^ textBox3; System::Windows::Forms::TextBox^ textBox4; System::Windows::Forms::TextBox^ textBox5; System::Windows::Forms::TextBox^ textBox6; System::Windows::Forms::Label^ label6; System::Windows::Forms::Label^ label7; System::Windows::Forms::Label^ label8; System::Windows::Forms::Label^ label9; System::Windows::Forms::Label^ label10; System::Windows::Forms::Label^ label1; System::Windows::Forms::TextBox^ textBox7; System::Windows::Forms::TextBox^ textBox8; System::ComponentModel::Container^ components; System::Void button1_Click(System::Object^ sender, System::EventArgs^ e) { chart2->Series[0]->Points->Clear(); chart2->Series[1]->Points->Clear(); chart2->Series[2]->Points->Clear(); chart2->Series[3]->Points->Clear(); static LARGE_INTEGER start, end; QueryPerformanceCounter(&start); float* buf = nullptr; size_t len = 0u; pStart(); // разбудили rank != 0 f(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &buf, &len); if (buf != nullptr) { std::vector<float> Extr_2D(buf, buf + len); pFree(buf); // Для 2D ветки: извлекаем schetchick и interval_len textBox7->Text = Convert::ToString(Extr_2D.back()); // schetchick Extr_2D.pop_back(); textBox6->Text = Convert::ToString(Extr_2D.back()); // interval_len Extr_2D.pop_back(); 
// Извлекаем последнюю тройку (x2, x1, f) float x2 = Extr_2D.back(); Extr_2D.pop_back(); float x1 = Extr_2D.back(); Extr_2D.pop_back(); float f_val = Extr_2D.back(); Extr_2D.pop_back(); // Выводим координаты и значение функции textBox4->Text = Convert::ToString(x1); textBox3->Text = Convert::ToString(x2); textBox2->Text = Convert::ToString(f_val); // Новое поле для значения функции // Отображаем точку на графике chart2->Series[2]->Points->AddXY(x1, x2); // Обрабатываем остальные точки (в порядке: f, x1, x2) while (!Extr_2D.empty()) { float x2_point = Extr_2D.back(); Extr_2D.pop_back(); float x1_point = Extr_2D.back(); Extr_2D.pop_back(); float f_point = Extr_2D.back(); Extr_2D.pop_back(); // Добавляем точку на график (только координаты) chart2->Series[0]->Points->AddXY(x1_point, x2_point); } } QueryPerformanceCounter(&end); LARGE_INTEGER freq; QueryPerformanceFrequency(&freq); const int multiplier = ((1ULL << 32) / freq.QuadPart) * 1'000'000ULL; textBox5->Text = Convert::ToString( ((end.QuadPart - start.QuadPart) * multiplier) >> 32 ) + " microseconds";

思考

Identifying memory and calculation issues

I've spotted a memory leak in 'AgpWaitStartAndRun': when 'command==1' triggers, it calls Base_LNA_1_2_Mer_AGP but never frees the returned dummy pointer. The random generator built on LCG_RAND looks fine so far, though it could become a problem if its state is not handled correctly. The last issue is in PreallocatedPeanoMemoryManager::calculate_tree_size_bytes, where an assumption about 'peanoLevel' can lead to out-of-bounds errors.

Addressing memory allocation issues

I’ve identified a problem with memory allocation in the system involving VirtualAlloc. The 'current' pointer can go beyond the reserved memory because the code doesn’t check against the 'end'. This could cause crashes or write garbage. A potential fix might be adding a reset method, but the current setup doesn’t store ‘aligned’ as a member, complicating the reset process.

Addressing memory safety and thread handling

To resolve the memory issue, I’ll save the 'start' pointer as 'begin' and introduce an 'end' pointer to check if the allocation exceeds boundaries. For resetting memory, I can’t easily iterate across threads, but I can handle this by calling a reset function for each thread at the start of its function.

Handling memory management and allocations

The memory manager allocates separate blocks for each thread using tls.local(), which is good. However, resetting memory across all threads is problematic since the tree's allocations happen only once and should remain. I wonder if separating persistent memory pools for the tree and transient memory for intervals will help clarify this situation.

Proposing improvements for memory allocation strategy

The main issue seems to be that memory allocations for trees and intervals are being handled by the same pool, causing corruption when intervals exceed the allocated memory. To fix this, I’d suggest either using separate memory managers for tree nodes and intervals or transitioning to tools like tbb::scalable_allocator for more reliable memory handling per run. Additionally, I noted some other cleanup and memory management tasks like freeing dummy pointers, addressing a potential memory leak, and fixing incorrect bug behavior in the code.

Analyzing code behavior and memory management

The user's issue may stem from the static pointer handling and memory allocation. The problem seems to be with resetting pointers across runs, potentially overwhelming memory due to shared pools. A good starting fix might involve separating memory pools for persistent tree allocations and intervals. Also, a few other bugs could be coming from improper handling of static arrays, thread-local variables, and non-null checks for buffers during the second run. The order of MPI communication should be verified for potential race conditions and deadlocks as well.

Investigating memory and thread allocation

The issue may be related to how memory is allocated across threads and how the partner condition is handled for rank 0. There could also be memory allocation problems when multiple threads try to access large memory pools. Although the calculation for required_size seems correct at first, the cumulative memory usage across threads may not be sufficient to handle all instances. Boundary checks might be needed.

Addressing memory allocation issues

There seems to be confusion with memory allocation across threads and how the program handles the current pointer in memory management. A potential fix could involve altering Interval to use the default allocator, avoiding custom memory pools that might create issues on subsequent runs. Alternatively, memory resetting functions could help keep the allocation strategy efficient.

Identifying memory management issues

The problem seems to stem from how static variables persist across runs, creating inconsistencies when recalculating values like Extr. Missing calls to reset or ensure proper heap structure can also lead to unexpected behavior, especially when std::pop_heap is used on an incomplete heap. A clean reset function and memory manager adjustment could help resolve these issues.

Proposing memory management fixes

The issue lies in persistent global state, particularly with the allocator. Resetting the memory manager before each run and separating different allocators for tree and interval memory will help. I recommend adding reset functionality for thread-local storage, fixing the leaking memory in the worker, and adjusting operator logic for proper conditional behavior. Here's a patch to fix the memory management flow and ensure better memory reuse.

Thought for 5m 25s

回答

Похоже, «мусор» после второго нажатия прилетает не из-за MPI, а из-за вашего собственного аллокатора и общего глобального состояния между запусками.

Коротко, что происходит:

  1. Вы используете один общий пул peano_memory_manager и для построения дерева Пеано (PeanoCurve_2D), и для рабочих интервалов (Interval).
  2. Пул – это тупой bump-allocator: allocate_from_block() только сдвигает current, границ не проверяет и никогда не откатывается.
  3. Память для пула выделяется один раз в AgpInit() и больше не трогается. На первом запуске вы израсходовали кусок на дерево и интервалы, а на втором запуске продолжаете выделять дальше — уже за пределами области, для которой что-либо гарантируется. Отсюда – странные/рандомные значения.
  4. Плюс, в воркере (ранг ≠ 0) вы получаете буфер dummy из Base_LNA_1_2_Mer_AGP, но не освобождаете его — при каждом «Start» течёт память. Логику это сразу не «ломает», но ситуацию усугубляет.

Ниже — минимальные правки, которые возвращают состояние «как при первом запуске», но не трогают память с деревом.


Что именно правим

1) Добавляем «метку» после построения дерева и сброс к этой метке перед каждым запуском

Идея: запоминаем high-water-mark у каждого MemoryBlock после строительства дерева; перед каждым Base_LNA_1_2_Mer_AGP() откатываем current к этой метке (то есть заново используем область для интервалов), не трогая память с узлами дерева.

cpp
// В вашем MemoryBlock: struct MemoryBlock final { char* data; char* current; const char* end; char* mark_after_tree; // <- добавили __forceinline void initialize(size_t bytes) noexcept { const size_t aligned_need = (bytes + 1048638) & ~(1048575); void* p = VirtualAlloc(nullptr, aligned_need, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); data = reinterpret_cast<char*>(p); const uintptr_t base = reinterpret_cast<uintptr_t>(data); const uintptr_t aligned = (base + 63) & ~63; current = reinterpret_cast<char*>(aligned); end = current + bytes; mark_after_tree = current; // изначально метка = начало } __forceinline void mark_tree_done() noexcept { mark_after_tree = current; } __forceinline void reset_to_mark() noexcept { current = mark_after_tree; } __forceinline void* allocate_from_block(size_t alloc_size) noexcept { const size_t n = (alloc_size + 63) & ~63; char* result = current; current += n; // Заодно добавим проверку границы в Debug (в релизе можно убрать/ассертить). #if _DEBUG if (current > end) __debugbreak(); #endif return result; } };

В менеджере добавим удобные обёртки:

cpp
struct PreallocatedPeanoMemoryManager { tbb::enumerable_thread_specific<MemoryBlock> tls; void mark_tree_done_for_current_thread() noexcept { tls.local().mark_tree_done(); } void reset_for_current_thread() noexcept { tls.local().reset_to_mark(); } };

Где вызывать:

  • В AgpInit(...) после ::new (&Curve) PeanoCurve_2D(...):
cpp
// Дерево построено -> для текущего (главного) потока фиксируем метку. peano_memory_manager.mark_tree_done_for_current_thread();

Не переживайте насчёт TBB: узлы дерева действительно создавались в разных потоках, но интервалы вы создаёте в том же потоке, где зовёте Base_LNA_1_2_Mer_AGP (ранг 0). Нам важно откатывать именно локальный MemoryBlock этого потока, чтобы заново использовать область «под интервалы». Память других TLS-блоков, на которых строились части дерева, мы не трогаем.

  • В самом начале Base_LNA_1_2_Mer_AGP(...):
cpp
// Перед каждым запуском алгоритма очищаем «рабочую» арену интервалов: peano_memory_manager.reset_for_current_thread(); // А эти вектора у вас уже очищаются .clear(), что нормально: Extr.clear(); R.clear();

Этого достаточно, чтобы каждый запуск начинался в «чистой» зоне для Interval, не повреждая дерево.

Альтернатива (тоже рабочая): дать Interval свой отдельный аллокатор (второй глобальный менеджер или просто обычный new). Это изолирует «временные» объекты от дерева вообще. Если захотите — подскажу, как быстро вынести Interval::operator new на второй пул/на tbb::scalable_allocator.


2) Починить утечку в воркере

В AgpWaitStartAndRun() вы не освобождаете буфер, возвращаемый из Base_LNA_1_2_Mer_AGP. Нужно сразу звать экспорт Base_LNA_1_2_Mer_AGP_Free:

cpp
else if (command == 1) { float* __restrict dummy = nullptr; size_t dummy_len = 0; Base_LNA_1_2_Mer_AGP(2.0f, 1000.0f, -2.2f, 1.8f, -2.2f, 1.8f, 2.5f, false, 0.001f, GetTickCount(), &dummy, &dummy_len); if (dummy) Base_LNA_1_2_Mer_AGP_Free(dummy); // <-- обязательно }

Без этого при каждом старте ранга ≠ 0 утекает кусок памяти.


3) Небольшие, но полезные точечные фиксы

  • В 2D-ветке после первого R.emplace_back(...) хорошо бы сразу привести R к куче (для единообразия):

    cpp
    R.clear(); R.emplace_back(new Interval(true_start, true_end, start_val, best_f, N)); std::push_heap(R.begin(), R.end(), ComparePtr); // можно и так; с одним элементом безопасно

    (С одним элементом поведение и так корректное, но это уберёт риск, если кто-то позже вставит код между emplace_back и первым pop_heap.)

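  • В условии обменов неочевидно работает оператор !: он применяется к (schetchick % T) раньше сравнения, поэтому с mcQueenSpeed сравнивается уже булево значение, а не остаток от деления: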

    cpp
    // Было: if (!(schetchick % T) == mcQueenSpeed) { // Обычно хотят: if ((schetchick % T) != mcQueenSpeed) { }

    Если логика у вас именно «раз в T шагов при несоответствии скорости», перепишите как выше. С нынешним вариантом легко попасть в «дребезг» условий.

  • Shag(): вы читаете биты diff через *(const int*)&diff — это UB по strict-aliasing. Безопаснее:

    cpp
    const uint32_t sign = _mm_cvtss_si32(_mm_set_ss(diff)) & 0x80000000u; const uint32_t bits = 0x3F800000u | (sign ^ 0x80000000u); const float sign_mult = _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(bits)));

4) В UI всё ок, но порядок распаковки оставьте как есть

Вы правильно очищаете график и освобождаете буфер через pFree(buf) после копирования в std::vector. В 2D-ветке тройки лежат в порядке f, x1, x2, а вы читаете буфер с конца (x2, x1, f) — это корректно.


Что это даёт

  • Каждое нажатие «Старт» начинает расчёт с того же состояния рабочей памяти (интервалы создаём поверх «чистой» арены), дерево остаётся неизменным.
  • Устраняется накопительный эффект от bump-allocator’а и связанные «рандомные» значения.
  • Воркеры не текут по памяти.

Если хотите самый простой (и радикально безопасный) вариант — уберите перегруженный operator new у Interval и дайте интервалам обычные new/delete (или tbb::scalable_allocator). Тогда никаких сбросов вообще не нужно — пул останется только под дерево, и он неизменяем между запусками. Но показанный выше «mark/reset» даёт те же гарантии с минимальными правками.

分享这个问答