#define FABE13_SINCOS(in, sin_out, cos_out, n) \ i...

Criado em: 26 de setembro de 2025

Respondido usando GPT-5 Thinking por Chat01

Pergunta

#define FABE13_SINCOS(in, sin_out, cos_out, n)
int i = 0;
int limit = (n) - ((n) % 16);
if ((n) >= 16) {
const __m512 VEC_TWOPI = _mm512_set1_ps(6.28318530718f);
const __m512 VEC_PI = _mm512_set1_ps(3.14159265359f);
const __m512 VEC_PI_2 = _mm512_set1_ps(1.57079632679f);
const __m512 VEC_COS_P3 = _mm512_set1_ps(0.0416666667f);
const __m512 VEC_COS_P1 = _mm512_set1_ps(-0.5f);
const __m512 VEC_COS_P0 = _mm512_set1_ps(1.0f);
const __m512 VEC_SIN_P1 = _mm512_set1_ps(-0.16666666f);
const __m512 VEC_SIN_P0 = _mm512_set1_ps(1.0f);
const __m512 VEC_ZERO = _mm512_setzero_ps();
while (i < limit) {
__m512 vx = _mm512_load_ps(&(in)[i]);
__m512 vax = _mm512_abs_ps(vx);
__m512 vdiv = _mm512_div_ps(vax, VEC_TWOPI);
__m512 vtrunc = _mm512_roundscale_ps(vdiv, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
__m512 reduced = _mm512_fmsub_ps(VEC_TWOPI, vtrunc, vax);
__mmask16 mask1 = _mm512_cmp_ps_mask(reduced, VEC_PI, _CMP_GT_OS);
__m512 adj1 = _mm512_sub_ps(VEC_TWOPI, reduced);
reduced = _mm512_mask_blend_ps(mask1, reduced, adj1);
__mmask16 mask2 = _mm512_cmp_ps_mask(reduced, VEC_PI_2, _CMP_GT_OS);
__m512 adj2 = _mm512_sub_ps(VEC_PI, reduced);
reduced = _mm512_mask_blend_ps(mask2, reduced, adj2);
__m512 val2 = _mm512_mul_ps(reduced, reduced);
__m512 cos_tmp = _mm512_fmadd_ps(val2, VEC_COS_P3, VEC_COS_P1);
__m512 vcos = _mm512_fmadd_ps(val2, cos_tmp, VEC_COS_P0);
__m512 neg_vcos = _mm512_sub_ps(VEC_ZERO, vcos);
vcos = _mm512_mask_blend_ps(mask2, vcos, neg_vcos);
__m512 sin_tmp = _mm512_fmadd_ps(val2, VEC_SIN_P1, VEC_SIN_P0);
__m512 vsin = _mm512_mul_ps(sin_tmp, reduced);
__mmask16 sign_mask = _mm512_cmp_ps_mask(vx, VEC_ZERO, _CMP_LT_OS);
__m512 neg_vsin = _mm512_sub_ps(VEC_ZERO, vsin);
vsin = _mm512_mask_blend_ps(sign_mask, vsin, neg_vsin);
_mm512_store_ps(&(cos_out)[i], vcos);
_mm512_store_ps(&(sin_out)[i], vsin);
i += 16;
}
}
while (i < (n)) {
float x = (in)[i];
float abs_x = fabsf(x);
float reduced = fmodf(abs_x, 6.28318530718f);
if (reduced > 3.14159265359f) {
reduced = 6.28318530718f - reduced;
}
if (reduced > 1.57079632679f) {
reduced = 3.14159265359f - reduced;
float val2 = reduced * reduced;
(cos_out)[i] = -fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
} else {
float val2 = reduced * reduced;
(cos_out)[i] = fmaf(val2, fmaf(val2, 0.0416666667f, -0.5f), 1.0f);
}
if (x > 0.0f) {
(sin_out)[i] = fmaf(reduced * reduced, -0.16666666f, 1.0f) * reduced;
} else {
(sin_out)[i] = -fmaf(reduced * reduced, -0.16666666f, 1.0f) * reduced;
}
++i; \ static __declspec(noalias) __forceinline float HillFunc(const float x, const float seed) noexcept
{
int64_t j = 0;
__declspec(align(64u)) float angles[25u];
const float start_angle = 6.28318530717958647692f * x;

#pragma loop(ivdep)
while (j < 25) {
angles[j] = start_angle * (j + 1);
++j;
}

text
__declspec(align(64u)) float sin_vals[25u]; __declspec(align(64u)) float cos_vals[25u]; FABE13_SINCOS(angles, sin_vals, cos_vals, 25u); float current_state = seed; LCG_RAND(current_state); float res = fmaf(current_state, 2.0f, -1.1f); --j; while (j >= 0) { LCG_RAND(current_state); float tmp_state = current_state; LCG_RAND(current_state); res += fmaf(fmaf(tmp_state, 2.0f, -1.1f), sin_vals[j], fmaf(current_state, 2.0f, -1.1f) * cos_vals[j]); --j; } return res;

} возникает ошибка именно когда размер массивов 25, когда размер 14, 16, 64 - всё нормально, но при 25 возникает ошибка, сама реализация как скалярная так и векторная корректна - в чём проблема? я вызываю эту функцию хилла в своей экспортируемой из dll функции которая используется в управляемом winforms приложении, управляемое приложение также использует mpi и tbb, я заметил что стоит быть очень осторожным со стеком и следить чтобы он не переполнялся

Pensamento

Exploring stack alignment

The issue seems to lie in stack alignment or calling conventions. The user implements AVX-512 intrinsics in C/C++ with function declarations for arrays, using __declspec(align(64)) for alignment. At 25 elements per array, misalignment can cause issues when arrays exceed stack space. The alignment might be triggering a stack overflow due to MSVC's behavior with local variables.

Stack alignment and AVX-512

The issue may stem from stack alignment combined with AVX-512's need for 64-byte alignment, especially since the macro uses _mm512_load_ps, which requires aligned memory. If angles is 64-aligned, then that should work. However, for arrays like sin_vals and cos_vals that are part of the macro call, things may get misaligned when n=25 because it's not a multiple of 16. The scalar path kicks in for the remaining values beyond 16.

Identifying a macro bug

The macro uses a vectorized loop for n=25, processing 16 elements and then handling the remaining indices scalar. However, there's a bug: after the scalar while loop, the macro is missing a closing backslash or brace due to formatting. The backslash at the end might incorrectly continue the macro, causing unintended code expansion. It's likely just copy-paste formatting, though.

Isolating the error with size 25

The error only occurs when the arrays are size 25, suggesting possible out-of-bounds access. In the scalar code, cos_out[i] is computed inside conditions, including reducing values modulo 2π and adjusting signs. However, there's no sign clamping for cos_out depending on a mask. The logic seems to work earlier, but it's worth checking if everything is correctly handled in the scalar path after modulo reductions.

Investigating the crash at n=25

The user is dealing with a crash occurring only when array size is 25, while other sizes work fine. I think it's not a sign issue, but rather something related to stack alignment. With AVX-512 alignment requirements, there seems to be a mismatch in stack alignment with inline functions—especially when mixed with dynamically allocated memory. It’s likely causing an issue during stack handling. However, further checks are needed, and I’ll need to narrow down the cause.

Investigating potential misalignment issues

The crash only occurs with an array size of 25, suggesting the issue is likely tied to misalignment when using AVX-512. The _mm512_load_ps routine requires 64-byte alignment, and if the stack isn’t properly aligned due to inlining or compiler optimization, it may lead to crashes. It’s possible that a 64-byte alignment isn’t upheld for local arrays when inlined, especially with certain compiler optimizations at play. Further examining alignment handling should help clarify this point.

Investigating potential runtime causes

The issue seems less about misalignment and more about stack overflow or array bounds exceeding in the scalar section of the macro. Although the function has correct low-level optimizations, there's a chance the array handling—particularly around angle folding and sine computation—enters an unintended state or duplicate access under specific conditions, causing runtime instability. Further inspection of array boundary handling could clarify this.

Examining stack and alignment issues

The error at an array size of 25 seems possibly due to stack corruption or issues in function calling conventions, especially when considering how the LCG_RAND macro could be affecting stack alignment. The transition between vector and scalar code may cause minor mishandlings in register or variable alignment, particularly with the sign mask calculation. The real issue might be related to the handling of negative values in the scalar path.

Analyzing memory alignment and rounding

The potential issue could arise from memory alignment—using 64-byte alignment with AVX-512 instructions may cause a crash if the compiler doesn't support this. It might also be due to a mismatch between signed and unsigned variables or issues with the __declspec(align(64)) directive, which could result in misaligned accesses. This only causes problems for certain array sizes but doesn't explain the behavior with sizes 16 and 64.

Diagnosing alignment and memory issues

The issue seems to be with memory alignment. For size 25 arrays, the angles array is not properly aligned to 64 bytes, causing crashes when using aligned loads like _mm512_load_ps. A possible solution is to switch to unaligned loads (_mm512_loadu_ps) or ensure explicit 64-byte alignment using proper memory allocation. This issue may arise due to compiler optimizations for stack alignment.

Identifying alignment issues

It seems that the error could be related to stack array alignment, especially when using aligned loads with _mm512_load_ps. The alignment of local variables on the stack may not be guaranteed in all cases, such as with inlining or P/Invoke. The fix is to use unaligned intrinsics like _mm512_loadu_ps, or to manually ensure alignment with functions like _mm_malloc. Also, make sure arrays are aligned to 64 bytes to prevent misangle events, especially at size 25.

Thought for 3m 55s

Resposta

Похоже, вы ловите разовый «выстрел в ногу» из-за выравнивания стека и использования _mm512_load_ps/_mm512_store_ps (требуют 64-байтового выравнивания адреса). Для n=25 расклад локальных массивов на стеке меняется так, что один из них (или сразу оба) оказывается не 64-выравнен — и на vmovaps zmm, m512 прилетает #GP. Для 14/16/64 адрес по счастливой случайности попадал в кратный 64 байт.

Даже несмотря на __declspec(align(64)), с инлайном (__forceinline), вызовом через экспорт из DLL, TBB/MPI (файбры/нестандартные стеки) и управляемым окружением гарантии «строгого» 64-выравнивания для локалов не всегда соблюдаются. Иногда компилятор не вставляет динамическое выравнивание пролога функции (или делает это у вызывающей, а после инлайна – уже нет), и всё съезжает.

Что сделать, чтобы стабильно не падало

  1. Перейти на невыравненные загрузки/выгрузки. На AVX-512 разницы в пропускной способности vmovups/vmovaps почти нет, а надёжность — 100%.

    cpp
    __m512 vx = _mm512_loadu_ps(&(in)[i]); ... _mm512_storeu_ps(&(cos_out)[i], vcos); _mm512_storeu_ps(&(sin_out)[i], vsin);

    Это самый быстрый и надёжный фикс.

  2. Если принципиально хотите «aligned», не кладите буферы на стек:

    • выделяйте _aligned_malloc(…, 64) / _mm_malloc(…, 64) (и не забывайте free/_mm_free);
    • или храните их как static thread_local alignas(64) (если это безопасно для вашего потока/реентерабельности);
    • уберите __forceinline, чтобы компилятор мог вставить пролог с and rsp, -64.
  3. На время диагностики добавьте assert’ы:

    cpp
    assert((reinterpret_cast<uintptr_t>(angles) & 63) == 0); assert((reinterpret_cast<uintptr_t>(sin_vals) & 63) == 0); assert((reinterpret_cast<uintptr_t>(cos_vals) & 63) == 0);

    Если хоть один срабатывает при n=25 — это ровно ваша проблема.

  4. С TBB/MPI/WinForms держите в уме: нестандартные пулы потоков и файбры → нестандартные размеры/начальное выравнивание стека. Ещё один аргумент в пользу loadu/storeu.


Бонус: маленький баг в скалярном хвосте

В скалярной части знак sin сейчас берётся только по x>0, а не по квадранту, т.е. для углов в (π, 2π) при положительном x вы получите положительный синус (должен быть отрицательный). Это не авария, но ошибка точности. Исправляется учётом квадранта, например:

cpp
float xr = fmodf(abs_x, 6.28318530718f); int q = int(xr / 1.57079632679f); // номер четверти 0..3 xr = fmodf(xr, 1.57079632679f); // редукция в [0, π/2] float s = fmaf(xr*xr, -0.16666666f, 1.0f) * xr; // sin poly float c = fmaf(xr*xr, 0.0416666667f, -0.5f) * (xr*xr) + 1.0f;// cos poly // перекидывание между четвертями switch (q & 3) { case 0: /* s=+s; c=+c; */ break; case 1: { float t=s; s= c; c=-t; } break; case 2: { s=-s; c=-c; } break; case 3: { float t=s; s=-c; c= t; } break; } if (x < 0) s = -s;

Итого

  • Падение при n=25 — практически наверняка misaligned load/store из-за стекового выравнивания.
  • Переключитесь на _mm512_loadu_ps/_mm512_storeu_ps или обеспечьте «жёсткое» 64-выравнивание вне стека.
  • Попутно поправьте знак sin в скалярном хвосте (точность).

Compartilhar esta Q&A