"struct ManipCost final { int n; bool variableLe...

बनाया गया: 1 अप्रैल 2026

Chat01 द्वारा GPT-5.4 Thinking का उपयोग करके उत्तर दिया गया

प्रश्न

"struct ManipCost final {
// Number of joint angles; they occupy q[0..n-1] of every state vector passed to this struct.
int n;
// When true, per-link lengths are read from q[n + i]; otherwise every link uses fixedLength.
bool variableLen;
// fixedLength: link length used when variableLen is false (also the fallback reference length).
// targetX/targetY: point the end effector position is measured against in both objectives.
// minTheta: when > 0, |q[i]| > minTheta is reported as a joint-limit violation.
// archBiasW/archBiasK: weight and slope of the per-joint penalty in the positioning objective.
// stretchFactor, sharpW: stored by the constructor; not read anywhere in the code visible here.
float fixedLength, stretchFactor, targetX, targetY, minTheta, archBiasW, archBiasK, sharpW;
// obstacleCount is clamped to MAX_OBSTACLES by the constructor. constraintMode is a bit mask
// tested against MANIP_CONSTRAINT_OBSTACLES and MANIP_CONSTRAINT_NONLINEAR (constructor sets 0).
unsigned obstacleCount, constraintMode;
// The constructor fills cx, cy, half from obstacleData in groups of three floats; pad stays 0.
SquareObstacle obstacles[MAX_OBSTACLES];
// Extra margin handed to polyline_square_violation; initialised from OBSTACLE_CLEARANCE.
float obstacleClearance;
// Selects which objective evaluate_indexed reports and whether transition constraints apply.
ManipSolveMode solveMode = ManipSolveMode::Positioning;
// Written by SetTransitionReference: angles at [0..n-1], lengths at [n + i], zero padded to 64.
alignas(32) float referenceState[64]{};
// Running sums of the reference angles (written by SetTransitionReference).
alignas(32) float referencePrefixAngles[32]{};
// How many entries / angles / lengths of referenceState came from the caller's reference.
int referenceStateCount = 0, referenceAngleCount = 0, referenceLengthCount = 0;
// transitionCaptureWeight multiplies the squared target distance in the transition objective;
// the three *EnergyWeight values scale the angle, length and prefix energies in TransitionEnergy.
float transitionCaptureWeight = 10000.0f, transitionEnergyWeight = 1.0f,
transitionLengthEnergyWeight = 0.35f, transitionPrefixEnergyWeight = 0.175f,
// Largest allowed |wrapped angle change| per joint, and the factor used by max_length_step().
transitionMaxAngleStep = 0.5f, transitionMaxLengthStepFactor = 0.15f,
// Reference values whose magnitude is at or below these deadbands are skipped by the
// sign-preservation (and, for joints, magnitude-floor) checks.
transitionJointSignDeadband = 0.1f, transitionPrefixSignDeadband = 0.2f,
// min_joint_abs_angle_bound() returns max(Floor, Ratio * |reference angle|).
transitionMinJointAbsAngleRatio = 0.1f, transitionMinJointAbsAngleFloor = 0.0175f;
// Switches for the optional transition constraint groups; each enabled group adds
// referenceAngleCount constraints (see transition_continuity_constraint_count).
bool transitionPreserveJointMagnitudeFloor = true,
transitionPreserveJointSigns = true,
transitionPreservePrefixSigns = true,
// When nonlinear constraints are on, also test every link against the inner circle.
transitionCheckInnerCircleAgainstLinks = true;

text
/// One sampled configuration along a transition: the zero-padded state vector,
/// the end-effector position computed for it, and the smallest geometric
/// clearance found while evaluating its constraints (FLT_MAX until evaluated).
struct TransitionGeomSample final {
    alignas(32) float q[64]{};
    float x = 0.0f;
    float y = 0.0f;
    float clearance = FLT_MAX;
};

private:
/// Writes the running sum of th[I..N-1] into phi[I..N-1], continuing from the
/// value already held in acc; acc is left at the final sum.
/// The bounds are template constants, so the loop has a fixed trip count.
template<int I, int N>
static __forceinline void agp_pose_prefix(const float* __restrict th, float& acc, float* __restrict phi) noexcept {
    for (int k = I; k < N; ++k) {
        acc = fmaf(th[k], 1.0f, acc);
        phi[k] = acc;
    }
}

text
/// Accumulates the end-effector position over links I..N-1 when every link has
/// length fixedLength. s_arr/c_arr hold the sine/cosine of each link's absolute
/// angle. With STORE set, the joint position after link k is written to
/// px[k + 1], py[k + 1]. q is unused here; it is kept so both accumulators
/// share one signature.
template<int I, int N, bool STORE>
__forceinline void agp_pose_accum_fixed(
    const float* __restrict q,
    const float* __restrict s_arr,
    const float* __restrict c_arr,
    float& x, float& y,
    float* __restrict px, float* __restrict py) const noexcept {
    for (int k = I; k < N; ++k) {
        const float Li = fixedLength;
        x = fmaf(Li, c_arr[k], x);
        y = fmaf(Li, s_arr[k], y);
        if constexpr (STORE) {
            px[k + 1] = x;
            py[k + 1] = y;
        }
    }
}

/// Same accumulation as agp_pose_accum_fixed, but the length of link k is read
/// from the state vector at q[n + k].
template<int I, int N, bool STORE>
__forceinline void agp_pose_accum_var(
    const float* __restrict q,
    const float* __restrict s_arr,
    const float* __restrict c_arr,
    float& x, float& y,
    float* __restrict px, float* __restrict py) const noexcept {
    for (int k = I; k < N; ++k) {
        const float Li = q[n + k];
        x = fmaf(Li, c_arr[k], x);
        y = fmaf(Li, s_arr[k], y);
        if constexpr (STORE) {
            px[k + 1] = x;
            py[k + 1] = y;
        }
    }
}

/// Forward kinematics for a chain of exactly N links (N <= 16, the size of the
/// local scratch arrays): prefix-sums the joint angles, evaluates sine/cosine
/// through FABE13_SINCOS, then accumulates the link vectors.
/// VARLEN selects per-link lengths from q; STORE additionally records every
/// joint position, starting with the origin in px[0], py[0].
template<int N, bool VARLEN, bool STORE>
__forceinline void agp_compute_pose_fixedn(
    const float* __restrict q,
    float& out_x, float& out_y,
    float* __restrict px, float* __restrict py) const noexcept {
    alignas(32) float phi[16], s_arr[16], c_arr[16];
    float phi_acc = 0.0f;
    if constexpr (STORE) {
        px[0] = 0.0f;
        py[0] = 0.0f;
    }
    agp_pose_prefix<0, N>(q, phi_acc, phi);
    FABE13_SINCOS(phi, s_arr, c_arr, N);
    float x = 0.0f, y = 0.0f;
    if constexpr (VARLEN) {
        agp_pose_accum_var<0, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    } else {
        agp_pose_accum_fixed<0, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    }
    out_x = x;
    out_y = y;
}

public:
__declspec(noalias) __forceinline void compute_pose(const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept {
const bool store = (px != nullptr) & (py != nullptr);
if (n <= 0) {
if (store) { px[0] = 0.0f; py[0] = 0.0f; }
out_x = out_y = 0.0f;
return;
}
if (static_cast<unsigned>(n) <= 16u) {
if (variableLen) {
if (store) {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, true, true>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, true, true>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, true, true>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, true, true>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, true, true>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, true, true>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, true, true>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, true, true>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, true, true>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, true, true>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, true, true>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, true, true>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, true, true>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, true, true>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, true, true>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, true, true>(q, out_x, out_y, px, py); return;
}
}
else {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, true, false>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, true, false>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, true, false>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, true, false>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, true, false>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, true, false>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, true, false>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, true, false>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, true, false>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, true, false>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, true, false>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, true, false>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, true, false>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, true, false>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, true, false>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, true, false>(q, out_x, out_y, px, py); return;
}
}
}
else {
if (store) {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, false, true>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, false, true>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, false, true>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, false, true>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, false, true>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, false, true>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, false, true>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, false, true>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, false, true>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, false, true>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, false, true>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, false, true>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, false, true>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, false, true>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, false, true>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, false, true>(q, out_x, out_y, px, py); return;
}
}
else {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, false, false>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, false, false>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, false, false>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, false, false>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, false, false>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, false, false>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, false, false>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, false, false>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, false, false>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, false, false>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, false, false>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, false, false>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, false, false>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, false, false>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, false, false>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, false, false>(q, out_x, out_y, px, py); return;
}
}
}
}

text
const float* __restrict th = q; alignas(32) float phi[32], s_arr[32], c_arr[32]; float x = 0.0f, y = 0.0f, phi_acc = 0.0f; if (store) { px[0] = 0.0f; py[0] = 0.0f; } int i = 0; __pragma(loop(ivdep)) while (i < n) { phi_acc = fmaf(th[i], 1.0f, phi_acc); phi[i] = phi_acc; ++i; } FABE13_SINCOS(phi, s_arr, c_arr, n); i = 0; __pragma(loop(ivdep)) while (i < n) { const float Li = variableLen ? q[n + i] : fixedLength; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); if (store) { px[i + 1] = x; py[i + 1] = y; } ++i; } out_x = x; out_y = y; } __declspec(noalias) __forceinline void copy_state_full_dim(const float* __restrict src, float* __restrict dst) const noexcept { const int dim = full_state_dim(); int i = 0; __pragma(loop(ivdep)) while (i + 8 <= dim) { __m256 v = _mm256_load_ps(src + i); _mm256_store_ps(dst + i, v); i += 8; } __pragma(loop(ivdep)) while (i + 4 <= dim) { __m128 v = _mm_load_ps(src + i); _mm_store_ps(dst + i, v); i += 4; } while (i < dim) { dst[i] = src[i]; ++i; } static __declspec(align(32)) const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= 64) { _mm256_store_ps(dst + i, vz); i += 8; } while (i < 64) { dst[i] = 0.0f; ++i; } } __declspec(noalias) __forceinline void lerp_state_full_dim(const float* __restrict qa, const float* __restrict qb, float t, float* __restrict out_q) const noexcept { const int dim = full_state_dim(); const __m256 vt = _mm256_set1_ps(t); int i = 0; __pragma(loop(ivdep)) while (i + 8 <= dim) { const __m256 va = _mm256_load_ps(qa + i); const __m256 vb = _mm256_load_ps(qb + i); _mm256_store_ps(out_q + i, _mm256_fmadd_ps(vt, _mm256_sub_ps(vb, va), va)); i += 8; } while (i < dim) { out_q[i] = fmaf(t, fmaf(-qa[i], 1.0f, qb[i]), qa[i]); ++i; } static __declspec(align(32)) const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= 64) { _mm256_store_ps(out_q + i, vz); i += 8; } while (i < 64) { out_q[i] = 0.0f; ++i; } } static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept { constexpr float PI = 
3.1415926535897932384626433832795f, TWO_PI = 6.2831853071795864769252867665590f; float k = floorf((a + PI) / TWO_PI); a -= k * TWO_PI; if (a > PI) a -= TWO_PI; else if (a < -PI) a += TWO_PI; return a; } static __declspec(noalias) __forceinline float point_segment_distance_sq( float px, float py, float ax, float ay, float bx, float by) noexcept { float abx = bx - ax, aby = by - ay, apx = px - ax, apy = py - ay, ab2 = fmaf(abx, abx, aby * aby); if (ab2 <= 1e-20f) return fmaf(apx, apx, apy * apy); float t = (apx * abx + apy * aby) / ab2; if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f; float qx = fmaf(t, abx, ax), qy = fmaf(t, aby, ay), dx = px - qx, dy = py - qy; return fmaf(dx, dx, dy * dy); } static __declspec(noalias) __forceinline float polyline_circle_violation( const float* __restrict px, const float* __restrict py, int segments, float cx, float cy, float radius, float clearance) noexcept { float rr = radius + clearance, best_d2 = FLT_MAX; int i = 0; while (i < segments) { float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]); if (d2 < best_d2) best_d2 = d2; if (best_d2 <= rr * rr) break; ++i; } return rr - sqrtf(best_d2); } __forceinline bool has_transition_reference_angles() const noexcept { return referenceAngleCount > 0; } __forceinline bool has_transition_reference_lengths() const noexcept { return variableLen && (referenceLengthCount > 0); } __forceinline bool use_joint_angle_limit_constraints() const noexcept { return minTheta > 0.0f; } __forceinline int full_state_dim() const noexcept { return n + (variableLen ? n : 0); } __forceinline float reference_angle(int i) const noexcept { return (i < referenceAngleCount) ? referenceState[i] : 0.0f; } __forceinline float reference_length(int i) const noexcept { return (has_transition_reference_lengths() && i < referenceLengthCount) ? 
referenceState[n + i] : fixedLength; } __forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept { return wrap_pi(q[i] - reference_angle(i)); } __forceinline float max_length_step(int i) const noexcept { float refL = reference_length(i), absStep = transitionMaxLengthStepFactor * fmaxf(refL, 0.5f); return fmaxf(absStep, 0.05f); } __forceinline float min_joint_abs_angle_bound(int i) const noexcept { float refAbs = fabsf(reference_angle(i)), scaled = transitionMinJointAbsAngleRatio * refAbs; return fmaxf(transitionMinJointAbsAngleFloor, scaled); } __forceinline unsigned transition_continuity_constraint_count() const noexcept { unsigned total = 0u; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { total += static_cast<unsigned>(referenceAngleCount); if (has_transition_reference_lengths()) total += static_cast<unsigned>(referenceLengthCount); if (transitionPreserveJointMagnitudeFloor) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreserveJointSigns) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreservePrefixSigns) total += static_cast<unsigned>(referenceAngleCount); } return total; } __forceinline unsigned joint_angle_limit_constraint_count() const noexcept { return use_joint_angle_limit_constraints() ? 
static_cast<unsigned>(n) : 0u; } __forceinline unsigned pose_constraint_base_index() const noexcept { return transition_continuity_constraint_count() + joint_angle_limit_constraint_count(); } static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept { float dx = x - 5.0f; return 5.0f - sqrtf(fmaf(dx, dx, y * y)); } static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept { float dx = x - 8.0f, dy = y + 3.0f; return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f); } __declspec(noalias) __forceinline void SetTransitionReference(const float* src, int count) noexcept { int dim = full_state_dim(); referenceStateCount = (count < dim) ? count : dim; referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n; referenceLengthCount = variableLen ? ((referenceStateCount > n) ? (referenceStateCount - n) : 0) : 0; int i = 0; while (i < dim) { if (src != nullptr && i < referenceStateCount) referenceState[i] = src[i]; else if (i < n) referenceState[i] = 0.0f; else referenceState[i] = fixedLength; ++i; } while (i < 64) { referenceState[i] = 0.0f; ++i; } float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += referenceState[i]; referencePrefixAngles[i] = phi_acc; ++i; } while (i < n) { referencePrefixAngles[i] = 0.0f; ++i; } } __forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept { float acc = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_lengths()) return 0.0f; float acc = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < referenceLengthCount) { float d = q[n + i] - referenceState[n + i]; acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept { if 
(!has_transition_reference_angles()) return 0.0f; float acc = 0.0f, phi_acc = 0.0f; int i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float d = wrap_pi(phi_acc - referencePrefixAngles[i]); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionEnergy(const float* __restrict q) const noexcept { float acc = transitionEnergyWeight * TransitionAngleEnergy(q); acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc); acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc); return acc; } __declspec(noalias) __forceinline ManipCost( int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta, float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept : n(_n), variableLen(_variableLen), fixedLength(_fixedLength), stretchFactor(_stretchFactor), targetX(_targetX), targetY(_targetY), minTheta(_minTheta), archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f), obstacleCount((_obstacleCount > MAX_OBSTACLES) ? MAX_OBSTACLES : _obstacleCount), constraintMode(0u), obstacleClearance(OBSTACLE_CLEARANCE) { unsigned i = 0u; while (i < MAX_OBSTACLES) { obstacles[i].cx = obstacles[i].cy = obstacles[i].half = obstacles[i].pad = 0.0f; ++i; } if (obstacleData) { i = 0u; while (i < obstacleCount) { obstacles[i].cx = obstacleData[3u * i + 0u]; obstacles[i].cy = obstacleData[3u * i + 1u]; obstacles[i].half = obstacleData[3u * i + 2u]; ++i; } } SetTransitionReference(nullptr, 0); } __declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept { return variableLen ? 
q[n + i] : fixedLength; } __declspec(noalias) __forceinline float compute_positioning_objective_from_pose(const float* __restrict q, float x, float y) const noexcept { const float* __restrict th = q; float archPen = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < n) { float theta = th[i], t = fmaf(-theta, archBiasK, 0.0f), sp; if (t > 10.0f) sp = t; else { float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); sp = log1pf(exp_val); } archPen = fmaf(archBiasW, sp, archPen); ++i; } float dx = fmaf(x, 1.0f, -targetX), dy = fmaf(y, 1.0f, -targetY), dist = sqrtf(fmaf(dx, dx, dy * dy)); return fmaf(dist, 1.0f, archPen); } __declspec(noalias) __forceinline float compute_transition_objective_from_pose(const float* __restrict q, float x, float y) const noexcept { float dx = x - targetX, dy = y - targetY, dist2 = fmaf(dx, dx, dy * dy); return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q)); } __declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept { return ((constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u) && (obstacleCount > 0u); } __declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept { return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u; } __declspec(noalias) __forceinline unsigned total_constraints() const noexcept { unsigned total = transition_continuity_constraint_count(); total += joint_angle_limit_constraint_count(); if (use_obstacle_constraints()) total += obstacleCount; if (use_nonlinear_constraints()) total += transitionCheckInnerCircleAgainstLinks ? 
3u : 2u; return total; } __declspec(noalias) __forceinline unsigned feasible_index() const noexcept { return total_constraints(); } static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept { float dx = x - 5.0f; return dx * dx + y * y - 25.0f; } static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept { float dx = x - 8.0f, dy = y + 3.0f; return 7.7f - (dx * dx + dy * dy); } __declspec(noalias) __forceinline bool evaluate_transition_continuity_only( const float* __restrict q, unsigned& out_index, float& out_value) const noexcept { unsigned idx = 0u; if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles()) return true; int i = 0; while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i), viol = fabsf(d) - transitionMaxAngleStep; if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } ++idx; ++i; } if (has_transition_reference_lengths()) { i = 0; while (i < referenceLengthCount) { float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i); if (dL > 0.0f) { out_index = idx; out_value = dL; return false; } ++idx; ++i; } } if (transitionPreserveJointMagnitudeFloor) { i = 0; while (i < referenceAngleCount) { float refAbs = fabsf(referenceState[i]); if (refAbs > transitionJointSignDeadband) { float minAbs = min_joint_abs_angle_bound(i), viol = minAbs - fabsf(q[i]); if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } } ++idx; ++i; } } if (transitionPreserveJointSigns) { i = 0; while (i < referenceAngleCount) { float ref = referenceState[i]; if (fabsf(ref) > transitionJointSignDeadband) { float prod = q[i] * ref; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } if (transitionPreservePrefixSigns) { float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float refPhi = referencePrefixAngles[i]; if (fabsf(refPhi) > transitionPrefixSignDeadband) { float 
prod = phi_acc * refPhi; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } return true; } __declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only( const float* __restrict q, unsigned base_index, unsigned& out_index, float& out_value) const noexcept { if (!use_joint_angle_limit_constraints()) return true; int i = 0; while (i < n) { float viol = fabsf(q[i]) - minTheta; if (viol > 0.0f) { out_index = base_index + static_cast<unsigned>(i); out_value = viol; return false; } ++i; } return true; } __declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays( const float* __restrict px, const float* __restrict py, float x, float y, unsigned base_index, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { float geom_clearance = FLT_MAX; unsigned idx = base_index; if (use_obstacle_constraints()) { unsigned j = 0u; while (j < obstacleCount) { float viol = polyline_square_violation(px, py, n, obstacles[j], obstacleClearance), clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; ++j; } } if (use_nonlinear_constraints()) { if (transitionCheckInnerCircleAgainstLinks) { float viol = polyline_circle_violation(px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f), clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; } float c0 = nonlinear_constraint_0(x, y), c0_clearance = outer_circle_clearance(x, y); if (c0_clearance < geom_clearance) geom_clearance = c0_clearance; if (c0 > 0.0f) { out_index = idx; out_value = c0; if (out_geom_clearance) *out_geom_clearance = c0_clearance; return false; } ++idx; float c1 = nonlinear_constraint_1(x, y), c1_clearance = 
inner_circle_clearance(x, y); if (c1_clearance < geom_clearance) geom_clearance = c1_clearance; if (c1 > 0.0f) { out_index = idx; out_value = c1; if (out_geom_clearance) *out_geom_clearance = c1_clearance; return false; } ++idx; } if (out_geom_clearance) *out_geom_clearance = geom_clearance; return true; } __declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) { if (out_geom_clearance) *out_geom_clearance = -FLT_MAX; return false; } alignas(32) float px[33], py[33]; compute_pose(q, out_x, out_y, px, py); if (!evaluate_pose_constraints_from_arrays(px, py, out_x, out_y, pose_constraint_base_index(), out_index, out_value, out_geom_clearance)) return false; return true; } __declspec(noalias) __forceinline float transition_segment_motion_bound(const float* __restrict qa, const float* __restrict qb) const noexcept { float tail_reach = 0.0f, bound = 0.0f; int i = n - 1; while (i >= 0) { tail_reach += fmaxf(link_length(qa, i), link_length(qb, i)); bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound); --i; } if (variableLen) { i = 0; __pragma(loop(ivdep)) while (i < n) { bound = fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound); ++i; } } return bound; } __forceinline bool transition_sweep_requires_validation() const noexcept { return (solveMode == ManipSolveMode::Transition) && has_transition_reference_angles() && (use_obstacle_constraints() || use_nonlinear_constraints()); } __declspec(noalias) __forceinline bool build_transition_geom_sample( const float* __restrict q, TransitionGeomSample& out_sample, unsigned& out_index, float& out_value) const noexcept { copy_state_full_dim(q, out_sample.q); return evaluate_state_without_transition_continuity( out_sample.q, out_sample.x, out_sample.y, 
out_index, out_value, &out_sample.clearance); } __declspec(noalias) __forceinline bool validate_transition_segment_recursive( const TransitionGeomSample& a, const TransitionGeomSample& b, int depth, unsigned& out_index, float& out_value) const noexcept { constexpr float SWEEP_CLEARANCE_EPS = 1e-4f, SWEEP_MIN_BOUND = 5e-4f; float motion_bound = transition_segment_motion_bound(a.q, b.q); if (motion_bound <= SWEEP_CLEARANCE_EPS) return true; float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > required_clearance && b.clearance > required_clearance) return true; TransitionGeomSample mid{}; lerp_state_full_dim(a.q, b.q, 0.5f, mid.q); if (!evaluate_state_without_transition_continuity(mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance)) return false; float half_required_clearance = 0.5f * motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > half_required_clearance && mid.clearance > half_required_clearance && b.clearance > half_required_clearance) return true; if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) { TransitionGeomSample q25{}, q75{}; lerp_state_full_dim(a.q, b.q, 0.25f, q25.q); if (!evaluate_state_without_transition_continuity(q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance)) return false; lerp_state_full_dim(a.q, b.q, 0.75f, q75.q); if (!evaluate_state_without_transition_continuity(q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance)) return false; return true; } return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) && validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_transition_swept_motion( const float* __restrict q, float final_x, float final_y, float final_geom_clearance, unsigned& out_index, float& out_value) const noexcept { if (!transition_sweep_requires_validation()) return true; TransitionGeomSample start{}; if (!build_transition_geom_sample(referenceState, start, out_index, out_value)) 
return false; TransitionGeomSample finish{}; copy_state_full_dim(q, finish.q); finish.x = final_x; finish.y = final_y; finish.clearance = final_geom_clearance; constexpr int SWEEP_MAX_DEPTH = 10; return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_indexed( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value) const noexcept { if (!evaluate_transition_continuity_only(q, out_index, out_value)) return false; float final_geom_clearance = FLT_MAX; if (!evaluate_state_without_transition_continuity(q, out_x, out_y, out_index, out_value, &final_geom_clearance)) return false; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { if (!transition_sweep_requires_validation()) { if (!evaluate_joint_angle_limits_only(referenceState, transition_continuity_constraint_count(), out_index, out_value)) return false; } else if (!evaluate_transition_swept_motion(q, out_x, out_y, final_geom_clearance, out_index, out_value)) return false; } out_index = feasible_index(); if (solveMode == ManipSolveMode::Transition) { out_value = compute_transition_objective_from_pose(q, out_x, out_y); return true; } out_value = compute_positioning_objective_from_pose(q, out_x, out_y); return true; }

};" - новая версия, старая версия: "struct ManipCost final {
// Number of links in the chain. The state vector q stores n joint angles in q[0..n-1]
// (see compute_pose) and, when variableLen is set, n link lengths in q[n..2n-1].
int n;
// When true, link_length() reads the length of link i from q[n + i];
// otherwise every link uses fixedLength.
bool variableLen;
// Link length used when variableLen is false; also the fallback reference length.
float fixedLength;
// Stored by the constructor; not read by any code visible in this excerpt.
float stretchFactor;
// Target point: both objectives measure the distance from the end point (x, y) to it.
float targetX;
float targetY;
// Joint-angle limit: evaluate_joint_angle_limits_only() reports a violation when
// |q[i]| exceeds this value; the check is enabled only when minTheta > 0.
float minTheta;
// Weight of the per-joint bias term in compute_positioning_objective_from_pose().
float archBiasW;
// Scale applied to -theta before the bias term is evaluated.
float archBiasK;
// Set to 0.05f by the constructor; not read by any code visible in this excerpt.
float sharpW;
// Number of valid entries in obstacles[]; the constructor clamps it to MAX_OBSTACLES.
unsigned obstacleCount;
// Bit mask tested against MANIP_CONSTRAINT_OBSTACLES and MANIP_CONSTRAINT_NONLINEAR.
unsigned constraintMode;
// Obstacles filled by the constructor from (cx, cy, half) triples.
SquareObstacle obstacles[MAX_OBSTACLES];
// Clearance passed to polyline_square_violation(); initialised to OBSTACLE_CLEARANCE.
float obstacleClearance;
// Selects the positioning objective or the transition objective and its extra constraints.
ManipSolveMode solveMode = ManipSolveMode::Positioning;

text
// ---- Transition-mode reference state and tuning parameters ----

// Reference state filled by SetTransitionReference(): angles in [0, n), then link
// lengths starting at index n.
alignas(16) float referenceState[64]{};
// Running sums of the reference angles, filled by SetTransitionReference().
alignas(16) float referencePrefixAngles[32]{};
// Counts derived in SetTransitionReference(): copied entries, angle entries, length entries.
int referenceStateCount = 0;
int referenceAngleCount = 0;
int referenceLengthCount = 0;
// Weight of the squared target distance in compute_transition_objective_from_pose().
float transitionCaptureWeight = 20000.0f;
// Weights of the angle, length and prefix-angle terms summed in TransitionEnergy().
float transitionEnergyWeight = 1.0f;
float transitionLengthEnergyWeight = 0.35f;
float transitionPrefixEnergyWeight = 0.18f;
// Largest allowed |wrapped angle delta| between q[i] and the reference angle.
float transitionMaxAngleStep = 0.40f;
// Scales the allowed per-link length change (see max_length_step()).
float transitionMaxLengthStepFactor = 0.12f;
// Reference joint angles with |value| at or below this are exempt from the
// magnitude-floor and sign checks in evaluate_transition_continuity_only().
float transitionJointSignDeadband = 0.05f;
// Same exemption threshold, applied to the reference prefix angles.
float transitionPrefixSignDeadband = 0.08f;
// min_joint_abs_angle_bound() returns max(Floor, Ratio * |reference angle|).
float transitionMinJointAbsAngleRatio = 0.25f;
float transitionMinJointAbsAngleFloor = 0.05f;
// Switches for the optional constraint groups; each enabled group contributes
// referenceAngleCount indices in transition_continuity_constraint_count().
bool transitionPreserveJointMagnitudeFloor = true;
bool transitionPreserveJointSigns = true;
bool transitionPreservePrefixSigns = true;
// When true, the inner circle is also tested against every link segment, which adds
// one more constraint index (3 instead of 2 nonlinear constraints).
bool transitionCheckInnerCircleAgainstLinks = true;

// One sampled state along a transition: the state vector, its end point, and the
// smallest geometric clearance reported by the pose constraints.
struct TransitionGeomSample final {
    alignas(16) float q[64]{};
    float x = 0.0f;
    float y = 0.0f;
    float clearance = FLT_MAX;
};

// Wraps an angle by subtracting floor((a + PI) / TWO_PI) full turns; the two trailing
// branches correct results that still land outside [-PI, PI].
static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept {
    constexpr float PI = 3.1415926535897932384626433832795f;
    constexpr float TWO_PI = 6.2831853071795864769252867665590f;
    float k = floorf((a + PI) / TWO_PI);
    a -= k * TWO_PI;
    if (a > PI)
        a -= TWO_PI;
    else if (a < -PI)
        a += TWO_PI;
    return a;
}

// Squared distance from point (px, py) to the segment (ax, ay)-(bx, by).
static __declspec(noalias) __forceinline float point_segment_distance_sq(
    float px, float py, float ax, float ay, float bx, float by) noexcept {
    float abx = bx - ax;
    float aby = by - ay;
    float apx = px - ax;
    float apy = py - ay;
    float ab2 = fmaf(abx, abx, aby * aby);
    // Degenerate segment: fall back to the distance to its first end point.
    if (ab2 <= 1e-20f)
        return fmaf(apx, apx, apy * apy);
    float t = (apx * abx + apy * aby) / ab2;
    // Clamp the projection parameter onto the segment.
    if (t < 0.0f)
        t = 0.0f;
    else if (t > 1.0f)
        t = 1.0f;
    float qx = fmaf(t, abx, ax);
    float qy = fmaf(t, aby, ay);
    float dx = px - qx;
    float dy = py - qy;
    return fmaf(dx, dx, dy * dy);
}

// Returns (radius + clearance) minus the distance from the circle centre to the
// polyline px/py with `segments` segments; a positive result means penetration.
// The scan stops at the first segment that is already within radius + clearance.
static __declspec(noalias) __forceinline float polyline_circle_violation(
    const float* __restrict px, const float* __restrict py, int segments,
    float cx, float cy, float radius, float clearance) noexcept {
    float rr = radius + clearance;
    float best_d2 = FLT_MAX;
    int i = 0;
    while (i < segments) {
        float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]);
        if (d2 < best_d2)
            best_d2 = d2;
        if (best_d2 <= rr * rr)
            break;
        ++i;
    }
    return rr - sqrtf(best_d2);
}

// True when at least one reference angle has been set.
__forceinline bool has_transition_reference_angles() const noexcept {
    return referenceAngleCount > 0;
}

// True when link lengths are part of the state and a reference length has been set.
__forceinline bool has_transition_reference_lengths() const noexcept {
    return variableLen && (referenceLengthCount > 0);
}

// Joint-angle limits are active only for a positive minTheta.
__forceinline bool use_joint_angle_limit_constraints() const noexcept {
    return minTheta > 0.0f;
}

// Number of state entries: n angles plus n lengths when variableLen is set.
__forceinline int full_state_dim() const noexcept {
    return n + (variableLen ? n : 0);
}

// Reference angle of joint i, or 0 when no reference exists for that index.
__forceinline float reference_angle(int i) const noexcept {
    return (i < referenceAngleCount) ? referenceState[i] : 0.0f;
}

// Reference length of link i, or fixedLength when no reference exists for that index.
__forceinline float reference_length(int i) const noexcept {
    return (has_transition_reference_lengths() && i < referenceLengthCount) ? referenceState[n + i] : fixedLength;
}

// Wrapped difference between q[i] and the reference angle of joint i.
__forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept {
    return wrap_pi(q[i] - reference_angle(i));
}

// Allowed absolute length change for link i:
// max(transitionMaxLengthStepFactor * max(reference length, 0.5), 0.05).
__forceinline float max_length_step(int i) const noexcept {
    float refL = reference_length(i);
    float absStep = transitionMaxLengthStepFactor * fmaxf(refL, 0.5f);
    return fmaxf(absStep, 0.05f);
}

// Smallest allowed |q[i]|: max(floor, ratio * |reference angle|).
__forceinline float min_joint_abs_angle_bound(int i) const noexcept {
    float refAbs = fabsf(reference_angle(i));
    float scaled = transitionMinJointAbsAngleRatio * refAbs;
    return fmaxf(transitionMinJointAbsAngleFloor, scaled);
}

// Number of constraint indices consumed by evaluate_transition_continuity_only().
// The order of the groups here matches the order in which that function checks them.
__forceinline unsigned transition_continuity_constraint_count() const noexcept {
    unsigned total = 0u;
    if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) {
        total += static_cast<unsigned>(referenceAngleCount);
        if (has_transition_reference_lengths())
            total += static_cast<unsigned>(referenceLengthCount);
        if (transitionPreserveJointMagnitudeFloor)
            total += static_cast<unsigned>(referenceAngleCount);
        if (transitionPreserveJointSigns)
            total += static_cast<unsigned>(referenceAngleCount);
        if (transitionPreservePrefixSigns)
            total += static_cast<unsigned>(referenceAngleCount);
    }
    return total;
}

// One constraint index per joint when joint-angle limits are active, otherwise none.
__forceinline unsigned joint_angle_limit_constraint_count() const noexcept {
    return use_joint_angle_limit_constraints() ? static_cast<unsigned>(n) : 0u;
}

// First constraint index used by the pose (obstacle / nonlinear) constraints.
__forceinline unsigned pose_constraint_base_index() const noexcept {
    return transition_continuity_constraint_count() + joint_angle_limit_constraint_count();
}

// Signed distance to the circle of radius 5 centred at (5, 0); positive inside.
static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept {
    float dx = x - 5.0f;
    return 5.0f - sqrtf(fmaf(dx, dx, y * y));
}

// Signed distance to the circle of radius sqrt(7.7) centred at (8, -3); positive outside.
static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept {
    float dx = x - 8.0f;
    float dy = y + 3.0f;
    return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f);
}

// Stores the reference state for Transition mode.
//   src   - source state, may be nullptr
//   count - number of valid entries in src; clamped to full_state_dim()
// Entries not provided by src default to 0 for angles and fixedLength for lengths;
// the rest of referenceState is zeroed. Prefix angles are then rebuilt.
// NOTE(review): nothing here bounds n; referenceState has 64 entries and
// referencePrefixAngles has 32 - confirm callers keep n within those sizes.
__declspec(noalias) __forceinline void SetTransitionReference(const float* src, int count) noexcept {
    int dim = full_state_dim();
    referenceStateCount = (count < dim) ? count : dim;
    referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n;
    referenceLengthCount = variableLen ? ((referenceStateCount > n) ? (referenceStateCount - n) : 0) : 0;
    int i = 0;
    while (i < dim) {
        if (src != nullptr && i < referenceStateCount)
            referenceState[i] = src[i];
        else if (i < n)
            referenceState[i] = 0.0f;
        else
            referenceState[i] = fixedLength;
        ++i;
    }
    while (i < 64) {
        referenceState[i] = 0.0f;
        ++i;
    }
    float phi_acc = 0.0f;
    i = 0;
    while (i < referenceAngleCount) {
        phi_acc += referenceState[i];
        referencePrefixAngles[i] = phi_acc;
        ++i;
    }
    while (i < n) {
        referencePrefixAngles[i] = 0.0f;
        ++i;
    }
}

// Sum of squared wrapped angle deltas against the reference angles.
__forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept {
    float acc = 0.0f;
    int i = 0;
    while (i < referenceAngleCount) {
        float d = wrapped_angle_delta(q, i);
        acc = fmaf(d, d, acc);
        ++i;
    }
    return acc;
}

// Sum of squared link-length differences against the reference; 0 without reference lengths.
__forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept {
    if (!has_transition_reference_lengths())
        return 0.0f;
    float acc = 0.0f;
    int i = 0;
    while (i < referenceLengthCount) {
        float d = q[n + i] - referenceState[n + i];
        acc = fmaf(d, d, acc);
        ++i;
    }
    return acc;
}

// Sum of squared wrapped differences between the running angle sums of q and the
// stored reference prefix angles; 0 without reference angles.
__forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept {
    if (!has_transition_reference_angles())
        return 0.0f;
    float acc = 0.0f;
    float phi_acc = 0.0f;
    int i = 0;
    while (i < referenceAngleCount) {
        phi_acc += q[i];
        float d = wrap_pi(phi_acc - referencePrefixAngles[i]);
        acc = fmaf(d, d, acc);
        ++i;
    }
    return acc;
}

// Weighted sum of the angle, length and prefix energies.
__forceinline float TransitionEnergy(const float* __restrict q) const noexcept {
    float acc = transitionEnergyWeight * TransitionAngleEnergy(q);
    acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc);
    acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc);
    return acc;
}

// Builds a cost object.
//   obstacleData   - optional array of (cx, cy, half) triples, one per obstacle
//   _obstacleCount - number of triples; clamped to MAX_OBSTACLES
// All obstacle slots are zeroed first, and the transition reference is reset to its defaults.
__declspec(noalias) __forceinline ManipCost(
    int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta,
    float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept
    : n(_n),
      variableLen(_variableLen),
      fixedLength(_fixedLength),
      stretchFactor(_stretchFactor),
      targetX(_targetX),
      targetY(_targetY),
      minTheta(_minTheta),
      archBiasW(0.02f),
      archBiasK(3.0f),
      sharpW(0.05f),
      obstacleCount((_obstacleCount > MAX_OBSTACLES) ? MAX_OBSTACLES : _obstacleCount),
      constraintMode(0u),
      obstacleClearance(OBSTACLE_CLEARANCE) {
    unsigned i = 0u;
    while (i < MAX_OBSTACLES) {
        obstacles[i].cx = 0.0f;
        obstacles[i].cy = 0.0f;
        obstacles[i].half = 0.0f;
        obstacles[i].pad = 0.0f;
        ++i;
    }
    if (obstacleData) {
        i = 0u;
        while (i < obstacleCount) {
            obstacles[i].cx = obstacleData[3u * i + 0u];
            obstacles[i].cy = obstacleData[3u * i + 1u];
            obstacles[i].half = obstacleData[3u * i + 2u];
            ++i;
        }
    }
    SetTransitionReference(nullptr, 0);
}

// Length of link i: q[n + i] when lengths are variable, otherwise fixedLength.
__declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept {
    return variableLen ? q[n + i] : fixedLength;
}

// Forward kinematics of the chain.
//   q            - state vector (angles, then lengths when variableLen)
//   out_x, out_y - receive the end point of the last link
//   px, py       - optional arrays receiving the n + 1 joint positions, starting at the
//                  origin; written only when both pointers are non-null
// Each link direction is the running sum of the joint angles up to that link.
// NOTE(review): the local buffers hold 32 entries and n is not checked here -
// confirm n <= 32 is guaranteed by the callers.
__declspec(noalias) __forceinline void compute_pose(
    const float* __restrict q, float& out_x, float& out_y,
    float* __restrict px, float* __restrict py) const noexcept {
    const float* __restrict th = q;
    alignas(16) float phi[32];
    alignas(16) float s_arr[32];
    alignas(16) float c_arr[32];
    float x = 0.0f;
    float y = 0.0f;
    float phi_acc = 0.0f;
    if (px && py) {
        px[0] = 0.0f;
        py[0] = 0.0f;
    }
    int i = 0;
    while (i < n) {
        phi_acc += th[i];
        phi[i] = phi_acc;
        ++i;
    }
    FABE13_SINCOS(phi, s_arr, c_arr, n);
    i = 0;
    while (i < n) {
        float Li = link_length(q, i);
        x = fmaf(Li, c_arr[i], x);
        y = fmaf(Li, s_arr[i], y);
        if (px && py) {
            px[i + 1] = x;
            py[i + 1] = y;
        }
        ++i;
    }
    out_x = x;
    out_y = y;
}

// Positioning objective: distance from (x, y) to the target plus a per-joint bias term
// weighted by archBiasW.
// NOTE(review): the Horner polynomial below is 1 + t/2 + t^2/6 + t^3/24 + t^4/120,
// i.e. the series of (exp(t) - 1) / t rather than exp(t) - confirm this is intended.
__declspec(noalias) __forceinline float compute_positioning_objective_from_pose(
    const float* __restrict q, float x, float y) const noexcept {
    const float* __restrict th = q;
    float archPen = 0.0f;
    int i = 0;
    while (i < n) {
        float theta = th[i];
        float t = fmaf(-theta, archBiasK, 0.0f);
        float sp;
        if (t > 10.0f) {
            // Large t: the term is taken to be t itself.
            sp = t;
        } else {
            float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f);
            sp = log1pf(exp_val);
        }
        archPen = fmaf(archBiasW, sp, archPen);
        ++i;
    }
    float dx = fmaf(x, 1.0f, -targetX);
    float dy = fmaf(y, 1.0f, -targetY);
    float dist = sqrtf(fmaf(dx, dx, dy * dy));
    return fmaf(dist, 1.0f, archPen);
}

// Transition objective: transitionCaptureWeight * squared target distance + TransitionEnergy(q).
__declspec(noalias) __forceinline float compute_transition_objective_from_pose(
    const float* __restrict q, float x, float y) const noexcept {
    float dx = x - targetX;
    float dy = y - targetY;
    float dist2 = fmaf(dx, dx, dy * dy);
    return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q));
}

// Obstacle constraints are active when the mode bit is set and at least one obstacle exists.
__declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept {
    return ((constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u) && (obstacleCount > 0u);
}

// Nonlinear (circle) constraints are active when the mode bit is set.
__declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept {
    return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u;
}

// Total number of constraint indices across all active groups.
__declspec(noalias) __forceinline unsigned total_constraints() const noexcept {
    unsigned total = transition_continuity_constraint_count();
    total += joint_angle_limit_constraint_count();
    if (use_obstacle_constraints())
        total += obstacleCount;
    if (use_nonlinear_constraints())
        total += transitionCheckInnerCircleAgainstLinks ? 3u : 2u;
    return total;
}

// Index reported by evaluate_indexed() when every constraint is satisfied.
__declspec(noalias) __forceinline unsigned feasible_index() const noexcept {
    return total_constraints();
}

// Positive when (x, y) lies outside the circle of radius 5 centred at (5, 0).
static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept {
    float dx = x - 5.0f;
    return dx * dx + y * y - 25.0f;
}

// Positive when (x, y) lies inside the circle of squared radius 7.7 centred at (8, -3).
static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept {
    float dx = x - 8.0f;
    float dy = y + 3.0f;
    return 7.7f - (dx * dx + dy * dy);
}

// Checks the transition-continuity constraints of q against the reference state.
// Returns true when all pass (or when not in Transition mode / no reference angles).
// On the first violation it writes the constraint index and the violation amount
// and returns false. idx advances once per joint in every enabled group, including
// joints skipped by a deadband, so indices stay aligned with
// transition_continuity_constraint_count().
__declspec(noalias) __forceinline bool evaluate_transition_continuity_only(
    const float* __restrict q, unsigned& out_index, float& out_value) const noexcept {
    unsigned idx = 0u;
    if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles())
        return true;
    // Group 1: per-joint angle step limit.
    int i = 0;
    while (i < referenceAngleCount) {
        float d = wrapped_angle_delta(q, i);
        float viol = fabsf(d) - transitionMaxAngleStep;
        if (viol > 0.0f) {
            out_index = idx;
            out_value = viol;
            return false;
        }
        ++idx;
        ++i;
    }
    // Group 2: per-link length step limit.
    if (has_transition_reference_lengths()) {
        i = 0;
        while (i < referenceLengthCount) {
            float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i);
            if (dL > 0.0f) {
                out_index = idx;
                out_value = dL;
                return false;
            }
            ++idx;
            ++i;
        }
    }
    // Group 3: keep |q[i]| above a floor for joints whose reference is outside the deadband.
    if (transitionPreserveJointMagnitudeFloor) {
        i = 0;
        while (i < referenceAngleCount) {
            float refAbs = fabsf(referenceState[i]);
            if (refAbs > transitionJointSignDeadband) {
                float minAbs = min_joint_abs_angle_bound(i);
                float viol = minAbs - fabsf(q[i]);
                if (viol > 0.0f) {
                    out_index = idx;
                    out_value = viol;
                    return false;
                }
            }
            ++idx;
            ++i;
        }
    }
    // Group 4: joint angle must keep the sign of its reference.
    if (transitionPreserveJointSigns) {
        i = 0;
        while (i < referenceAngleCount) {
            float ref = referenceState[i];
            if (fabsf(ref) > transitionJointSignDeadband) {
                float prod = q[i] * ref;
                if (prod < 0.0f) {
                    out_index = idx;
                    out_value = -prod;
                    return false;
                }
            }
            ++idx;
            ++i;
        }
    }
    // Group 5: running angle sum must keep the sign of the reference prefix angle.
    if (transitionPreservePrefixSigns) {
        float phi_acc = 0.0f;
        i = 0;
        while (i < referenceAngleCount) {
            phi_acc += q[i];
            float refPhi = referencePrefixAngles[i];
            if (fabsf(refPhi) > transitionPrefixSignDeadband) {
                float prod = phi_acc * refPhi;
                if (prod < 0.0f) {
                    out_index = idx;
                    out_value = -prod;
                    return false;
                }
            }
            ++idx;
            ++i;
        }
    }
    return true;
}

// Checks |q[i]| <= minTheta for every joint when the limits are active.
// On the first violation it reports base_index + i and the excess, and returns false.
__declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only(
    const float* __restrict q, unsigned base_index, unsigned& out_index, float& out_value) const noexcept {
    if (!use_joint_angle_limit_constraints())
        return true;
    int i = 0;
    while (i < n) {
        float viol = fabsf(q[i]) - minTheta;
        if (viol > 0.0f) {
            out_index = base_index + static_cast<unsigned>(i);
            out_value = viol;
            return false;
        }
        ++i;
    }
    return true;
}

// Checks the obstacle and circle constraints for an already computed pose.
//   px, py             - joint positions (n + 1 points)
//   x, y               - end point
//   base_index         - index assigned to the first constraint checked here
//   out_geom_clearance - optional; on success receives the smallest clearance seen,
//                        on failure the clearance of the violated constraint
// Returns false on the first violation, with out_index / out_value describing it.
__declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays(
    const float* __restrict px, const float* __restrict py, float x, float y,
    unsigned base_index, unsigned& out_index, float& out_value,
    float* out_geom_clearance = nullptr) const noexcept {
    float geom_clearance = FLT_MAX;
    unsigned idx = base_index;
    if (use_obstacle_constraints()) {
        unsigned j = 0u;
        while (j < obstacleCount) {
            float viol = polyline_square_violation(px, py, n, obstacles[j], obstacleClearance);
            float clearance = -viol;
            if (clearance < geom_clearance)
                geom_clearance = clearance;
            if (viol > 0.0f) {
                out_index = idx;
                out_value = viol;
                if (out_geom_clearance)
                    *out_geom_clearance = clearance;
                return false;
            }
            ++idx;
            ++j;
        }
    }
    if (use_nonlinear_constraints()) {
        if (transitionCheckInnerCircleAgainstLinks) {
            // Whole chain against the inner circle (same centre and radius as
            // inner_circle_clearance()).
            float viol = polyline_circle_violation(
                px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f);
            float clearance = -viol;
            if (clearance < geom_clearance)
                geom_clearance = clearance;
            if (viol > 0.0f) {
                out_index = idx;
                out_value = viol;
                if (out_geom_clearance)
                    *out_geom_clearance = clearance;
                return false;
            }
            ++idx;
        }
        // End point must stay inside the outer circle.
        float c0 = nonlinear_constraint_0(x, y);
        float c0_clearance = outer_circle_clearance(x, y);
        if (c0_clearance < geom_clearance)
            geom_clearance = c0_clearance;
        if (c0 > 0.0f) {
            out_index = idx;
            out_value = c0;
            if (out_geom_clearance)
                *out_geom_clearance = c0_clearance;
            return false;
        }
        ++idx;
        // End point must stay outside the inner circle.
        float c1 = nonlinear_constraint_1(x, y);
        float c1_clearance = inner_circle_clearance(x, y);
        if (c1_clearance < geom_clearance)
            geom_clearance = c1_clearance;
        if (c1 > 0.0f) {
            out_index = idx;
            out_value = c1;
            if (out_geom_clearance)
                *out_geom_clearance = c1_clearance;
            return false;
        }
        ++idx;
    }
    if (out_geom_clearance)
        *out_geom_clearance = geom_clearance;
    return true;
}

// Checks joint-angle limits, then computes the pose and checks the pose constraints.
// A joint-limit failure reports a clearance of -FLT_MAX.
// NOTE(review): px / py hold 33 points and n is not checked - confirm n <= 32.
__declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity(
    const float* __restrict q, float& out_x, float& out_y,
    unsigned& out_index, float& out_value,
    float* out_geom_clearance = nullptr) const noexcept {
    if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) {
        if (out_geom_clearance)
            *out_geom_clearance = -FLT_MAX;
        return false;
    }
    alignas(16) float px[33];
    alignas(16) float py[33];
    compute_pose(q, out_x, out_y, px, py);
    if (!evaluate_pose_constraints_from_arrays(px, py, out_x, out_y, pose_constraint_base_index(), out_index, out_value, out_geom_clearance))
        return false;
    return true;
}

// Copies full_state_dim() entries from src to dst and zero-fills dst up to 64 entries.
__declspec(noalias) __forceinline void copy_state_full_dim(
    const float* __restrict src, float* __restrict dst) const noexcept {
    int dim = full_state_dim();
    int i = 0;
    while (i < dim) {
        dst[i] = src[i];
        ++i;
    }
    while (i < 64) {
        dst[i] = 0.0f;
        ++i;
    }
}

// Writes qa + t * (qb - qa) for the first full_state_dim() entries and zero-fills
// out_q up to 64 entries.
__declspec(noalias) __forceinline void lerp_state_full_dim(
    const float* __restrict qa, const float* __restrict qb, float t,
    float* __restrict out_q) const noexcept {
    int dim = full_state_dim();
    int i = 0;
    while (i < dim) {
        out_q[i] = fmaf(t, qb[i] - qa[i], qa[i]);
        ++i;
    }
    while (i < 64) {
        out_q[i] = 0.0f;
        ++i;
    }
}

// Walks the joints from the tip to the base, accumulating
// (sum of max link lengths from joint i to the tip) * |angle change of joint i|,
// then adds every |length change| when lengths are variable.
// Presumably an upper bound on how far any point of the chain can move between
// qa and qb - TODO confirm.
__declspec(noalias) __forceinline float transition_segment_motion_bound(
    const float* __restrict qa, const float* __restrict qb) const noexcept {
    float tail_reach = 0.0f;
    float bound = 0.0f;
    int i = n - 1;
    while (i >= 0) {
        tail_reach += fmaxf(link_length(qa, i), link_length(qb, i));
        bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound);
        --i;
    }
    if (variableLen) {
        i = 0;
        while (i < n) {
            bound = fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound);
            ++i;
        }
    }
    return bound;
}

// The swept-motion check is needed only in Transition mode, with reference angles,
// and with at least one geometric constraint group active.
__forceinline bool transition_sweep_requires_validation() const noexcept {
    return (solveMode == ManipSolveMode::Transition) && has_transition_reference_angles() && (use_obstacle_constraints() || use_nonlinear_constraints());
}

// Copies q into out_sample and evaluates its pose, constraints and clearance.
__declspec(noalias) __forceinline bool build_transition_geom_sample(
    const float* __restrict q, TransitionGeomSample& out_sample,
    unsigned& out_index, float& out_value) const noexcept {
    copy_state_full_dim(q, out_sample.q);
    return evaluate_state_without_transition_continuity(
        out_sample.q, out_sample.x, out_sample.y, out_index, out_value, &out_sample.clearance);
}

// Validates the linear interpolation between samples a and b.
// The segment is accepted early when its motion bound is negligible or when the
// end-point clearances exceed the motion bound. Otherwise the midpoint is
// evaluated; if the clearances still do not cover half the motion bound the
// segment is split and both halves are validated with depth - 1. At depth 0 (or
// for a very small motion bound) only the 25% and 75% points are evaluated.
__declspec(noalias) __forceinline bool validate_transition_segment_recursive(
    const TransitionGeomSample& a, const TransitionGeomSample& b, int depth,
    unsigned& out_index, float& out_value) const noexcept {
    constexpr float SWEEP_CLEARANCE_EPS = 1e-4f;
    constexpr float SWEEP_MIN_BOUND = 5e-4f;
    float motion_bound = transition_segment_motion_bound(a.q, b.q);
    if (motion_bound <= SWEEP_CLEARANCE_EPS)
        return true;
    float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS;
    if (a.clearance > required_clearance && b.clearance > required_clearance)
        return true;
    TransitionGeomSample mid{};
    lerp_state_full_dim(a.q, b.q, 0.5f, mid.q);
    if (!evaluate_state_without_transition_continuity(
            mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance))
        return false;
    float half_required_clearance = 0.5f * motion_bound + SWEEP_CLEARANCE_EPS;
    if (a.clearance > half_required_clearance && mid.clearance > half_required_clearance && b.clearance > half_required_clearance)
        return true;
    if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) {
        TransitionGeomSample q25{}, q75{};
        lerp_state_full_dim(a.q, b.q, 0.25f, q25.q);
        if (!evaluate_state_without_transition_continuity(
                q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance))
            return false;
        lerp_state_full_dim(a.q, b.q, 0.75f, q75.q);
        if (!evaluate_state_without_transition_continuity(
                q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance))
            return false;
        return true;
    }
    return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) && validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value);
}

// Validates the motion from referenceState to q. The final sample reuses the pose
// and clearance already computed by the caller; the start sample is evaluated here.
__declspec(noalias) __forceinline bool evaluate_transition_swept_motion(
    const float* __restrict q, float final_x, float final_y, float final_geom_clearance,
    unsigned& out_index, float& out_value) const noexcept {
    if (!transition_sweep_requires_validation())
        return true;
    TransitionGeomSample start{};
    if (!build_transition_geom_sample(referenceState, start, out_index, out_value))
        return false;
    TransitionGeomSample finish{};
    copy_state_full_dim(q, finish.q);
    finish.x = final_x;
    finish.y = final_y;
    finish.clearance = final_geom_clearance;
    constexpr int SWEEP_MAX_DEPTH = 10;
    return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value);
}

// Full evaluation of a state.
// Returns false with out_index / out_value set to the first violated constraint
// and its violation. Returns true with out_index = feasible_index() and out_value
// set to the objective of the active solve mode. out_x / out_y receive the end point.
__declspec(noalias) __forceinline bool evaluate_indexed(
    const float* __restrict q, float& out_x, float& out_y,
    unsigned& out_index, float& out_value) const noexcept {
    if (!evaluate_transition_continuity_only(q, out_index, out_value))
        return false;
    float final_geom_clearance = FLT_MAX;
    if (!evaluate_state_without_transition_continuity(q, out_x, out_y, out_index, out_value, &final_geom_clearance))
        return false;
    if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) {
        if (!transition_sweep_requires_validation()) {
            // No geometric constraints active: only the reference state's own
            // joint-angle limits are re-checked.
            if (!evaluate_joint_angle_limits_only(
                    referenceState, transition_continuity_constraint_count(), out_index, out_value))
                return false;
        } else if (!evaluate_transition_swept_motion(q, out_x, out_y, final_geom_clearance, out_index, out_value)) {
            return false;
        }
    }
    out_index = feasible_index();
    if (solveMode == ManipSolveMode::Transition) {
        out_value = compute_transition_objective_from_pose(q, out_x, out_y);
        return true;
    }
    out_value = compute_positioning_objective_from_pose(q, out_x, out_y);
    return true;
}
// Unconstrained objective evaluation: computes the end point of the chain for q
// (without storing the intermediate joint positions) and returns the objective
// of the active solve mode. out_x / out_y receive the end point.
__declspec(noalias) __forceinline float operator()(
    const float* __restrict q, float& out_x, float& out_y) const noexcept {
    compute_pose(q, out_x, out_y, nullptr, nullptr);
    const bool transition_mode = (solveMode == ManipSolveMode::Transition);
    return transition_mode
        ? compute_transition_objective_from_pose(q, out_x, out_y)
        : compute_positioning_objective_from_pose(q, out_x, out_y);
}

};" - после замены на новую версию получаю ошибку с кодом 0xcfffffff (иногда с кодом 0xc0000005), старая версия работала стабильно но мне нужно ускорить код насколько это возможно, найди из-за чего именно скорее всего возникает ошибка и пришли полный код исправленной новой версии - старайся сделать новую версию ещё быстрее - ещё более активно используй AVX2 везде где это может дать выигрыш в скорости - скорость должна быть максимальной - но при этом исправь ошибки чтобы новая версия работала не только быстро но ещё и без ошибок и верни полный исправленный код структуры без пропусков целиком, главное чтобы он работал максимально быстро насколько это вообще возможно, поощряется использование низкоуровневых трюков и битовых трюков, выравнивания, низкоуровневых специальных операций, fmaf и других аппаратных способов ускорения, минимизируй число функций чтобы уменьшить накладные расходы на вызовы, код может быть громоздким - главное чтобы он был максимально быстрым, используй интринсики, различные ключевые слова для оптимизации, выровненный доступ к данным и т. д. - всё что может помочь ускорить даже гипотетически, jump_table, операторы ?: прямо в формулах, инлайн функции через __forceinline, ручное развёртывание малых циклов, векторизацию, AVX2, эффективные операции с памятью и минимизацию обращений к памяти, минимизация аллокаций/реаллокаций, кеширование, предвычисление, всё, что может помочь или хотя бы увеличит шансы на ускорение

विचार

Ensuring safety with masks and vector operations

I’ll use a simple loop for rare cases to avoid header issues and check the mask with the bit manipulation technique, which should handle edge cases. For vector operations, I’ll ensure no reliance on unavailable functions and potentially use __m256 constants in function calls to maintain compatibility, ensuring proper alignment and safety.

Ensuring proper alignment with loads

I need to be careful with alignment issues when accessing referenceState + n + i. Since n may not be a multiple of 8, the base address may not be aligned. I'll use unaligned loads (loadu) for cases where alignment can't be guaranteed, like in compute_pose and TransitionLengthEnergy.

Thought for 4m 2s

उत्तर

Наиболее вероятная причина падения — небезопасные aligned-load/store в новой версии.

Что именно ломает код:

  1. _mm256_load_ps(src + i) / _mm_load_ps(src + i) в copy_state_full_dim()
    В старой версии там был обычный скалярный copy.
    В новой версии src очень часто указывает на внешний q, а он не обязан быть выровнен на 32 байта.
    Для _mm256_load_ps это UB и очень типично даёт 0xc0000005.

  2. Переход с alignas(16) на alignas(32) у полей внутри ManipCost
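    Это меняет требование к выравниванию всего объекта: alignof(ManipCost) становится равным 32. Если ManipCost где-то выделяется не через стандартный operator new из C++17 с поддержкой повышенного выравнивания (over-aligned new), а через свой allocator / pool / malloc / placement new в чужой буфер, объект может оказаться не выровнен на 32 байта, и дальше всё становится UB.
    Старая версия с alignas(16) могла жить (на x64 malloc и обычный new в MSVC и так гарантируют 16-байтовое выравнивание), новая — уже нет.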

  3. Вторичный скрытый дефект: в исходниках предел по размерам буферов был не до конца согласован с n.
    Я это тоже подчистил: теперь логика жёстко держит full_state_dim <= 64, а число углов безопасно.

Ниже — полная исправленная структура, где:

  • убраны опасные aligned-load/store на внешних данных,
  • сохранён и усилен AVX2 там, где это реально безопасно и даёт выигрыш,
  • ускорен compute_pose() для режима без px/py,
  • ускорены copy_state_full_dim, lerp_state_full_dim, TransitionLengthEnergy, evaluate_joint_angle_limits_only,
  • возвращён operator(),
  • выравнивание полей объекта возвращено в безопасный режим, а 32-byte alignment оставлен только там, где он под контролем.
cpp
struct ManipCost final { static constexpr int MAX_STATE_DIM = 64; static constexpr int MAX_ANGLE_DIM = 64; static constexpr int MAX_VARLEN_N = 32; static constexpr int MAX_FIXED_N = 64; int n; bool variableLen; float fixedLength, stretchFactor, targetX, targetY, minTheta, archBiasW, archBiasK, sharpW; unsigned obstacleCount, constraintMode; SquareObstacle obstacles[MAX_OBSTACLES]; float obstacleClearance; ManipSolveMode solveMode = ManipSolveMode::Positioning; // Держим выравнивание членов как в старой стабильной версии, чтобы не ломать внешний allocator. alignas(16) float referenceState[MAX_STATE_DIM]{}; alignas(16) float referencePrefixAngles[MAX_ANGLE_DIM]{}; int referenceStateCount = 0; int referenceAngleCount = 0; int referenceLengthCount = 0; float transitionCaptureWeight = 10000.0f; float transitionEnergyWeight = 1.0f; float transitionLengthEnergyWeight = 0.35f; float transitionPrefixEnergyWeight = 0.175f; float transitionMaxAngleStep = 0.5f; float transitionMaxLengthStepFactor = 0.15f; float transitionJointSignDeadband = 0.1f; float transitionPrefixSignDeadband = 0.2f; float transitionMinJointAbsAngleRatio = 0.1f; float transitionMinJointAbsAngleFloor = 0.0175f; bool transitionPreserveJointMagnitudeFloor = true; bool transitionPreserveJointSigns = true; bool transitionPreservePrefixSigns = true; bool transitionCheckInnerCircleAgainstLinks = true; struct alignas(32) TransitionGeomSample final { alignas(32) float q[MAX_STATE_DIM]{}; float x = 0.0f; float y = 0.0f; float clearance = FLT_MAX; }; private: static __forceinline float hsum256_ps(__m256 v) noexcept { __m128 lo = _mm256_castps256_ps128(v); __m128 hi = _mm256_extractf128_ps(v, 1); __m128 sum = _mm_add_ps(lo, hi); sum = _mm_hadd_ps(sum, sum); sum = _mm_hadd_ps(sum, sum); return _mm_cvtss_f32(sum); } template<int I, int N> static __forceinline void agp_pose_prefix( const float* __restrict th, float& acc, float* __restrict phi) noexcept { if constexpr (I < N) { acc = fmaf(th[I], 1.0f, acc); phi[I] = acc; 
agp_pose_prefix<I + 1, N>(th, acc, phi); } } template<int I, int N, bool STORE> __forceinline void agp_pose_accum_fixed( const float* __restrict s_arr, const float* __restrict c_arr, float& x, float& y, float* __restrict px, float* __restrict py) const noexcept { if constexpr (I < N) { const float Li = fixedLength; x = fmaf(Li, c_arr[I], x); y = fmaf(Li, s_arr[I], y); if constexpr (STORE) { px[I + 1] = x; py[I + 1] = y; } agp_pose_accum_fixed<I + 1, N, STORE>(s_arr, c_arr, x, y, px, py); } } template<int I, int N, bool STORE> __forceinline void agp_pose_accum_var( const float* __restrict q, const float* __restrict s_arr, const float* __restrict c_arr, float& x, float& y, float* __restrict px, float* __restrict py) const noexcept { if constexpr (I < N) { const float Li = q[n + I]; x = fmaf(Li, c_arr[I], x); y = fmaf(Li, s_arr[I], y); if constexpr (STORE) { px[I + 1] = x; py[I + 1] = y; } agp_pose_accum_var<I + 1, N, STORE>(q, s_arr, c_arr, x, y, px, py); } } template<int N, bool VARLEN, bool STORE> __forceinline void agp_compute_pose_fixedn( const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept { alignas(32) float phi[16]; alignas(32) float s_arr[16]; alignas(32) float c_arr[16]; float phi_acc = 0.0f; if constexpr (STORE) { px[0] = 0.0f; py[0] = 0.0f; } agp_pose_prefix<0, N>(q, phi_acc, phi); FABE13_SINCOS(phi, s_arr, c_arr, N); float x = 0.0f, y = 0.0f; if constexpr (VARLEN) agp_pose_accum_var<0, N, STORE>(q, s_arr, c_arr, x, y, px, py); else agp_pose_accum_fixed<0, N, STORE>(s_arr, c_arr, x, y, px, py); out_x = x; out_y = y; } public: __declspec(noalias) __forceinline void compute_pose( const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept { const bool store = (px != nullptr) && (py != nullptr); if (n <= 0) { if (store) { px[0] = 0.0f; py[0] = 0.0f; } out_x = 0.0f; out_y = 0.0f; return; } // Очень быстрый switch-path для малых n. 
if (static_cast<unsigned>(n) <= 16u) { if (variableLen) { if (store) { switch (n) { case 1: agp_compute_pose_fixedn< 1, true, true>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, true, true>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, true, true>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, true, true>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, true, true>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, true, true>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, true, true>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, true, true>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, true, true>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, true, true>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, true, true>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, true, true>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, true, true>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, true, true>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, true, true>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, true, true>(q, out_x, out_y, px, py); return; } } else { switch (n) { case 1: agp_compute_pose_fixedn< 1, true, false>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, true, false>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, true, false>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, true, false>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, true, false>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, true, false>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, true, false>(q, out_x, out_y, px, py); 
return; case 8: agp_compute_pose_fixedn< 8, true, false>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, true, false>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, true, false>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, true, false>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, true, false>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, true, false>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, true, false>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, true, false>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, true, false>(q, out_x, out_y, px, py); return; } } } else { if (store) { switch (n) { case 1: agp_compute_pose_fixedn< 1, false, true>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, false, true>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, false, true>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, false, true>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, false, true>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, false, true>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, false, true>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, false, true>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, false, true>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, false, true>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, false, true>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, false, true>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, false, true>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, false, true>(q, out_x, out_y, px, py); return; case 15: 
agp_compute_pose_fixedn<15, false, true>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, false, true>(q, out_x, out_y, px, py); return; } } else { switch (n) { case 1: agp_compute_pose_fixedn< 1, false, false>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, false, false>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, false, false>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, false, false>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, false, false>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, false, false>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, false, false>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, false, false>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, false, false>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, false, false>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, false, false>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, false, false>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, false, false>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, false, false>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, false, false>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, false, false>(q, out_x, out_y, px, py); return; } } } } // Общий путь. 
alignas(32) float phi[MAX_ANGLE_DIM]; alignas(32) float s_arr[MAX_ANGLE_DIM]; alignas(32) float c_arr[MAX_ANGLE_DIM]; const float* __restrict th = q; float phi_acc = 0.0f; if (store) { px[0] = 0.0f; py[0] = 0.0f; } int i = 0; while (i + 4 <= n) { const float p0 = phi_acc + th[i + 0]; const float p1 = p0 + th[i + 1]; const float p2 = p1 + th[i + 2]; const float p3 = p2 + th[i + 3]; phi[i + 0] = p0; phi[i + 1] = p1; phi[i + 2] = p2; phi[i + 3] = p3; phi_acc = p3; i += 4; } while (i < n) { phi_acc += th[i]; phi[i] = phi_acc; ++i; } FABE13_SINCOS(phi, s_arr, c_arr, n); if (!store) { int j = 0; __m256 vx = _mm256_setzero_ps(); __m256 vy = _mm256_setzero_ps(); if (variableLen) { while (j + 8 <= n) { const __m256 vL = _mm256_loadu_ps(q + n + j); const __m256 vc = _mm256_load_ps(c_arr + j); const __m256 vs = _mm256_load_ps(s_arr + j); vx = _mm256_fmadd_ps(vL, vc, vx); vy = _mm256_fmadd_ps(vL, vs, vy); j += 8; } float x = hsum256_ps(vx); float y = hsum256_ps(vy); while (j < n) { const float Li = q[n + j]; x = fmaf(Li, c_arr[j], x); y = fmaf(Li, s_arr[j], y); ++j; } out_x = x; out_y = y; return; } else { const __m256 vL = _mm256_set1_ps(fixedLength); while (j + 8 <= n) { const __m256 vc = _mm256_load_ps(c_arr + j); const __m256 vs = _mm256_load_ps(s_arr + j); vx = _mm256_fmadd_ps(vL, vc, vx); vy = _mm256_fmadd_ps(vL, vs, vy); j += 8; } float x = hsum256_ps(vx); float y = hsum256_ps(vy); while (j < n) { x = fmaf(fixedLength, c_arr[j], x); y = fmaf(fixedLength, s_arr[j], y); ++j; } out_x = x; out_y = y; return; } } // Нужны px/py -> префиксные точки, тут scalar/unrolled быстрее и проще. 
float x = 0.0f, y = 0.0f; i = 0; if (variableLen) { while (i + 4 <= n) { { const float L0 = q[n + i + 0]; x = fmaf(L0, c_arr[i + 0], x); y = fmaf(L0, s_arr[i + 0], y); px[i + 1] = x; py[i + 1] = y; } { const float L1 = q[n + i + 1]; x = fmaf(L1, c_arr[i + 1], x); y = fmaf(L1, s_arr[i + 1], y); px[i + 2] = x; py[i + 2] = y; } { const float L2 = q[n + i + 2]; x = fmaf(L2, c_arr[i + 2], x); y = fmaf(L2, s_arr[i + 2], y); px[i + 3] = x; py[i + 3] = y; } { const float L3 = q[n + i + 3]; x = fmaf(L3, c_arr[i + 3], x); y = fmaf(L3, s_arr[i + 3], y); px[i + 4] = x; py[i + 4] = y; } i += 4; } while (i < n) { const float Li = q[n + i]; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); px[i + 1] = x; py[i + 1] = y; ++i; } } else { const float L = fixedLength; while (i + 4 <= n) { x = fmaf(L, c_arr[i + 0], x); y = fmaf(L, s_arr[i + 0], y); px[i + 1] = x; py[i + 1] = y; x = fmaf(L, c_arr[i + 1], x); y = fmaf(L, s_arr[i + 1], y); px[i + 2] = x; py[i + 2] = y; x = fmaf(L, c_arr[i + 2], x); y = fmaf(L, s_arr[i + 2], y); px[i + 3] = x; py[i + 3] = y; x = fmaf(L, c_arr[i + 3], x); y = fmaf(L, s_arr[i + 3], y); px[i + 4] = x; py[i + 4] = y; i += 4; } while (i < n) { x = fmaf(L, c_arr[i], x); y = fmaf(L, s_arr[i], y); px[i + 1] = x; py[i + 1] = y; ++i; } } out_x = x; out_y = y; } __declspec(noalias) __forceinline void copy_state_full_dim( const float* __restrict src, float* __restrict dst) const noexcept { const int dim = full_state_dim(); int i = 0; while (i + 8 <= dim) { const __m256 v = _mm256_loadu_ps(src + i); _mm256_storeu_ps(dst + i, v); i += 8; } while (i + 4 <= dim) { const __m128 v = _mm_loadu_ps(src + i); _mm_storeu_ps(dst + i, v); i += 4; } while (i < dim) { dst[i] = src[i]; ++i; } const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= MAX_STATE_DIM) { _mm256_storeu_ps(dst + i, vz); i += 8; } while (i < MAX_STATE_DIM) { dst[i] = 0.0f; ++i; } } __declspec(noalias) __forceinline void lerp_state_full_dim( const float* __restrict qa, const float* __restrict qb, float t, 
float* __restrict out_q) const noexcept { const int dim = full_state_dim(); const __m256 vt = _mm256_set1_ps(t); int i = 0; while (i + 8 <= dim) { const __m256 va = _mm256_loadu_ps(qa + i); const __m256 vb = _mm256_loadu_ps(qb + i); const __m256 vd = _mm256_sub_ps(vb, va); _mm256_storeu_ps(out_q + i, _mm256_fmadd_ps(vt, vd, va)); i += 8; } while (i < dim) { out_q[i] = fmaf(t, qb[i] - qa[i], qa[i]); ++i; } const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= MAX_STATE_DIM) { _mm256_storeu_ps(out_q + i, vz); i += 8; } while (i < MAX_STATE_DIM) { out_q[i] = 0.0f; ++i; } } static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept { constexpr float PI = 3.1415926535897932384626433832795f; constexpr float TWO_PI = 6.2831853071795864769252867665590f; float k = floorf((a + PI) / TWO_PI); a -= k * TWO_PI; if (a > PI) a -= TWO_PI; else if (a < -PI) a += TWO_PI; return a; } static __declspec(noalias) __forceinline float point_segment_distance_sq( float px, float py, float ax, float ay, float bx, float by) noexcept { const float abx = bx - ax; const float aby = by - ay; const float apx = px - ax; const float apy = py - ay; const float ab2 = fmaf(abx, abx, aby * aby); if (ab2 <= 1e-20f) return fmaf(apx, apx, apy * apy); float t = (apx * abx + apy * aby) / ab2; if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f; const float qx = fmaf(t, abx, ax); const float qy = fmaf(t, aby, ay); const float dx = px - qx; const float dy = py - qy; return fmaf(dx, dx, dy * dy); } static __declspec(noalias) __forceinline float polyline_circle_violation( const float* __restrict px, const float* __restrict py, int segments, float cx, float cy, float radius, float clearance) noexcept { const float rr = radius + clearance; float best_d2 = FLT_MAX; int i = 0; while (i < segments) { const float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]); if (d2 < best_d2) best_d2 = d2; if (best_d2 <= rr * rr) break; ++i; } return rr - sqrtf(best_d2); } __forceinline 
bool has_transition_reference_angles() const noexcept {
    // True once at least one reference joint angle has been stored.
    return referenceAngleCount > 0;
}

// True when link lengths are part of the state and at least one reference length is stored.
__forceinline bool has_transition_reference_lengths() const noexcept {
    if (!variableLen) return false;
    return referenceLengthCount > 0;
}

// Joint-angle limits are active only for a positive minTheta.
__forceinline bool use_joint_angle_limit_constraints() const noexcept {
    return minTheta > 0.0f;
}

// Number of state entries in use: n angles, plus n lengths when variableLen is set.
__forceinline int full_state_dim() const noexcept {
    return variableLen ? (n + n) : n;
}

// Reference angle of joint i, or 0 when no reference angle is stored for that index.
__forceinline float reference_angle(int i) const noexcept {
    if (i < referenceAngleCount) return referenceState[i];
    return 0.0f;
}

// Reference length of link i, or fixedLength when no reference length is stored for it.
__forceinline float reference_length(int i) const noexcept {
    const bool stored = has_transition_reference_lengths() && i < referenceLengthCount;
    return stored ? referenceState[n + i] : fixedLength;
}

// Difference between q[i] and the reference angle of joint i, passed through wrap_pi.
__forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept {
    const float raw_delta = q[i] - reference_angle(i);
    return wrap_pi(raw_delta);
}

// Largest allowed change of link length i: a fraction of the reference length
// (treated as at least 0.5), never smaller than 0.05.
__forceinline float max_length_step(int i) const noexcept {
    const float base = fmaxf(reference_length(i), 0.5f);
    return fmaxf(transitionMaxLengthStepFactor * base, 0.05f);
}

// Smallest allowed |angle| for joint i: a fraction of the reference magnitude,
// never below transitionMinJointAbsAngleFloor.
__forceinline float min_joint_abs_angle_bound(int i) const noexcept {
    const float ref_magnitude = fabsf(reference_angle(i));
    return fmaxf(transitionMinJointAbsAngleFloor, transitionMinJointAbsAngleRatio * ref_magnitude);
}

// Number of constraint slots reserved for transition continuity.
// Zero outside Transition mode or without reference angles; otherwise one slot per
// reference angle, one per reference length (if any), and one more per reference
// angle for each enabled transitionPreserve* option counted below.
__forceinline unsigned transition_continuity_constraint_count() const noexcept {
    if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles())
        return 0u;

    const unsigned per_angle = static_cast<unsigned>(referenceAngleCount);
    unsigned count = per_angle;
    if (has_transition_reference_lengths())
        count += static_cast<unsigned>(referenceLengthCount);
    if (transitionPreserveJointMagnitudeFloor)
        count += per_angle;
    if (transitionPreserveJointSigns)
        count += per_angle;
    if (transitionPreservePrefixSigns)
        count += per_angle;
    return count;
}

// Number of joint-angle limit slots; the returned expression continues on the next line.
__forceinline unsigned joint_angle_limit_constraint_count() const noexcept {
    return
use_joint_angle_limit_constraints() ? static_cast<unsigned>(n) : 0u; } __forceinline unsigned pose_constraint_base_index() const noexcept { return transition_continuity_constraint_count() + joint_angle_limit_constraint_count(); } static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept { const float dx = x - 5.0f; return 5.0f - sqrtf(fmaf(dx, dx, y * y)); } static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept { const float dx = x - 8.0f; const float dy = y + 3.0f; return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f); } __declspec(noalias) __forceinline void SetTransitionReference( const float* src, int count) noexcept { const int dim = full_state_dim(); referenceStateCount = (count < dim) ? count : dim; referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n; referenceLengthCount = variableLen ? ((referenceStateCount > n) ? (referenceStateCount - n) : 0) : 0; int i = 0; while (i < dim) { if (src != nullptr && i < referenceStateCount) referenceState[i] = src[i]; else if (i < n) referenceState[i] = 0.0f; else referenceState[i] = fixedLength; ++i; } while (i < MAX_STATE_DIM) { referenceState[i] = 0.0f; ++i; } float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += referenceState[i]; referencePrefixAngles[i] = phi_acc; ++i; } while (i < MAX_ANGLE_DIM) { referencePrefixAngles[i] = 0.0f; ++i; } } __forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept { float acc = 0.0f; int i = 0; while (i < referenceAngleCount) { const float d = wrapped_angle_delta(q, i); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_lengths()) return 0.0f; int i = 0; __m256 vacc = _mm256_setzero_ps(); const float* __restrict qL = q + n; const float* __restrict rL = referenceState + n; while (i + 8 <= referenceLengthCount) { const __m256 a = 
_mm256_loadu_ps(qL + i); const __m256 b = _mm256_loadu_ps(rL + i); const __m256 d = _mm256_sub_ps(a, b); vacc = _mm256_fmadd_ps(d, d, vacc); i += 8; } float acc = hsum256_ps(vacc); while (i < referenceLengthCount) { const float d = qL[i] - rL[i]; acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_angles()) return 0.0f; float acc = 0.0f; float phi_acc = 0.0f; int i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; const float d = wrap_pi(phi_acc - referencePrefixAngles[i]); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionEnergy(const float* __restrict q) const noexcept { float acc = transitionEnergyWeight * TransitionAngleEnergy(q); acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc); acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc); return acc; } __declspec(noalias) __forceinline ManipCost( int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta, float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept : n((_n <= 0) ? 0 : (_variableLen ? ((_n > MAX_VARLEN_N) ? MAX_VARLEN_N : _n) : ((_n > MAX_FIXED_N) ? MAX_FIXED_N : _n))), variableLen(_variableLen), fixedLength(_fixedLength), stretchFactor(_stretchFactor), targetX(_targetX), targetY(_targetY), minTheta(_minTheta), archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f), obstacleCount((_obstacleCount > MAX_OBSTACLES) ? 
MAX_OBSTACLES : _obstacleCount),
      constraintMode(0u),
      obstacleClearance(OBSTACLE_CLEARANCE) {
    // Start from fully zeroed obstacle slots so unused entries hold defined values.
    for (unsigned slot = 0u; slot < MAX_OBSTACLES; ++slot) {
        obstacles[slot].cx = 0.0f;
        obstacles[slot].cy = 0.0f;
        obstacles[slot].half = 0.0f;
        obstacles[slot].pad = 0.0f;
    }
    // obstacleData is read as packed (cx, cy, half) triples, one per obstacle.
    if (obstacleData) {
        for (unsigned slot = 0u; slot < obstacleCount; ++slot) {
            obstacles[slot].cx = obstacleData[3u * slot + 0u];
            obstacles[slot].cy = obstacleData[3u * slot + 1u];
            obstacles[slot].half = obstacleData[3u * slot + 2u];
        }
    }
    // No transition reference yet: counts become 0 and the reference state gets its defaults.
    SetTransitionReference(nullptr, 0);
}

// Length of link i: taken from the state for variable-length chains, fixedLength otherwise.
__declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept {
    return variableLen ? q[n + i] : fixedLength;
}

// Positioning objective: distance from the end point (x, y) to the target plus a
// per-joint penalty archBiasW * softplus(-archBiasK * q[i]).
//   q    - state vector; only the first n entries (joint angles) are read here.
//   x, y - end point of the chain for this state.
__declspec(noalias) __forceinline float compute_positioning_objective_from_pose(
    const float* __restrict q, float x, float y) const noexcept {
    float archPen = 0.0f;
    for (int i = 0; i < n; ++i) {
        const float t = fmaf(-q[i], archBiasK, 0.0f);
        // softplus(t) = log(1 + e^t). For t > 10 it equals t to within about 4.5e-5,
        // so the shortcut keeps the function continuous at the switch.
        // The earlier polynomial stand-in for e^t had its coefficients shifted by one
        // power (1 + t/2 + t^2/6 + ...) and was applied to every t <= 10; that made the
        // penalty jump at t = 10 and grow again for strongly negative t. expf costs
        // more per joint but gives a monotone penalty that vanishes for negative t.
        const float sp = (t > 10.0f) ? t : log1pf(expf(t));
        archPen = fmaf(archBiasW, sp, archPen);
    }

    const float dx = x - targetX;
    const float dy = y - targetY;
    const float dist = sqrtf(fmaf(dx, dx, dy * dy));
    return dist + archPen;
}

// Transition objective: transitionCaptureWeight times the squared distance from
// (x, y) to the target, plus the transition energy of q.
__declspec(noalias) __forceinline float compute_transition_objective_from_pose(
    const float* __restrict q, float x, float y) const noexcept {
    const float dx = x - targetX;
    const float dy = y - targetY;
    const float dist2 = fmaf(dx, dx, dy * dy);
    return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q));
}

// Obstacle constraints need both the mode bit and at least one obstacle.
__declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept {
    const bool enabled = (constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u;
    return enabled && (obstacleCount > 0u);
}

// The nonlinear (circle) constraints are controlled by their mode bit alone.
__declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept {
    return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u;
}

// Total number of constraint slots; the declaration continues on the next line.
__declspec(noalias) __forceinline unsigned total_constraints() const
noexcept { unsigned total = transition_continuity_constraint_count(); total += joint_angle_limit_constraint_count(); if (use_obstacle_constraints()) total += obstacleCount; if (use_nonlinear_constraints()) total += transitionCheckInnerCircleAgainstLinks ? 3u : 2u; return total; } __declspec(noalias) __forceinline unsigned feasible_index() const noexcept { return total_constraints(); } static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept { const float dx = x - 5.0f; return dx * dx + y * y - 25.0f; } static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept { const float dx = x - 8.0f; const float dy = y + 3.0f; return 7.7f - (dx * dx + dy * dy); } __declspec(noalias) __forceinline bool evaluate_transition_continuity_only( const float* __restrict q, unsigned& out_index, float& out_value) const noexcept { unsigned idx = 0u; if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles()) return true; int i = 0; while (i < referenceAngleCount) { const float d = wrapped_angle_delta(q, i); const float viol = fabsf(d) - transitionMaxAngleStep; if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } ++idx; ++i; } if (has_transition_reference_lengths()) { i = 0; while (i < referenceLengthCount) { const float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i); if (dL > 0.0f) { out_index = idx; out_value = dL; return false; } ++idx; ++i; } } if (transitionPreserveJointMagnitudeFloor) { i = 0; while (i < referenceAngleCount) { const float refAbs = fabsf(referenceState[i]); if (refAbs > transitionJointSignDeadband) { const float minAbs = min_joint_abs_angle_bound(i); const float viol = minAbs - fabsf(q[i]); if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } } ++idx; ++i; } } if (transitionPreserveJointSigns) { i = 0; while (i < referenceAngleCount) { const float ref = referenceState[i]; if (fabsf(ref) > transitionJointSignDeadband) 
{ const float prod = q[i] * ref; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } if (transitionPreservePrefixSigns) { float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; const float refPhi = referencePrefixAngles[i]; if (fabsf(refPhi) > transitionPrefixSignDeadband) { const float prod = phi_acc * refPhi; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } return true; } __declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only( const float* __restrict q, unsigned base_index, unsigned& out_index, float& out_value) const noexcept { if (!use_joint_angle_limit_constraints()) return true; const __m256 vlimit = _mm256_set1_ps(minTheta); const __m256 vzero = _mm256_setzero_ps(); const __m256 vabsmsk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); int i = 0; while (i + 8 <= n) { const __m256 vq = _mm256_loadu_ps(q + i); const __m256 vabs = _mm256_and_ps(vq, vabsmsk); const __m256 vviol = _mm256_sub_ps(vabs, vlimit); const __m256 vcmp = _mm256_cmp_ps(vviol, vzero, _CMP_GT_OQ); const int mask = _mm256_movemask_ps(vcmp); if (mask) { unsigned first = 0u; while (((mask >> first) & 1) == 0) ++first; alignas(32) float tmp[8]; _mm256_store_ps(tmp, vviol); out_index = base_index + static_cast<unsigned>(i) + first; out_value = tmp[first]; return false; } i += 8; } while (i < n) { const float viol = fabsf(q[i]) - minTheta; if (viol > 0.0f) { out_index = base_index + static_cast<unsigned>(i); out_value = viol; return false; } ++i; } return true; } __declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays( const float* __restrict px, const float* __restrict py, float x, float y, unsigned base_index, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { float geom_clearance = FLT_MAX; unsigned idx = base_index; if (use_obstacle_constraints()) { unsigned j = 0u; while (j < obstacleCount) { const float viol 
= polyline_square_violation(px, py, n, obstacles[j], obstacleClearance); const float clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; ++j; } } if (use_nonlinear_constraints()) { if (transitionCheckInnerCircleAgainstLinks) { const float viol = polyline_circle_violation(px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f); const float clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; } const float c0 = nonlinear_constraint_0(x, y); const float c0_clearance = outer_circle_clearance(x, y); if (c0_clearance < geom_clearance) geom_clearance = c0_clearance; if (c0 > 0.0f) { out_index = idx; out_value = c0; if (out_geom_clearance) *out_geom_clearance = c0_clearance; return false; } ++idx; const float c1 = nonlinear_constraint_1(x, y); const float c1_clearance = inner_circle_clearance(x, y); if (c1_clearance < geom_clearance) geom_clearance = c1_clearance; if (c1 > 0.0f) { out_index = idx; out_value = c1; if (out_geom_clearance) *out_geom_clearance = c1_clearance; return false; } ++idx; } if (out_geom_clearance) *out_geom_clearance = geom_clearance; return true; } __declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) { if (out_geom_clearance) *out_geom_clearance = -FLT_MAX; return false; } alignas(32) float px[MAX_FIXED_N + 1]; alignas(32) float py[MAX_FIXED_N + 1]; compute_pose(q, out_x, out_y, px, py); if (!evaluate_pose_constraints_from_arrays( px, py, out_x, out_y, 
pose_constraint_base_index(), out_index, out_value, out_geom_clearance)) return false; return true; } __declspec(noalias) __forceinline float transition_segment_motion_bound( const float* __restrict qa, const float* __restrict qb) const noexcept { float tail_reach = 0.0f; float bound = 0.0f; int i = n - 1; while (i >= 0) { tail_reach += fmaxf(link_length(qa, i), link_length(qb, i)); bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound); --i; } if (variableLen) { i = 0; while (i < n) { bound = fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound); ++i; } } return bound; } __forceinline bool transition_sweep_requires_validation() const noexcept { return (solveMode == ManipSolveMode::Transition) && has_transition_reference_angles() && (use_obstacle_constraints() || use_nonlinear_constraints()); } __declspec(noalias) __forceinline bool build_transition_geom_sample( const float* __restrict q, TransitionGeomSample& out_sample, unsigned& out_index, float& out_value) const noexcept { copy_state_full_dim(q, out_sample.q); return evaluate_state_without_transition_continuity( out_sample.q, out_sample.x, out_sample.y, out_index, out_value, &out_sample.clearance); } __declspec(noalias) __forceinline bool validate_transition_segment_recursive( const TransitionGeomSample& a, const TransitionGeomSample& b, int depth, unsigned& out_index, float& out_value) const noexcept { constexpr float SWEEP_CLEARANCE_EPS = 1e-4f; constexpr float SWEEP_MIN_BOUND = 5e-4f; const float motion_bound = transition_segment_motion_bound(a.q, b.q); if (motion_bound <= SWEEP_CLEARANCE_EPS) return true; const float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > required_clearance && b.clearance > required_clearance) return true; TransitionGeomSample mid{}; lerp_state_full_dim(a.q, b.q, 0.5f, mid.q); if (!evaluate_state_without_transition_continuity( mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance)) return false; const float half_required_clearance = 0.5f * motion_bound + 
SWEEP_CLEARANCE_EPS; if (a.clearance > half_required_clearance && mid.clearance > half_required_clearance && b.clearance > half_required_clearance) return true; if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) { TransitionGeomSample q25{}, q75{}; lerp_state_full_dim(a.q, b.q, 0.25f, q25.q); if (!evaluate_state_without_transition_continuity( q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance)) return false; lerp_state_full_dim(a.q, b.q, 0.75f, q75.q); if (!evaluate_state_without_transition_continuity( q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance)) return false; return true; } return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) && validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_transition_swept_motion( const float* __restrict q, float final_x, float final_y, float final_geom_clearance, unsigned& out_index, float& out_value) const noexcept { if (!transition_sweep_requires_validation()) return true; TransitionGeomSample start{}; if (!build_transition_geom_sample(referenceState, start, out_index, out_value)) return false; TransitionGeomSample finish{}; copy_state_full_dim(q, finish.q); finish.x = final_x; finish.y = final_y; finish.clearance = final_geom_clearance; constexpr int SWEEP_MAX_DEPTH = 10; return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_indexed( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value) const noexcept { if (!evaluate_transition_continuity_only(q, out_index, out_value)) return false; float final_geom_clearance = FLT_MAX; if (!evaluate_state_without_transition_continuity( q, out_x, out_y, out_index, out_value, &final_geom_clearance)) return false; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { if (!transition_sweep_requires_validation()) 
{ if (!evaluate_joint_angle_limits_only( referenceState, transition_continuity_constraint_count(), out_index, out_value)) return false; } else if (!evaluate_transition_swept_motion( q, out_x, out_y, final_geom_clearance, out_index, out_value)) { return false; } } out_index = feasible_index(); if (solveMode == ManipSolveMode::Transition) { out_value = compute_transition_objective_from_pose(q, out_x, out_y); return true; } out_value = compute_positioning_objective_from_pose(q, out_x, out_y); return true; } __declspec(noalias) __forceinline float operator()( const float* __restrict q, float& out_x, float& out_y) const noexcept { compute_pose(q, out_x, out_y, nullptr, nullptr); if (solveMode == ManipSolveMode::Transition) return compute_transition_objective_from_pose(q, out_x, out_y); return compute_positioning_objective_from_pose(q, out_x, out_y); } };

Что я бы считал главным исправлением:

  • заменить все _mm256_load_ps/_mm256_store_ps, работающие с внешними или потенциально невыровненными данными, на _mm256_loadu_ps/_mm256_storeu_ps;
  • не требовать 32-byte alignment от самого ManipCost, если кодовая база может выделять его через нестандартный allocator;
  • не закладывать в сам объект требование выравнивания, при нарушении которого код перестаёт работать (aligned load/store по невыровненному адресу приводит к падению).

Если хочешь, я следующим сообщением могу дать ещё более агрессивную версию именно под твой реальный диапазон n — например, отдельно заточенную под n <= 16 или n <= 32, где можно выжать ещё больше.

इस प्रश्नोत्तर को साझा करें