"struct ManipCost final { int n; bool variableLe...
Criado em: 1 de abril de 2026
Respondido usando GPT-5.4 Thinking por Chat01
Criado em: 1 de abril de 2026
Respondido usando GPT-5.4 Thinking por Chat01
"struct ManipCost final {
// Number of joint angles; the methods below read q[0..n) as angles.
int n;
// When true, q[n..2n) holds one length per link; otherwise every link uses fixedLength.
bool variableLen;
// fixedLength: shared link length when variableLen is false (also the default reference length).
// targetX/targetY: point the end of the chain is measured against in both objectives.
// minTheta: when > 0, |q[i]| - minTheta > 0 is reported as a violation, i.e. it acts as an
//           upper bound on |angle| despite the name.
// archBiasW/archBiasK: weight and slope of the per-joint penalty in the positioning objective.
// stretchFactor, sharpW: stored by the constructor; not read by any code visible in this chunk.
float fixedLength, stretchFactor, targetX, targetY, minTheta, archBiasW, archBiasK, sharpW;
// obstacleCount: number of valid entries in obstacles[] (clamped to MAX_OBSTACLES by the constructor).
// constraintMode: bit mask tested against MANIP_CONSTRAINT_OBSTACLES / MANIP_CONSTRAINT_NONLINEAR.
unsigned obstacleCount, constraintMode;
SquareObstacle obstacles[MAX_OBSTACLES];
// Clearance passed to polyline_square_violation for every obstacle.
float obstacleClearance;
// Selects which objective evaluate_indexed returns and whether transition constraints apply.
ManipSolveMode solveMode = ManipSolveMode::Positioning;
// Reference configuration in the same layout as q (angles first, then lengths), zero-padded to 64.
alignas(32) float referenceState[64]{};
// Running sums of the reference angles, filled by SetTransitionReference.
alignas(32) float referencePrefixAngles[32]{};
// How many entries of referenceState are valid in total / as angles / as lengths.
int referenceStateCount = 0, referenceAngleCount = 0, referenceLengthCount = 0;
// Weights of the transition objective (capture distance and the three energy terms), the
// per-step limits on angle and length change, the dead bands below which a reference value
// is ignored by the sign/magnitude checks, and the ratio/floor used by min_joint_abs_angle_bound.
float transitionCaptureWeight = 10000.0f, transitionEnergyWeight = 1.0f,
transitionLengthEnergyWeight = 0.35f, transitionPrefixEnergyWeight = 0.175f,
transitionMaxAngleStep = 0.5f, transitionMaxLengthStepFactor = 0.15f,
transitionJointSignDeadband = 0.1f, transitionPrefixSignDeadband = 0.2f,
transitionMinJointAbsAngleRatio = 0.1f, transitionMinJointAbsAngleFloor = 0.0175f;
// Switches for the optional transition-continuity constraint groups and for the
// link-versus-inner-circle check performed under the nonlinear constraint mode.
bool transitionPreserveJointMagnitudeFloor = true,
transitionPreserveJointSigns = true,
transitionPreservePrefixSigns = true,
transitionCheckInnerCircleAgainstLinks = true;
// One evaluated configuration along a transition: the zero-padded state vector, the resulting
// end point, and the smallest geometric clearance found for it (FLT_MAX until evaluated).
struct TransitionGeomSample final {
    alignas(32) float q[64]{};  // 32-byte aligned so it can be used with aligned AVX loads/stores
    float x = 0.0f;
    float y = 0.0f;
    float clearance = FLT_MAX;
};
private:
// Compile-time unrolled running sum: for k in [I, N) adds th[k] to acc and stores the
// running total in phi[k]. acc is both the incoming offset and the outgoing total.
template<int I, int N>
static __forceinline void agp_pose_prefix(const float* __restrict th, float& acc, float* __restrict phi) noexcept {
    if constexpr (I >= N) {
        return;  // end of the unrolled chain
    }
    else {
        const float running = fmaf(th[I], 1.0f, acc);
        acc = running;
        phi[I] = running;
        agp_pose_prefix<I + 1, N>(th, acc, phi);
    }
}
// Unrolled end-point accumulation for links I..N-1 when every link has length fixedLength.
// s_arr/c_arr hold the sine/cosine of each link's absolute angle. With STORE the end point
// of link I is written to px[I + 1], py[I + 1]; without it px/py are never touched.
template<int I, int N, bool STORE>
__forceinline void agp_pose_accum_fixed(
    const float* __restrict q, const float* __restrict s_arr, const float* __restrict c_arr,
    float& x, float& y, float* __restrict px, float* __restrict py) const noexcept {
    if constexpr (I < N) {
        const float Li = fixedLength;
        x = fmaf(Li, c_arr[I], x);
        y = fmaf(Li, s_arr[I], y);
        if constexpr (STORE) {
            px[I + 1] = x;
            py[I + 1] = y;
        }
        agp_pose_accum_fixed<I + 1, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    }
}

// Same accumulation as agp_pose_accum_fixed, but the length of link I is read from q[n + I].
template<int I, int N, bool STORE>
__forceinline void agp_pose_accum_var(
    const float* __restrict q, const float* __restrict s_arr, const float* __restrict c_arr,
    float& x, float& y, float* __restrict px, float* __restrict py) const noexcept {
    if constexpr (I < N) {
        const float Li = q[n + I];
        x = fmaf(Li, c_arr[I], x);
        y = fmaf(Li, s_arr[I], y);
        if constexpr (STORE) {
            px[I + 1] = x;
            py[I + 1] = y;
        }
        agp_pose_accum_var<I + 1, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    }
}

// Forward kinematics for a chain of exactly N links (N <= 16, the size of the local buffers).
// Builds the running angle sums, evaluates their sine/cosine through FABE13_SINCOS, then
// accumulates the link vectors starting from the origin. The end point is returned through
// out_x/out_y; with STORE the base (0, 0) and every link end point go to px[0..N], py[0..N].
template<int N, bool VARLEN, bool STORE>
__forceinline void agp_compute_pose_fixedn(
    const float* __restrict q, float& out_x, float& out_y,
    float* __restrict px, float* __restrict py) const noexcept {
    alignas(32) float phi[16], s_arr[16], c_arr[16];
    float phi_acc = 0.0f;
    if constexpr (STORE) {
        px[0] = 0.0f;
        py[0] = 0.0f;
    }
    agp_pose_prefix<0, N>(q, phi_acc, phi);
    FABE13_SINCOS(phi, s_arr, c_arr, N);
    float x = 0.0f, y = 0.0f;
    if constexpr (VARLEN)
        agp_pose_accum_var<0, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    else
        agp_pose_accum_fixed<0, N, STORE>(q, s_arr, c_arr, x, y, px, py);
    out_x = x;
    out_y = y;
}
public:
__declspec(noalias) __forceinline void compute_pose(const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept {
const bool store = (px != nullptr) & (py != nullptr);
if (n <= 0) {
if (store) { px[0] = 0.0f; py[0] = 0.0f; }
out_x = out_y = 0.0f;
return;
}
if (static_cast<unsigned>(n) <= 16u) {
if (variableLen) {
if (store) {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, true, true>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, true, true>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, true, true>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, true, true>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, true, true>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, true, true>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, true, true>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, true, true>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, true, true>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, true, true>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, true, true>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, true, true>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, true, true>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, true, true>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, true, true>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, true, true>(q, out_x, out_y, px, py); return;
}
}
else {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, true, false>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, true, false>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, true, false>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, true, false>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, true, false>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, true, false>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, true, false>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, true, false>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, true, false>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, true, false>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, true, false>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, true, false>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, true, false>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, true, false>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, true, false>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, true, false>(q, out_x, out_y, px, py); return;
}
}
}
else {
if (store) {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, false, true>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, false, true>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, false, true>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, false, true>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, false, true>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, false, true>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, false, true>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, false, true>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, false, true>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, false, true>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, false, true>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, false, true>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, false, true>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, false, true>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, false, true>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, false, true>(q, out_x, out_y, px, py); return;
}
}
else {
switch (n) {
case 1: agp_compute_pose_fixedn< 1, false, false>(q, out_x, out_y, px, py); return;
case 2: agp_compute_pose_fixedn< 2, false, false>(q, out_x, out_y, px, py); return;
case 3: agp_compute_pose_fixedn< 3, false, false>(q, out_x, out_y, px, py); return;
case 4: agp_compute_pose_fixedn< 4, false, false>(q, out_x, out_y, px, py); return;
case 5: agp_compute_pose_fixedn< 5, false, false>(q, out_x, out_y, px, py); return;
case 6: agp_compute_pose_fixedn< 6, false, false>(q, out_x, out_y, px, py); return;
case 7: agp_compute_pose_fixedn< 7, false, false>(q, out_x, out_y, px, py); return;
case 8: agp_compute_pose_fixedn< 8, false, false>(q, out_x, out_y, px, py); return;
case 9: agp_compute_pose_fixedn< 9, false, false>(q, out_x, out_y, px, py); return;
case 10: agp_compute_pose_fixedn<10, false, false>(q, out_x, out_y, px, py); return;
case 11: agp_compute_pose_fixedn<11, false, false>(q, out_x, out_y, px, py); return;
case 12: agp_compute_pose_fixedn<12, false, false>(q, out_x, out_y, px, py); return;
case 13: agp_compute_pose_fixedn<13, false, false>(q, out_x, out_y, px, py); return;
case 14: agp_compute_pose_fixedn<14, false, false>(q, out_x, out_y, px, py); return;
case 15: agp_compute_pose_fixedn<15, false, false>(q, out_x, out_y, px, py); return;
default: agp_compute_pose_fixedn<16, false, false>(q, out_x, out_y, px, py); return;
}
}
}
}
textconst float* __restrict th = q; alignas(32) float phi[32], s_arr[32], c_arr[32]; float x = 0.0f, y = 0.0f, phi_acc = 0.0f; if (store) { px[0] = 0.0f; py[0] = 0.0f; } int i = 0; __pragma(loop(ivdep)) while (i < n) { phi_acc = fmaf(th[i], 1.0f, phi_acc); phi[i] = phi_acc; ++i; } FABE13_SINCOS(phi, s_arr, c_arr, n); i = 0; __pragma(loop(ivdep)) while (i < n) { const float Li = variableLen ? q[n + i] : fixedLength; x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); if (store) { px[i + 1] = x; py[i + 1] = y; } ++i; } out_x = x; out_y = y; } __declspec(noalias) __forceinline void copy_state_full_dim(const float* __restrict src, float* __restrict dst) const noexcept { const int dim = full_state_dim(); int i = 0; __pragma(loop(ivdep)) while (i + 8 <= dim) { __m256 v = _mm256_load_ps(src + i); _mm256_store_ps(dst + i, v); i += 8; } __pragma(loop(ivdep)) while (i + 4 <= dim) { __m128 v = _mm_load_ps(src + i); _mm_store_ps(dst + i, v); i += 4; } while (i < dim) { dst[i] = src[i]; ++i; } static __declspec(align(32)) const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= 64) { _mm256_store_ps(dst + i, vz); i += 8; } while (i < 64) { dst[i] = 0.0f; ++i; } } __declspec(noalias) __forceinline void lerp_state_full_dim(const float* __restrict qa, const float* __restrict qb, float t, float* __restrict out_q) const noexcept { const int dim = full_state_dim(); const __m256 vt = _mm256_set1_ps(t); int i = 0; __pragma(loop(ivdep)) while (i + 8 <= dim) { const __m256 va = _mm256_load_ps(qa + i); const __m256 vb = _mm256_load_ps(qb + i); _mm256_store_ps(out_q + i, _mm256_fmadd_ps(vt, _mm256_sub_ps(vb, va), va)); i += 8; } while (i < dim) { out_q[i] = fmaf(t, fmaf(-qa[i], 1.0f, qb[i]), qa[i]); ++i; } static __declspec(align(32)) const __m256 vz = _mm256_setzero_ps(); while (i + 8 <= 64) { _mm256_store_ps(out_q + i, vz); i += 8; } while (i < 64) { out_q[i] = 0.0f; ++i; } } static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept { constexpr float PI = 
3.1415926535897932384626433832795f, TWO_PI = 6.2831853071795864769252867665590f; float k = floorf((a + PI) / TWO_PI); a -= k * TWO_PI; if (a > PI) a -= TWO_PI; else if (a < -PI) a += TWO_PI; return a; } static __declspec(noalias) __forceinline float point_segment_distance_sq( float px, float py, float ax, float ay, float bx, float by) noexcept { float abx = bx - ax, aby = by - ay, apx = px - ax, apy = py - ay, ab2 = fmaf(abx, abx, aby * aby); if (ab2 <= 1e-20f) return fmaf(apx, apx, apy * apy); float t = (apx * abx + apy * aby) / ab2; if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f; float qx = fmaf(t, abx, ax), qy = fmaf(t, aby, ay), dx = px - qx, dy = py - qy; return fmaf(dx, dx, dy * dy); } static __declspec(noalias) __forceinline float polyline_circle_violation( const float* __restrict px, const float* __restrict py, int segments, float cx, float cy, float radius, float clearance) noexcept { float rr = radius + clearance, best_d2 = FLT_MAX; int i = 0; while (i < segments) { float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]); if (d2 < best_d2) best_d2 = d2; if (best_d2 <= rr * rr) break; ++i; } return rr - sqrtf(best_d2); } __forceinline bool has_transition_reference_angles() const noexcept { return referenceAngleCount > 0; } __forceinline bool has_transition_reference_lengths() const noexcept { return variableLen && (referenceLengthCount > 0); } __forceinline bool use_joint_angle_limit_constraints() const noexcept { return minTheta > 0.0f; } __forceinline int full_state_dim() const noexcept { return n + (variableLen ? n : 0); } __forceinline float reference_angle(int i) const noexcept { return (i < referenceAngleCount) ? referenceState[i] : 0.0f; } __forceinline float reference_length(int i) const noexcept { return (has_transition_reference_lengths() && i < referenceLengthCount) ? 
referenceState[n + i] : fixedLength; } __forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept { return wrap_pi(q[i] - reference_angle(i)); } __forceinline float max_length_step(int i) const noexcept { float refL = reference_length(i), absStep = transitionMaxLengthStepFactor * fmaxf(refL, 0.5f); return fmaxf(absStep, 0.05f); } __forceinline float min_joint_abs_angle_bound(int i) const noexcept { float refAbs = fabsf(reference_angle(i)), scaled = transitionMinJointAbsAngleRatio * refAbs; return fmaxf(transitionMinJointAbsAngleFloor, scaled); } __forceinline unsigned transition_continuity_constraint_count() const noexcept { unsigned total = 0u; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { total += static_cast<unsigned>(referenceAngleCount); if (has_transition_reference_lengths()) total += static_cast<unsigned>(referenceLengthCount); if (transitionPreserveJointMagnitudeFloor) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreserveJointSigns) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreservePrefixSigns) total += static_cast<unsigned>(referenceAngleCount); } return total; } __forceinline unsigned joint_angle_limit_constraint_count() const noexcept { return use_joint_angle_limit_constraints() ? 
static_cast<unsigned>(n) : 0u; } __forceinline unsigned pose_constraint_base_index() const noexcept { return transition_continuity_constraint_count() + joint_angle_limit_constraint_count(); } static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept { float dx = x - 5.0f; return 5.0f - sqrtf(fmaf(dx, dx, y * y)); } static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept { float dx = x - 8.0f, dy = y + 3.0f; return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f); } __declspec(noalias) __forceinline void SetTransitionReference(const float* src, int count) noexcept { int dim = full_state_dim(); referenceStateCount = (count < dim) ? count : dim; referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n; referenceLengthCount = variableLen ? ((referenceStateCount > n) ? (referenceStateCount - n) : 0) : 0; int i = 0; while (i < dim) { if (src != nullptr && i < referenceStateCount) referenceState[i] = src[i]; else if (i < n) referenceState[i] = 0.0f; else referenceState[i] = fixedLength; ++i; } while (i < 64) { referenceState[i] = 0.0f; ++i; } float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += referenceState[i]; referencePrefixAngles[i] = phi_acc; ++i; } while (i < n) { referencePrefixAngles[i] = 0.0f; ++i; } } __forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept { float acc = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_lengths()) return 0.0f; float acc = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < referenceLengthCount) { float d = q[n + i] - referenceState[n + i]; acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept { if 
(!has_transition_reference_angles()) return 0.0f; float acc = 0.0f, phi_acc = 0.0f; int i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float d = wrap_pi(phi_acc - referencePrefixAngles[i]); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionEnergy(const float* __restrict q) const noexcept { float acc = transitionEnergyWeight * TransitionAngleEnergy(q); acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc); acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc); return acc; } __declspec(noalias) __forceinline ManipCost( int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta, float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept : n(_n), variableLen(_variableLen), fixedLength(_fixedLength), stretchFactor(_stretchFactor), targetX(_targetX), targetY(_targetY), minTheta(_minTheta), archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f), obstacleCount((_obstacleCount > MAX_OBSTACLES) ? MAX_OBSTACLES : _obstacleCount), constraintMode(0u), obstacleClearance(OBSTACLE_CLEARANCE) { unsigned i = 0u; while (i < MAX_OBSTACLES) { obstacles[i].cx = obstacles[i].cy = obstacles[i].half = obstacles[i].pad = 0.0f; ++i; } if (obstacleData) { i = 0u; while (i < obstacleCount) { obstacles[i].cx = obstacleData[3u * i + 0u]; obstacles[i].cy = obstacleData[3u * i + 1u]; obstacles[i].half = obstacleData[3u * i + 2u]; ++i; } } SetTransitionReference(nullptr, 0); } __declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept { return variableLen ? 
q[n + i] : fixedLength; } __declspec(noalias) __forceinline float compute_positioning_objective_from_pose(const float* __restrict q, float x, float y) const noexcept { const float* __restrict th = q; float archPen = 0.0f; int i = 0; __pragma(loop(ivdep)) while (i < n) { float theta = th[i], t = fmaf(-theta, archBiasK, 0.0f), sp; if (t > 10.0f) sp = t; else { float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); sp = log1pf(exp_val); } archPen = fmaf(archBiasW, sp, archPen); ++i; } float dx = fmaf(x, 1.0f, -targetX), dy = fmaf(y, 1.0f, -targetY), dist = sqrtf(fmaf(dx, dx, dy * dy)); return fmaf(dist, 1.0f, archPen); } __declspec(noalias) __forceinline float compute_transition_objective_from_pose(const float* __restrict q, float x, float y) const noexcept { float dx = x - targetX, dy = y - targetY, dist2 = fmaf(dx, dx, dy * dy); return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q)); } __declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept { return ((constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u) && (obstacleCount > 0u); } __declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept { return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u; } __declspec(noalias) __forceinline unsigned total_constraints() const noexcept { unsigned total = transition_continuity_constraint_count(); total += joint_angle_limit_constraint_count(); if (use_obstacle_constraints()) total += obstacleCount; if (use_nonlinear_constraints()) total += transitionCheckInnerCircleAgainstLinks ? 
3u : 2u; return total; } __declspec(noalias) __forceinline unsigned feasible_index() const noexcept { return total_constraints(); } static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept { float dx = x - 5.0f; return dx * dx + y * y - 25.0f; } static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept { float dx = x - 8.0f, dy = y + 3.0f; return 7.7f - (dx * dx + dy * dy); } __declspec(noalias) __forceinline bool evaluate_transition_continuity_only( const float* __restrict q, unsigned& out_index, float& out_value) const noexcept { unsigned idx = 0u; if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles()) return true; int i = 0; while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i), viol = fabsf(d) - transitionMaxAngleStep; if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } ++idx; ++i; } if (has_transition_reference_lengths()) { i = 0; while (i < referenceLengthCount) { float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i); if (dL > 0.0f) { out_index = idx; out_value = dL; return false; } ++idx; ++i; } } if (transitionPreserveJointMagnitudeFloor) { i = 0; while (i < referenceAngleCount) { float refAbs = fabsf(referenceState[i]); if (refAbs > transitionJointSignDeadband) { float minAbs = min_joint_abs_angle_bound(i), viol = minAbs - fabsf(q[i]); if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } } ++idx; ++i; } } if (transitionPreserveJointSigns) { i = 0; while (i < referenceAngleCount) { float ref = referenceState[i]; if (fabsf(ref) > transitionJointSignDeadband) { float prod = q[i] * ref; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } if (transitionPreservePrefixSigns) { float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float refPhi = referencePrefixAngles[i]; if (fabsf(refPhi) > transitionPrefixSignDeadband) { float 
prod = phi_acc * refPhi; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } return true; } __declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only( const float* __restrict q, unsigned base_index, unsigned& out_index, float& out_value) const noexcept { if (!use_joint_angle_limit_constraints()) return true; int i = 0; while (i < n) { float viol = fabsf(q[i]) - minTheta; if (viol > 0.0f) { out_index = base_index + static_cast<unsigned>(i); out_value = viol; return false; } ++i; } return true; } __declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays( const float* __restrict px, const float* __restrict py, float x, float y, unsigned base_index, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { float geom_clearance = FLT_MAX; unsigned idx = base_index; if (use_obstacle_constraints()) { unsigned j = 0u; while (j < obstacleCount) { float viol = polyline_square_violation(px, py, n, obstacles[j], obstacleClearance), clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; ++j; } } if (use_nonlinear_constraints()) { if (transitionCheckInnerCircleAgainstLinks) { float viol = polyline_circle_violation(px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f), clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; } float c0 = nonlinear_constraint_0(x, y), c0_clearance = outer_circle_clearance(x, y); if (c0_clearance < geom_clearance) geom_clearance = c0_clearance; if (c0 > 0.0f) { out_index = idx; out_value = c0; if (out_geom_clearance) *out_geom_clearance = c0_clearance; return false; } ++idx; float c1 = nonlinear_constraint_1(x, y), c1_clearance = 
inner_circle_clearance(x, y); if (c1_clearance < geom_clearance) geom_clearance = c1_clearance; if (c1 > 0.0f) { out_index = idx; out_value = c1; if (out_geom_clearance) *out_geom_clearance = c1_clearance; return false; } ++idx; } if (out_geom_clearance) *out_geom_clearance = geom_clearance; return true; } __declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) { if (out_geom_clearance) *out_geom_clearance = -FLT_MAX; return false; } alignas(32) float px[33], py[33]; compute_pose(q, out_x, out_y, px, py); if (!evaluate_pose_constraints_from_arrays(px, py, out_x, out_y, pose_constraint_base_index(), out_index, out_value, out_geom_clearance)) return false; return true; } __declspec(noalias) __forceinline float transition_segment_motion_bound(const float* __restrict qa, const float* __restrict qb) const noexcept { float tail_reach = 0.0f, bound = 0.0f; int i = n - 1; while (i >= 0) { tail_reach += fmaxf(link_length(qa, i), link_length(qb, i)); bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound); --i; } if (variableLen) { i = 0; __pragma(loop(ivdep)) while (i < n) { bound = fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound); ++i; } } return bound; } __forceinline bool transition_sweep_requires_validation() const noexcept { return (solveMode == ManipSolveMode::Transition) && has_transition_reference_angles() && (use_obstacle_constraints() || use_nonlinear_constraints()); } __declspec(noalias) __forceinline bool build_transition_geom_sample( const float* __restrict q, TransitionGeomSample& out_sample, unsigned& out_index, float& out_value) const noexcept { copy_state_full_dim(q, out_sample.q); return evaluate_state_without_transition_continuity( out_sample.q, out_sample.x, out_sample.y, 
out_index, out_value, &out_sample.clearance); } __declspec(noalias) __forceinline bool validate_transition_segment_recursive( const TransitionGeomSample& a, const TransitionGeomSample& b, int depth, unsigned& out_index, float& out_value) const noexcept { constexpr float SWEEP_CLEARANCE_EPS = 1e-4f, SWEEP_MIN_BOUND = 5e-4f; float motion_bound = transition_segment_motion_bound(a.q, b.q); if (motion_bound <= SWEEP_CLEARANCE_EPS) return true; float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > required_clearance && b.clearance > required_clearance) return true; TransitionGeomSample mid{}; lerp_state_full_dim(a.q, b.q, 0.5f, mid.q); if (!evaluate_state_without_transition_continuity(mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance)) return false; float half_required_clearance = 0.5f * motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > half_required_clearance && mid.clearance > half_required_clearance && b.clearance > half_required_clearance) return true; if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) { TransitionGeomSample q25{}, q75{}; lerp_state_full_dim(a.q, b.q, 0.25f, q25.q); if (!evaluate_state_without_transition_continuity(q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance)) return false; lerp_state_full_dim(a.q, b.q, 0.75f, q75.q); if (!evaluate_state_without_transition_continuity(q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance)) return false; return true; } return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) && validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_transition_swept_motion( const float* __restrict q, float final_x, float final_y, float final_geom_clearance, unsigned& out_index, float& out_value) const noexcept { if (!transition_sweep_requires_validation()) return true; TransitionGeomSample start{}; if (!build_transition_geom_sample(referenceState, start, out_index, out_value)) 
return false; TransitionGeomSample finish{}; copy_state_full_dim(q, finish.q); finish.x = final_x; finish.y = final_y; finish.clearance = final_geom_clearance; constexpr int SWEEP_MAX_DEPTH = 10; return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_indexed( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value) const noexcept { if (!evaluate_transition_continuity_only(q, out_index, out_value)) return false; float final_geom_clearance = FLT_MAX; if (!evaluate_state_without_transition_continuity(q, out_x, out_y, out_index, out_value, &final_geom_clearance)) return false; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { if (!transition_sweep_requires_validation()) { if (!evaluate_joint_angle_limits_only(referenceState, transition_continuity_constraint_count(), out_index, out_value)) return false; } else if (!evaluate_transition_swept_motion(q, out_x, out_y, final_geom_clearance, out_index, out_value)) return false; } out_index = feasible_index(); if (solveMode == ManipSolveMode::Transition) { out_value = compute_transition_objective_from_pose(q, out_x, out_y); return true; } out_value = compute_positioning_objective_from_pose(q, out_x, out_y); return true; }
};" - новая версия, старая версия: "struct ManipCost final {
int n;
bool variableLen;
float fixedLength;
float stretchFactor;
float targetX;
float targetY;
float minTheta;
float archBiasW;
float archBiasK;
float sharpW;
unsigned obstacleCount;
unsigned constraintMode;
SquareObstacle obstacles[MAX_OBSTACLES];
float obstacleClearance;
ManipSolveMode solveMode = ManipSolveMode::Positioning;
textalignas(16) float referenceState[64]{}; alignas(16) float referencePrefixAngles[32]{}; int referenceStateCount = 0; int referenceAngleCount = 0; int referenceLengthCount = 0; float transitionCaptureWeight = 20000.0f; float transitionEnergyWeight = 1.0f; float transitionLengthEnergyWeight = 0.35f; float transitionPrefixEnergyWeight = 0.18f; float transitionMaxAngleStep = 0.40f; float transitionMaxLengthStepFactor = 0.12f; float transitionJointSignDeadband = 0.05f; float transitionPrefixSignDeadband = 0.08f; float transitionMinJointAbsAngleRatio = 0.25f; float transitionMinJointAbsAngleFloor = 0.05f; bool transitionPreserveJointMagnitudeFloor = true; bool transitionPreserveJointSigns = true; bool transitionPreservePrefixSigns = true; bool transitionCheckInnerCircleAgainstLinks = true; struct TransitionGeomSample final { alignas(16) float q[64]{}; float x = 0.0f; float y = 0.0f; float clearance = FLT_MAX; }; static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept { constexpr float PI = 3.1415926535897932384626433832795f; constexpr float TWO_PI = 6.2831853071795864769252867665590f; float k = floorf((a + PI) / TWO_PI); a -= k * TWO_PI; if (a > PI) a -= TWO_PI; else if (a < -PI) a += TWO_PI; return a; } static __declspec(noalias) __forceinline float point_segment_distance_sq( float px, float py, float ax, float ay, float bx, float by) noexcept { float abx = bx - ax; float aby = by - ay; float apx = px - ax; float apy = py - ay; float ab2 = fmaf(abx, abx, aby * aby); if (ab2 <= 1e-20f) return fmaf(apx, apx, apy * apy); float t = (apx * abx + apy * aby) / ab2; if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f; float qx = fmaf(t, abx, ax); float qy = fmaf(t, aby, ay); float dx = px - qx; float dy = py - qy; return fmaf(dx, dx, dy * dy); } static __declspec(noalias) __forceinline float polyline_circle_violation( const float* __restrict px, const float* __restrict py, int segments, float cx, float cy, float radius, float clearance) noexcept { float rr = 
radius + clearance; float best_d2 = FLT_MAX; int i = 0; while (i < segments) { float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]); if (d2 < best_d2) best_d2 = d2; if (best_d2 <= rr * rr) break; ++i; } return rr - sqrtf(best_d2); } __forceinline bool has_transition_reference_angles() const noexcept { return referenceAngleCount > 0; } __forceinline bool has_transition_reference_lengths() const noexcept { return variableLen && (referenceLengthCount > 0); } __forceinline bool use_joint_angle_limit_constraints() const noexcept { return minTheta > 0.0f; } __forceinline int full_state_dim() const noexcept { return n + (variableLen ? n : 0); } __forceinline float reference_angle(int i) const noexcept { return (i < referenceAngleCount) ? referenceState[i] : 0.0f; } __forceinline float reference_length(int i) const noexcept { return (has_transition_reference_lengths() && i < referenceLengthCount) ? referenceState[n + i] : fixedLength; } __forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept { return wrap_pi(q[i] - reference_angle(i)); } __forceinline float max_length_step(int i) const noexcept { float refL = reference_length(i); float absStep = transitionMaxLengthStepFactor * fmaxf(refL, 0.5f); return fmaxf(absStep, 0.05f); } __forceinline float min_joint_abs_angle_bound(int i) const noexcept { float refAbs = fabsf(reference_angle(i)); float scaled = transitionMinJointAbsAngleRatio * refAbs; return fmaxf(transitionMinJointAbsAngleFloor, scaled); } __forceinline unsigned transition_continuity_constraint_count() const noexcept { unsigned total = 0u; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { total += static_cast<unsigned>(referenceAngleCount); if (has_transition_reference_lengths()) total += static_cast<unsigned>(referenceLengthCount); if (transitionPreserveJointMagnitudeFloor) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreserveJointSigns) total += 
static_cast<unsigned>(referenceAngleCount); if (transitionPreservePrefixSigns) total += static_cast<unsigned>(referenceAngleCount); } return total; } __forceinline unsigned joint_angle_limit_constraint_count() const noexcept { return use_joint_angle_limit_constraints() ? static_cast<unsigned>(n) : 0u; } __forceinline unsigned pose_constraint_base_index() const noexcept { return transition_continuity_constraint_count() + joint_angle_limit_constraint_count(); } static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept { float dx = x - 5.0f; return 5.0f - sqrtf(fmaf(dx, dx, y * y)); } static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept { float dx = x - 8.0f; float dy = y + 3.0f; return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f); } __declspec(noalias) __forceinline void SetTransitionReference(const float* src, int count) noexcept { int dim = full_state_dim(); referenceStateCount = (count < dim) ? count : dim; referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n; referenceLengthCount = variableLen ? ((referenceStateCount > n) ? 
(referenceStateCount - n) : 0) : 0; int i = 0; while (i < dim) { if (src != nullptr && i < referenceStateCount) referenceState[i] = src[i]; else if (i < n) referenceState[i] = 0.0f; else referenceState[i] = fixedLength; ++i; } while (i < 64) { referenceState[i] = 0.0f; ++i; } float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += referenceState[i]; referencePrefixAngles[i] = phi_acc; ++i; } while (i < n) { referencePrefixAngles[i] = 0.0f; ++i; } } __forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept { float acc = 0.0f; int i = 0; while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_lengths()) return 0.0f; float acc = 0.0f; int i = 0; while (i < referenceLengthCount) { float d = q[n + i] - referenceState[n + i]; acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_angles()) return 0.0f; float acc = 0.0f; float phi_acc = 0.0f; int i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float d = wrap_pi(phi_acc - referencePrefixAngles[i]); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionEnergy(const float* __restrict q) const noexcept { float acc = transitionEnergyWeight * TransitionAngleEnergy(q); acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc); acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc); return acc; } __declspec(noalias) __forceinline ManipCost( int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta, float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept : n(_n), variableLen(_variableLen), fixedLength(_fixedLength), stretchFactor(_stretchFactor), targetX(_targetX), targetY(_targetY), 
minTheta(_minTheta), archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f), obstacleCount((_obstacleCount > MAX_OBSTACLES) ? MAX_OBSTACLES : _obstacleCount), constraintMode(0u), obstacleClearance(OBSTACLE_CLEARANCE) { unsigned i = 0u; while (i < MAX_OBSTACLES) { obstacles[i].cx = 0.0f; obstacles[i].cy = 0.0f; obstacles[i].half = 0.0f; obstacles[i].pad = 0.0f; ++i; } if (obstacleData) { i = 0u; while (i < obstacleCount) { obstacles[i].cx = obstacleData[3u * i + 0u]; obstacles[i].cy = obstacleData[3u * i + 1u]; obstacles[i].half = obstacleData[3u * i + 2u]; ++i; } } SetTransitionReference(nullptr, 0); } __declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept { return variableLen ? q[n + i] : fixedLength; } __declspec(noalias) __forceinline void compute_pose( const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept { const float* __restrict th = q; alignas(16) float phi[32]; alignas(16) float s_arr[32]; alignas(16) float c_arr[32]; float x = 0.0f; float y = 0.0f; float phi_acc = 0.0f; if (px && py) { px[0] = 0.0f; py[0] = 0.0f; } int i = 0; while (i < n) { phi_acc += th[i]; phi[i] = phi_acc; ++i; } FABE13_SINCOS(phi, s_arr, c_arr, n); i = 0; while (i < n) { float Li = link_length(q, i); x = fmaf(Li, c_arr[i], x); y = fmaf(Li, s_arr[i], y); if (px && py) { px[i + 1] = x; py[i + 1] = y; } ++i; } out_x = x; out_y = y; } __declspec(noalias) __forceinline float compute_positioning_objective_from_pose( const float* __restrict q, float x, float y) const noexcept { const float* __restrict th = q; float archPen = 0.0f; int i = 0; while (i < n) { float theta = th[i]; float t = fmaf(-theta, archBiasK, 0.0f); float sp; if (t > 10.0f) { sp = t; } else { float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); sp = log1pf(exp_val); } archPen = fmaf(archBiasW, sp, archPen); ++i; } float dx = fmaf(x, 1.0f, -targetX); float dy = fmaf(y, 
1.0f, -targetY); float dist = sqrtf(fmaf(dx, dx, dy * dy)); return fmaf(dist, 1.0f, archPen); } __declspec(noalias) __forceinline float compute_transition_objective_from_pose( const float* __restrict q, float x, float y) const noexcept { float dx = x - targetX; float dy = y - targetY; float dist2 = fmaf(dx, dx, dy * dy); return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q)); } __declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept { return ((constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u) && (obstacleCount > 0u); } __declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept { return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u; } __declspec(noalias) __forceinline unsigned total_constraints() const noexcept { unsigned total = transition_continuity_constraint_count(); total += joint_angle_limit_constraint_count(); if (use_obstacle_constraints()) total += obstacleCount; if (use_nonlinear_constraints()) total += transitionCheckInnerCircleAgainstLinks ? 
3u : 2u; return total; } __declspec(noalias) __forceinline unsigned feasible_index() const noexcept { return total_constraints(); } static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept { float dx = x - 5.0f; return dx * dx + y * y - 25.0f; } static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept { float dx = x - 8.0f; float dy = y + 3.0f; return 7.7f - (dx * dx + dy * dy); } __declspec(noalias) __forceinline bool evaluate_transition_continuity_only( const float* __restrict q, unsigned& out_index, float& out_value) const noexcept { unsigned idx = 0u; if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles()) return true; int i = 0; while (i < referenceAngleCount) { float d = wrapped_angle_delta(q, i); float viol = fabsf(d) - transitionMaxAngleStep; if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } ++idx; ++i; } if (has_transition_reference_lengths()) { i = 0; while (i < referenceLengthCount) { float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i); if (dL > 0.0f) { out_index = idx; out_value = dL; return false; } ++idx; ++i; } } if (transitionPreserveJointMagnitudeFloor) { i = 0; while (i < referenceAngleCount) { float refAbs = fabsf(referenceState[i]); if (refAbs > transitionJointSignDeadband) { float minAbs = min_joint_abs_angle_bound(i); float viol = minAbs - fabsf(q[i]); if (viol > 0.0f) { out_index = idx; out_value = viol; return false; } } ++idx; ++i; } } if (transitionPreserveJointSigns) { i = 0; while (i < referenceAngleCount) { float ref = referenceState[i]; if (fabsf(ref) > transitionJointSignDeadband) { float prod = q[i] * ref; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } if (transitionPreservePrefixSigns) { float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; float refPhi = referencePrefixAngles[i]; if (fabsf(refPhi) > 
transitionPrefixSignDeadband) { float prod = phi_acc * refPhi; if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; } } ++idx; ++i; } } return true; } __declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only( const float* __restrict q, unsigned base_index, unsigned& out_index, float& out_value) const noexcept { if (!use_joint_angle_limit_constraints()) return true; int i = 0; while (i < n) { float viol = fabsf(q[i]) - minTheta; if (viol > 0.0f) { out_index = base_index + static_cast<unsigned>(i); out_value = viol; return false; } ++i; } return true; } __declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays( const float* __restrict px, const float* __restrict py, float x, float y, unsigned base_index, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { float geom_clearance = FLT_MAX; unsigned idx = base_index; if (use_obstacle_constraints()) { unsigned j = 0u; while (j < obstacleCount) { float viol = polyline_square_violation(px, py, n, obstacles[j], obstacleClearance); float clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; ++j; } } if (use_nonlinear_constraints()) { if (transitionCheckInnerCircleAgainstLinks) { float viol = polyline_circle_violation( px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f); float clearance = -viol; if (clearance < geom_clearance) geom_clearance = clearance; if (viol > 0.0f) { out_index = idx; out_value = viol; if (out_geom_clearance) *out_geom_clearance = clearance; return false; } ++idx; } float c0 = nonlinear_constraint_0(x, y); float c0_clearance = outer_circle_clearance(x, y); if (c0_clearance < geom_clearance) geom_clearance = c0_clearance; if (c0 > 0.0f) { out_index = idx; out_value = c0; if (out_geom_clearance) *out_geom_clearance = c0_clearance; return false; } ++idx; float c1 = 
nonlinear_constraint_1(x, y); float c1_clearance = inner_circle_clearance(x, y); if (c1_clearance < geom_clearance) geom_clearance = c1_clearance; if (c1 > 0.0f) { out_index = idx; out_value = c1; if (out_geom_clearance) *out_geom_clearance = c1_clearance; return false; } ++idx; } if (out_geom_clearance) *out_geom_clearance = geom_clearance; return true; } __declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value, float* out_geom_clearance = nullptr) const noexcept { if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) { if (out_geom_clearance) *out_geom_clearance = -FLT_MAX; return false; } alignas(16) float px[33]; alignas(16) float py[33]; compute_pose(q, out_x, out_y, px, py); if (!evaluate_pose_constraints_from_arrays(px, py, out_x, out_y, pose_constraint_base_index(), out_index, out_value, out_geom_clearance)) return false; return true; } __declspec(noalias) __forceinline void copy_state_full_dim( const float* __restrict src, float* __restrict dst) const noexcept { int dim = full_state_dim(); int i = 0; while (i < dim) { dst[i] = src[i]; ++i; } while (i < 64) { dst[i] = 0.0f; ++i; } } __declspec(noalias) __forceinline void lerp_state_full_dim( const float* __restrict qa, const float* __restrict qb, float t, float* __restrict out_q) const noexcept { int dim = full_state_dim(); int i = 0; while (i < dim) { out_q[i] = fmaf(t, qb[i] - qa[i], qa[i]); ++i; } while (i < 64) { out_q[i] = 0.0f; ++i; } } __declspec(noalias) __forceinline float transition_segment_motion_bound( const float* __restrict qa, const float* __restrict qb) const noexcept { float tail_reach = 0.0f; float bound = 0.0f; int i = n - 1; while (i >= 0) { tail_reach += fmaxf(link_length(qa, i), link_length(qb, i)); bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound); --i; } if (variableLen) { i = 0; while (i < n) { bound = 
fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound); ++i; } } return bound; } __forceinline bool transition_sweep_requires_validation() const noexcept { return (solveMode == ManipSolveMode::Transition) && has_transition_reference_angles() && (use_obstacle_constraints() || use_nonlinear_constraints()); } __declspec(noalias) __forceinline bool build_transition_geom_sample( const float* __restrict q, TransitionGeomSample& out_sample, unsigned& out_index, float& out_value) const noexcept { copy_state_full_dim(q, out_sample.q); return evaluate_state_without_transition_continuity( out_sample.q, out_sample.x, out_sample.y, out_index, out_value, &out_sample.clearance); } __declspec(noalias) __forceinline bool validate_transition_segment_recursive( const TransitionGeomSample& a, const TransitionGeomSample& b, int depth, unsigned& out_index, float& out_value) const noexcept { constexpr float SWEEP_CLEARANCE_EPS = 1e-4f; constexpr float SWEEP_MIN_BOUND = 5e-4f; float motion_bound = transition_segment_motion_bound(a.q, b.q); if (motion_bound <= SWEEP_CLEARANCE_EPS) return true; float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > required_clearance && b.clearance > required_clearance) return true; TransitionGeomSample mid{}; lerp_state_full_dim(a.q, b.q, 0.5f, mid.q); if (!evaluate_state_without_transition_continuity( mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance)) return false; float half_required_clearance = 0.5f * motion_bound + SWEEP_CLEARANCE_EPS; if (a.clearance > half_required_clearance && mid.clearance > half_required_clearance && b.clearance > half_required_clearance) return true; if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) { TransitionGeomSample q25{}, q75{}; lerp_state_full_dim(a.q, b.q, 0.25f, q25.q); if (!evaluate_state_without_transition_continuity( q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance)) return false; lerp_state_full_dim(a.q, b.q, 0.75f, q75.q); if (!evaluate_state_without_transition_continuity( 
q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance)) return false; return true; } return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) && validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_transition_swept_motion( const float* __restrict q, float final_x, float final_y, float final_geom_clearance, unsigned& out_index, float& out_value) const noexcept { if (!transition_sweep_requires_validation()) return true; TransitionGeomSample start{}; if (!build_transition_geom_sample(referenceState, start, out_index, out_value)) return false; TransitionGeomSample finish{}; copy_state_full_dim(q, finish.q); finish.x = final_x; finish.y = final_y; finish.clearance = final_geom_clearance; constexpr int SWEEP_MAX_DEPTH = 10; return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value); } __declspec(noalias) __forceinline bool evaluate_indexed( const float* __restrict q, float& out_x, float& out_y, unsigned& out_index, float& out_value) const noexcept { if (!evaluate_transition_continuity_only(q, out_index, out_value)) return false; float final_geom_clearance = FLT_MAX; if (!evaluate_state_without_transition_continuity(q, out_x, out_y, out_index, out_value, &final_geom_clearance)) return false; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { if (!transition_sweep_requires_validation()) { if (!evaluate_joint_angle_limits_only( referenceState, transition_continuity_constraint_count(), out_index, out_value)) return false; } else if (!evaluate_transition_swept_motion(q, out_x, out_y, final_geom_clearance, out_index, out_value)) { return false; } } out_index = feasible_index(); if (solveMode == ManipSolveMode::Transition) { out_value = compute_transition_objective_from_pose(q, out_x, out_y); return true; } out_value = compute_positioning_objective_from_pose(q, out_x, out_y); return true; } 
__declspec(noalias) __forceinline float operator()( const float* __restrict q, float& out_x, float& out_y) const noexcept { compute_pose(q, out_x, out_y, nullptr, nullptr); if (solveMode == ManipSolveMode::Transition) return compute_transition_objective_from_pose(q, out_x, out_y); return compute_positioning_objective_from_pose(q, out_x, out_y); }
};" - после замены на новую версию получаю ошибку с кодом 0xcfffffff (иногда с кодом 0xc0000005), старая версия работала стабильно, но мне нужно ускорить код насколько это возможно, найди, из-за чего именно скорее всего возникает ошибка, и пришли полный код исправленной новой версии - старайся сделать новую версию ещё быстрее - ещё более активно используй AVX2 везде, где это может дать выигрыш в скорости - скорость должна быть максимальной - но при этом исправь ошибки, чтобы новая версия работала не только быстро, но ещё и без ошибок, и верни полный исправленный код структуры без пропусков целиком, главное, чтобы он работал максимально быстро, насколько это вообще возможно, поощряется использование низкоуровневых трюков и битовых трюков, выравнивания, низкоуровневых специальных операций, fmaf и других аппаратных способов ускорения, минимизируй число функций, чтобы уменьшить накладные расходы на вызовы, код может быть громоздким - главное, чтобы он был максимально быстрым, используй интринсики, различные ключевые слова для оптимизации, выровненный доступ к данным и т. д. - всё, что может помочь ускорить даже гипотетически, jump_table, операторы ?: прямо в формулах, инлайн функции через __forceinline, ручное развёртывание малых циклов, векторизацию, AVX2, эффективные операции с памятью и минимизацию обращений к памяти, минимизация аллокаций/реаллокаций, кеширование, предвычисление, всё, что может помочь или хотя бы увеличит шансы на ускорение
Наиболее вероятная причина падения — небезопасные aligned-load/store в новой версии.
Что именно ломает код:
_mm256_load_ps(src + i) / _mm_load_ps(src + i) в copy_state_full_dim()
В старой версии там был обычный скалярный copy.
В новой версии src очень часто указывает на внешний q, а он не обязан быть выровнен на 32 байта.
Для _mm256_load_ps это UB и очень типично даёт 0xc0000005.
Переход с alignas(16) на alignas(32) у полей внутри ManipCost
Это меняет требование к выравниванию всего объекта. Если ManipCost где-то создаётся не штатным C++17 operator new с поддержкой повышенного выравнивания (aligned new), а через собственный аллокатор, пул, malloc или placement new в чужом буфере, объект может оказаться не выровненным по 32 байтам, и тогда любое выровненное обращение к его полям — это UB.
Старая версия с alignas(16) могла жить, новая — уже нет.
Вторичный скрытый дефект: в исходном коде предельные размеры буферов были не до конца согласованы с n.
Я это тоже исправил: теперь логика жёстко удерживает full_state_dim <= 64, а число углов не выходит за пределы буферов.
Ниже — полная исправленная структура, где:
compute_pose() для режима без px/py,copy_state_full_dim, lerp_state_full_dim, TransitionLengthEnergy, evaluate_joint_angle_limits_only,operator(),cppstruct ManipCost final { static constexpr int MAX_STATE_DIM = 64; static constexpr int MAX_ANGLE_DIM = 64; static constexpr int MAX_VARLEN_N = 32; static constexpr int MAX_FIXED_N = 64; int n; bool variableLen; float fixedLength, stretchFactor, targetX, targetY, minTheta, archBiasW, archBiasK, sharpW; unsigned obstacleCount, constraintMode; SquareObstacle obstacles[MAX_OBSTACLES]; float obstacleClearance; ManipSolveMode solveMode = ManipSolveMode::Positioning; // Держим выравнивание членов как в старой стабильной версии, чтобы не ломать внешний allocator. alignas(16) float referenceState[MAX_STATE_DIM]{}; alignas(16) float referencePrefixAngles[MAX_ANGLE_DIM]{}; int referenceStateCount = 0; int referenceAngleCount = 0; int referenceLengthCount = 0; float transitionCaptureWeight = 10000.0f; float transitionEnergyWeight = 1.0f; float transitionLengthEnergyWeight = 0.35f; float transitionPrefixEnergyWeight = 0.175f; float transitionMaxAngleStep = 0.5f; float transitionMaxLengthStepFactor = 0.15f; float transitionJointSignDeadband = 0.1f; float transitionPrefixSignDeadband = 0.2f; float transitionMinJointAbsAngleRatio = 0.1f; float transitionMinJointAbsAngleFloor = 0.0175f; bool transitionPreserveJointMagnitudeFloor = true; bool transitionPreserveJointSigns = true; bool transitionPreservePrefixSigns = true; bool transitionCheckInnerCircleAgainstLinks = true; struct alignas(32) TransitionGeomSample final { alignas(32) float q[MAX_STATE_DIM]{}; float x = 0.0f; float y = 0.0f; float clearance = FLT_MAX; }; private: static __forceinline float hsum256_ps(__m256 v) noexcept { __m128 lo = _mm256_castps256_ps128(v); __m128 hi = _mm256_extractf128_ps(v, 1); __m128 sum = _mm_add_ps(lo, hi); sum = _mm_hadd_ps(sum, sum); sum = _mm_hadd_ps(sum, sum); return _mm_cvtss_f32(sum); } template<int I, int N> static __forceinline void 
agp_pose_prefix( const float* __restrict th, float& acc, float* __restrict phi) noexcept { if constexpr (I < N) { acc = fmaf(th[I], 1.0f, acc); phi[I] = acc; agp_pose_prefix<I + 1, N>(th, acc, phi); } } template<int I, int N, bool STORE> __forceinline void agp_pose_accum_fixed( const float* __restrict s_arr, const float* __restrict c_arr, float& x, float& y, float* __restrict px, float* __restrict py) const noexcept { if constexpr (I < N) { const float Li = fixedLength; x = fmaf(Li, c_arr[I], x); y = fmaf(Li, s_arr[I], y); if constexpr (STORE) { px[I + 1] = x; py[I + 1] = y; } agp_pose_accum_fixed<I + 1, N, STORE>(s_arr, c_arr, x, y, px, py); } } template<int I, int N, bool STORE> __forceinline void agp_pose_accum_var( const float* __restrict q, const float* __restrict s_arr, const float* __restrict c_arr, float& x, float& y, float* __restrict px, float* __restrict py) const noexcept { if constexpr (I < N) { const float Li = q[n + I]; x = fmaf(Li, c_arr[I], x); y = fmaf(Li, s_arr[I], y); if constexpr (STORE) { px[I + 1] = x; py[I + 1] = y; } agp_pose_accum_var<I + 1, N, STORE>(q, s_arr, c_arr, x, y, px, py); } } template<int N, bool VARLEN, bool STORE> __forceinline void agp_compute_pose_fixedn( const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept { alignas(32) float phi[16]; alignas(32) float s_arr[16]; alignas(32) float c_arr[16]; float phi_acc = 0.0f; if constexpr (STORE) { px[0] = 0.0f; py[0] = 0.0f; } agp_pose_prefix<0, N>(q, phi_acc, phi); FABE13_SINCOS(phi, s_arr, c_arr, N); float x = 0.0f, y = 0.0f; if constexpr (VARLEN) agp_pose_accum_var<0, N, STORE>(q, s_arr, c_arr, x, y, px, py); else agp_pose_accum_fixed<0, N, STORE>(s_arr, c_arr, x, y, px, py); out_x = x; out_y = y; } public: __declspec(noalias) __forceinline void compute_pose( const float* __restrict q, float& out_x, float& out_y, float* __restrict px, float* __restrict py) const noexcept { const bool store = (px != nullptr) && (py != 
nullptr); if (n <= 0) { if (store) { px[0] = 0.0f; py[0] = 0.0f; } out_x = 0.0f; out_y = 0.0f; return; } // Очень быстрый switch-path для малых n. if (static_cast<unsigned>(n) <= 16u) { if (variableLen) { if (store) { switch (n) { case 1: agp_compute_pose_fixedn< 1, true, true>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, true, true>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, true, true>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, true, true>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, true, true>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, true, true>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, true, true>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, true, true>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, true, true>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, true, true>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, true, true>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, true, true>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, true, true>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, true, true>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, true, true>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, true, true>(q, out_x, out_y, px, py); return; } } else { switch (n) { case 1: agp_compute_pose_fixedn< 1, true, false>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, true, false>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, true, false>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, true, false>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, true, false>(q, out_x, out_y, px, py); return; case 6: 
agp_compute_pose_fixedn< 6, true, false>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, true, false>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, true, false>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, true, false>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, true, false>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, true, false>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, true, false>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, true, false>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, true, false>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, true, false>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, true, false>(q, out_x, out_y, px, py); return; } } } else { if (store) { switch (n) { case 1: agp_compute_pose_fixedn< 1, false, true>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, false, true>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, false, true>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, false, true>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, false, true>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, false, true>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, false, true>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, false, true>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, false, true>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, false, true>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, false, true>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, false, true>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, false, 
true>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, false, true>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, false, true>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, false, true>(q, out_x, out_y, px, py); return; } } else { switch (n) { case 1: agp_compute_pose_fixedn< 1, false, false>(q, out_x, out_y, px, py); return; case 2: agp_compute_pose_fixedn< 2, false, false>(q, out_x, out_y, px, py); return; case 3: agp_compute_pose_fixedn< 3, false, false>(q, out_x, out_y, px, py); return; case 4: agp_compute_pose_fixedn< 4, false, false>(q, out_x, out_y, px, py); return; case 5: agp_compute_pose_fixedn< 5, false, false>(q, out_x, out_y, px, py); return; case 6: agp_compute_pose_fixedn< 6, false, false>(q, out_x, out_y, px, py); return; case 7: agp_compute_pose_fixedn< 7, false, false>(q, out_x, out_y, px, py); return; case 8: agp_compute_pose_fixedn< 8, false, false>(q, out_x, out_y, px, py); return; case 9: agp_compute_pose_fixedn< 9, false, false>(q, out_x, out_y, px, py); return; case 10: agp_compute_pose_fixedn<10, false, false>(q, out_x, out_y, px, py); return; case 11: agp_compute_pose_fixedn<11, false, false>(q, out_x, out_y, px, py); return; case 12: agp_compute_pose_fixedn<12, false, false>(q, out_x, out_y, px, py); return; case 13: agp_compute_pose_fixedn<13, false, false>(q, out_x, out_y, px, py); return; case 14: agp_compute_pose_fixedn<14, false, false>(q, out_x, out_y, px, py); return; case 15: agp_compute_pose_fixedn<15, false, false>(q, out_x, out_y, px, py); return; default: agp_compute_pose_fixedn<16, false, false>(q, out_x, out_y, px, py); return; } } } } // Общий путь. 
alignas(32) float phi[MAX_ANGLE_DIM]; alignas(32) float s_arr[MAX_ANGLE_DIM]; alignas(32) float c_arr[MAX_ANGLE_DIM]; const float* __restrict th = q; float phi_acc = 0.0f; if (store) { px[0] = 0.0f; py[0] = 0.0f; } int i = 0; while (i + 4 <= n) { const float p0 = phi_acc + th[i + 0]; const float p1 = p0 + th[i + 1]; const float p2 = p1 + th[i + 2]; const float p3 = p2 + th[i + 3]; phi[i + 0] = p0; phi[i + 1] = p1; phi[i + 2] = p2; phi[i + 3] = p3; phi_acc = p3; i += 4; } while (i < n) { phi_acc += th[i]; phi[i] = phi_acc; ++i; } FABE13_SINCOS(phi, s_arr, c_arr, n); if (!store) { int j = 0; __m256 vx = _mm256_setzero_ps(); __m256 vy = _mm256_setzero_ps(); if (variableLen) { while (j + 8 <= n) { const __m256 vL = _mm256_loadu_ps(q + n + j); const __m256 vc = _mm256_load_ps(c_arr + j); const __m256 vs = _mm256_load_ps(s_arr + j); vx = _mm256_fmadd_ps(vL, vc, vx); vy = _mm256_fmadd_ps(vL, vs, vy); j += 8; } float x = hsum256_ps(vx); float y = hsum256_ps(vy); while (j < n) { const float Li = q[n + j]; x = fmaf(Li, c_arr[j], x); y = fmaf(Li, s_arr[j], y); ++j; } out_x = x; out_y = y; return; } else { const __m256 vL = _mm256_set1_ps(fixedLength); while (j + 8 <= n) { const __m256 vc = _mm256_load_ps(c_arr + j); const __m256 vs = _mm256_load_ps(s_arr + j); vx = _mm256_fmadd_ps(vL, vc, vx); vy = _mm256_fmadd_ps(vL, vs, vy); j += 8; } float x = hsum256_ps(vx); float y = hsum256_ps(vy); while (j < n) { x = fmaf(fixedLength, c_arr[j], x); y = fmaf(fixedLength, s_arr[j], y); ++j; } out_x = x; out_y = y; return; } } // Нужны px/py -> префиксные точки, тут scalar/unrolled быстрее и проще. 
// Tail of the pose routine whose beginning lies on the preceding lines: this
// is the branch that also stores the running end point of every link in px/py.
float x = 0.0f, y = 0.0f;
i = 0;
if (variableLen) {
    // Variable-length links: the length of link i is read from q[n + i].
    // Four links per iteration, then a scalar remainder loop.
    while (i + 4 <= n) {
        { const float L0 = q[n + i + 0]; x = fmaf(L0, c_arr[i + 0], x); y = fmaf(L0, s_arr[i + 0], y); px[i + 1] = x; py[i + 1] = y; }
        { const float L1 = q[n + i + 1]; x = fmaf(L1, c_arr[i + 1], x); y = fmaf(L1, s_arr[i + 1], y); px[i + 2] = x; py[i + 2] = y; }
        { const float L2 = q[n + i + 2]; x = fmaf(L2, c_arr[i + 2], x); y = fmaf(L2, s_arr[i + 2], y); px[i + 3] = x; py[i + 3] = y; }
        { const float L3 = q[n + i + 3]; x = fmaf(L3, c_arr[i + 3], x); y = fmaf(L3, s_arr[i + 3], y); px[i + 4] = x; py[i + 4] = y; }
        i += 4;
    }
    while (i < n) {
        const float Li = q[n + i];
        x = fmaf(Li, c_arr[i], x);
        y = fmaf(Li, s_arr[i], y);
        px[i + 1] = x;
        py[i + 1] = y;
        ++i;
    }
} else {
    // Fixed-length links: every link uses fixedLength.
    const float L = fixedLength;
    while (i + 4 <= n) {
        x = fmaf(L, c_arr[i + 0], x); y = fmaf(L, s_arr[i + 0], y); px[i + 1] = x; py[i + 1] = y;
        x = fmaf(L, c_arr[i + 1], x); y = fmaf(L, s_arr[i + 1], y); px[i + 2] = x; py[i + 2] = y;
        x = fmaf(L, c_arr[i + 2], x); y = fmaf(L, s_arr[i + 2], y); px[i + 3] = x; py[i + 3] = y;
        x = fmaf(L, c_arr[i + 3], x); y = fmaf(L, s_arr[i + 3], y); px[i + 4] = x; py[i + 4] = y;
        i += 4;
    }
    while (i < n) {
        x = fmaf(L, c_arr[i], x);
        y = fmaf(L, s_arr[i], y);
        px[i + 1] = x;
        py[i + 1] = y;
        ++i;
    }
}
out_x = x;
out_y = y;
}

// Copies the first full_state_dim() floats of src into dst and writes 0.0f
// into dst[full_state_dim() .. MAX_STATE_DIM). Only unaligned loads/stores
// are used. dst must therefore hold at least MAX_STATE_DIM floats.
__declspec(noalias) __forceinline void copy_state_full_dim(
    const float* __restrict src,
    float* __restrict dst) const noexcept {
    const int dim = full_state_dim();
    int i = 0;
    // 8-wide, then 4-wide, then scalar copy of the live part of the state.
    while (i + 8 <= dim) {
        const __m256 v = _mm256_loadu_ps(src + i);
        _mm256_storeu_ps(dst + i, v);
        i += 8;
    }
    while (i + 4 <= dim) {
        const __m128 v = _mm_loadu_ps(src + i);
        _mm_storeu_ps(dst + i, v);
        i += 4;
    }
    while (i < dim) {
        dst[i] = src[i];
        ++i;
    }
    // Zero the unused remainder of the destination buffer.
    const __m256 vz = _mm256_setzero_ps();
    while (i + 8 <= MAX_STATE_DIM) {
        _mm256_storeu_ps(dst + i, vz);
        i += 8;
    }
    while (i < MAX_STATE_DIM) {
        dst[i] = 0.0f;
        ++i;
    }
}

// Writes out_q[i] = qa[i] + t * (qb[i] - qa[i]) for i < full_state_dim() and
// 0.0f for the remaining entries up to MAX_STATE_DIM. Angles are interpolated
// as plain numbers; no wrapping is applied here.
__declspec(noalias) __forceinline void lerp_state_full_dim(
    const float* __restrict qa,
    const float* __restrict qb,
    float t,
    float* __restrict out_q) const noexcept {
    const int dim = full_state_dim();
    const __m256 vt = _mm256_set1_ps(t);
    int i = 0;
    while (i + 8 <= dim) {
        const __m256 va = _mm256_loadu_ps(qa + i);
        const __m256 vb = _mm256_loadu_ps(qb + i);
        const __m256 vd = _mm256_sub_ps(vb, va);
        _mm256_storeu_ps(out_q + i, _mm256_fmadd_ps(vt, vd, va));
        i += 8;
    }
    while (i < dim) {
        out_q[i] = fmaf(t, qb[i] - qa[i], qa[i]);
        ++i;
    }
    // Zero the unused remainder of the output buffer.
    const __m256 vz = _mm256_setzero_ps();
    while (i + 8 <= MAX_STATE_DIM) {
        _mm256_storeu_ps(out_q + i, vz);
        i += 8;
    }
    while (i < MAX_STATE_DIM) {
        out_q[i] = 0.0f;
        ++i;
    }
}

// Reduces an angle by a whole number of turns: subtracts
// floor((a + PI) / TWO_PI) * TWO_PI, then applies one corrective +/- TWO_PI
// step if rounding left the value outside [-PI, PI].
static __declspec(noalias) __forceinline float wrap_pi(float a) noexcept {
    constexpr float PI = 3.1415926535897932384626433832795f;
    constexpr float TWO_PI = 6.2831853071795864769252867665590f;
    float k = floorf((a + PI) / TWO_PI);
    a -= k * TWO_PI;
    if (a > PI) a -= TWO_PI;
    else if (a < -PI) a += TWO_PI;
    return a;
}

// Squared distance from point (px, py) to the segment (ax, ay)-(bx, by).
// A segment whose squared length is <= 1e-20 is treated as the single point a.
static __declspec(noalias) __forceinline float point_segment_distance_sq(
    float px, float py,
    float ax, float ay,
    float bx, float by) noexcept {
    const float abx = bx - ax;
    const float aby = by - ay;
    const float apx = px - ax;
    const float apy = py - ay;
    const float ab2 = fmaf(abx, abx, aby * aby);
    if (ab2 <= 1e-20f) return fmaf(apx, apx, apy * apy);
    // Projection parameter of the point onto the segment, clamped to [0, 1].
    float t = (apx * abx + apy * aby) / ab2;
    if (t < 0.0f) t = 0.0f;
    else if (t > 1.0f) t = 1.0f;
    const float qx = fmaf(t, abx, ax);
    const float qy = fmaf(t, aby, ay);
    const float dx = px - qx;
    const float dy = py - qy;
    return fmaf(dx, dx, dy * dy);
}

// Returns (radius + clearance) minus the smallest distance found between the
// centre (cx, cy) and the segments (px[i], py[i])-(px[i + 1], py[i + 1]),
// i in [0, segments). A positive result means the polyline comes closer to the
// centre than radius + clearance.
// The scan stops at the first segment that brings the minimum within
// radius + clearance, so in the violating case the value reflects the segments
// scanned so far, not necessarily the global minimum.
// With segments <= 0 the minimum stays FLT_MAX and the result is negative.
static __declspec(noalias) __forceinline float polyline_circle_violation(
    const float* __restrict px,
    const float* __restrict py,
    int segments,
    float cx, float cy,
    float radius,
    float clearance) noexcept {
    const float rr = radius + clearance;
    float best_d2 = FLT_MAX;
    int i = 0;
    while (i < segments) {
        const float d2 = point_segment_distance_sq(cx, cy, px[i], py[i], px[i + 1], py[i + 1]);
        if (d2 < best_d2) best_d2 = d2;
        if (best_d2 <= rr * rr) break;
        ++i;
    }
    return rr - sqrtf(best_d2);
}

// Specifier of the next member function; its declaration continues on the following line.
__forceinline
bool has_transition_reference_angles() const noexcept { return referenceAngleCount > 0; } __forceinline bool has_transition_reference_lengths() const noexcept { return variableLen && (referenceLengthCount > 0); } __forceinline bool use_joint_angle_limit_constraints() const noexcept { return minTheta > 0.0f; } __forceinline int full_state_dim() const noexcept { return n + (variableLen ? n : 0); } __forceinline float reference_angle(int i) const noexcept { return (i < referenceAngleCount) ? referenceState[i] : 0.0f; } __forceinline float reference_length(int i) const noexcept { return (has_transition_reference_lengths() && i < referenceLengthCount) ? referenceState[n + i] : fixedLength; } __forceinline float wrapped_angle_delta(const float* __restrict q, int i) const noexcept { return wrap_pi(q[i] - reference_angle(i)); } __forceinline float max_length_step(int i) const noexcept { const float refL = reference_length(i); const float absStep = transitionMaxLengthStepFactor * fmaxf(refL, 0.5f); return fmaxf(absStep, 0.05f); } __forceinline float min_joint_abs_angle_bound(int i) const noexcept { const float refAbs = fabsf(reference_angle(i)); const float scaled = transitionMinJointAbsAngleRatio * refAbs; return fmaxf(transitionMinJointAbsAngleFloor, scaled); } __forceinline unsigned transition_continuity_constraint_count() const noexcept { unsigned total = 0u; if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) { total += static_cast<unsigned>(referenceAngleCount); if (has_transition_reference_lengths()) total += static_cast<unsigned>(referenceLengthCount); if (transitionPreserveJointMagnitudeFloor) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreserveJointSigns) total += static_cast<unsigned>(referenceAngleCount); if (transitionPreservePrefixSigns) total += static_cast<unsigned>(referenceAngleCount); } return total; } __forceinline unsigned joint_angle_limit_constraint_count() const noexcept { return 
use_joint_angle_limit_constraints() ? static_cast<unsigned>(n) : 0u; } __forceinline unsigned pose_constraint_base_index() const noexcept { return transition_continuity_constraint_count() + joint_angle_limit_constraint_count(); } static __declspec(noalias) __forceinline float outer_circle_clearance(float x, float y) noexcept { const float dx = x - 5.0f; return 5.0f - sqrtf(fmaf(dx, dx, y * y)); } static __declspec(noalias) __forceinline float inner_circle_clearance(float x, float y) noexcept { const float dx = x - 8.0f; const float dy = y + 3.0f; return sqrtf(fmaf(dx, dx, dy * dy)) - sqrtf(7.7f); } __declspec(noalias) __forceinline void SetTransitionReference( const float* src, int count) noexcept { const int dim = full_state_dim(); referenceStateCount = (count < dim) ? count : dim; referenceAngleCount = (referenceStateCount < n) ? referenceStateCount : n; referenceLengthCount = variableLen ? ((referenceStateCount > n) ? (referenceStateCount - n) : 0) : 0; int i = 0; while (i < dim) { if (src != nullptr && i < referenceStateCount) referenceState[i] = src[i]; else if (i < n) referenceState[i] = 0.0f; else referenceState[i] = fixedLength; ++i; } while (i < MAX_STATE_DIM) { referenceState[i] = 0.0f; ++i; } float phi_acc = 0.0f; i = 0; while (i < referenceAngleCount) { phi_acc += referenceState[i]; referencePrefixAngles[i] = phi_acc; ++i; } while (i < MAX_ANGLE_DIM) { referencePrefixAngles[i] = 0.0f; ++i; } } __forceinline float TransitionAngleEnergy(const float* __restrict q) const noexcept { float acc = 0.0f; int i = 0; while (i < referenceAngleCount) { const float d = wrapped_angle_delta(q, i); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionLengthEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_lengths()) return 0.0f; int i = 0; __m256 vacc = _mm256_setzero_ps(); const float* __restrict qL = q + n; const float* __restrict rL = referenceState + n; while (i + 8 <= referenceLengthCount) { const __m256 a = 
_mm256_loadu_ps(qL + i); const __m256 b = _mm256_loadu_ps(rL + i); const __m256 d = _mm256_sub_ps(a, b); vacc = _mm256_fmadd_ps(d, d, vacc); i += 8; } float acc = hsum256_ps(vacc); while (i < referenceLengthCount) { const float d = qL[i] - rL[i]; acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionPrefixEnergy(const float* __restrict q) const noexcept { if (!has_transition_reference_angles()) return 0.0f; float acc = 0.0f; float phi_acc = 0.0f; int i = 0; while (i < referenceAngleCount) { phi_acc += q[i]; const float d = wrap_pi(phi_acc - referencePrefixAngles[i]); acc = fmaf(d, d, acc); ++i; } return acc; } __forceinline float TransitionEnergy(const float* __restrict q) const noexcept { float acc = transitionEnergyWeight * TransitionAngleEnergy(q); acc = fmaf(transitionLengthEnergyWeight, TransitionLengthEnergy(q), acc); acc = fmaf(transitionPrefixEnergyWeight, TransitionPrefixEnergy(q), acc); return acc; } __declspec(noalias) __forceinline ManipCost( int _n, bool _variableLen, float _targetX, float _targetY, float _minTheta, float _fixedLength, float _stretchFactor, float* obstacleData, unsigned _obstacleCount) noexcept : n((_n <= 0) ? 0 : (_variableLen ? ((_n > MAX_VARLEN_N) ? MAX_VARLEN_N : _n) : ((_n > MAX_FIXED_N) ? MAX_FIXED_N : _n))), variableLen(_variableLen), fixedLength(_fixedLength), stretchFactor(_stretchFactor), targetX(_targetX), targetY(_targetY), minTheta(_minTheta), archBiasW(0.02f), archBiasK(3.0f), sharpW(0.05f), obstacleCount((_obstacleCount > MAX_OBSTACLES) ? 
MAX_OBSTACLES : _obstacleCount), constraintMode(0u), obstacleClearance(OBSTACLE_CLEARANCE) { unsigned i = 0u; while (i < MAX_OBSTACLES) { obstacles[i].cx = 0.0f; obstacles[i].cy = 0.0f; obstacles[i].half = 0.0f; obstacles[i].pad = 0.0f; ++i; } if (obstacleData) { i = 0u; while (i < obstacleCount) { obstacles[i].cx = obstacleData[3u * i + 0u]; obstacles[i].cy = obstacleData[3u * i + 1u]; obstacles[i].half = obstacleData[3u * i + 2u]; ++i; } } SetTransitionReference(nullptr, 0); } __declspec(noalias) __forceinline float link_length(const float* __restrict q, int i) const noexcept { return variableLen ? q[n + i] : fixedLength; } __declspec(noalias) __forceinline float compute_positioning_objective_from_pose( const float* __restrict q, float x, float y) const noexcept { const float* __restrict th = q; float archPen = 0.0f; int i = 0; while (i < n) { const float theta = th[i]; const float t = fmaf(-theta, archBiasK, 0.0f); float sp; if (t > 10.0f) { sp = t; } else { const float exp_val = fmaf(t, fmaf(t, fmaf(t, fmaf(t, 0.00833333377f, 0.0416666679f), 0.16666667f), 0.5f), 1.0f); sp = log1pf(exp_val); } archPen = fmaf(archBiasW, sp, archPen); ++i; } const float dx = x - targetX; const float dy = y - targetY; const float dist = sqrtf(fmaf(dx, dx, dy * dy)); return dist + archPen; } __declspec(noalias) __forceinline float compute_transition_objective_from_pose( const float* __restrict q, float x, float y) const noexcept { const float dx = x - targetX; const float dy = y - targetY; const float dist2 = fmaf(dx, dx, dy * dy); return fmaf(transitionCaptureWeight, dist2, TransitionEnergy(q)); } __declspec(noalias) __forceinline bool use_obstacle_constraints() const noexcept { return ((constraintMode & MANIP_CONSTRAINT_OBSTACLES) != 0u) && (obstacleCount > 0u); } __declspec(noalias) __forceinline bool use_nonlinear_constraints() const noexcept { return (constraintMode & MANIP_CONSTRAINT_NONLINEAR) != 0u; } __declspec(noalias) __forceinline unsigned total_constraints() const 
// Body of total_constraints(), whose declaration head is on the preceding line.
// Total number of constraint slots: continuity + joint limits + one per
// obstacle (when obstacle constraints are active) + 2 or 3 nonlinear slots
// (3 when the inner circle is also checked against the links).
noexcept {
    unsigned total = transition_continuity_constraint_count();
    total += joint_angle_limit_constraint_count();
    if (use_obstacle_constraints()) total += obstacleCount;
    if (use_nonlinear_constraints()) total += transitionCheckInnerCircleAgainstLinks ? 3u : 2u;
    return total;
}

// Index reported when every constraint is satisfied; equals total_constraints().
__declspec(noalias) __forceinline unsigned feasible_index() const noexcept {
    return total_constraints();
}

// (x - 5)^2 + y^2 - 25: positive when (x, y) lies outside the radius-5 circle around (5, 0).
static __declspec(noalias) __forceinline float nonlinear_constraint_0(float x, float y) noexcept {
    const float dx = x - 5.0f;
    return dx * dx + y * y - 25.0f;
}

// 7.7 - ((x - 8)^2 + (y + 3)^2): positive when (x, y) lies inside the circle of
// squared radius 7.7 around (8, -3).
static __declspec(noalias) __forceinline float nonlinear_constraint_1(float x, float y) noexcept {
    const float dx = x - 8.0f;
    const float dy = y + 3.0f;
    return 7.7f - (dx * dx + dy * dy);
}

// Checks the transition-continuity constraints in slot order and stops at the
// first violated one, writing its slot index and positive violation amount to
// out_index / out_value and returning false. Returns true (outputs untouched)
// when all hold, or when the solve mode is not Transition / no reference
// angles exist. The slot order must match transition_continuity_constraint_count():
// angle steps, length steps, magnitude floors, joint signs, prefix signs.
__declspec(noalias) __forceinline bool evaluate_transition_continuity_only(
    const float* __restrict q,
    unsigned& out_index,
    float& out_value) const noexcept {
    unsigned idx = 0u;
    if (solveMode != ManipSolveMode::Transition || !has_transition_reference_angles()) return true;

    // 1) |wrapped angle change| <= transitionMaxAngleStep for every reference angle.
    int i = 0;
    while (i < referenceAngleCount) {
        const float d = wrapped_angle_delta(q, i);
        const float viol = fabsf(d) - transitionMaxAngleStep;
        if (viol > 0.0f) { out_index = idx; out_value = viol; return false; }
        ++idx;
        ++i;
    }

    // 2) |length change| <= max_length_step(i) for every reference length.
    if (has_transition_reference_lengths()) {
        i = 0;
        while (i < referenceLengthCount) {
            const float dL = fabsf(q[n + i] - referenceState[n + i]) - max_length_step(i);
            if (dL > 0.0f) { out_index = idx; out_value = dL; return false; }
            ++idx;
            ++i;
        }
    }

    // 3) |q[i]| must not drop below min_joint_abs_angle_bound(i). Joints whose
    //    reference magnitude is within transitionJointSignDeadband are not
    //    tested but still consume a slot.
    if (transitionPreserveJointMagnitudeFloor) {
        i = 0;
        while (i < referenceAngleCount) {
            const float refAbs = fabsf(referenceState[i]);
            if (refAbs > transitionJointSignDeadband) {
                const float minAbs = min_joint_abs_angle_bound(i);
                const float viol = minAbs - fabsf(q[i]);
                if (viol > 0.0f) { out_index = idx; out_value = viol; return false; }
            }
            ++idx;
            ++i;
        }
    }

    // 4) q[i] must not have the opposite sign of its reference angle (same deadband rule).
    if (transitionPreserveJointSigns) {
        i = 0;
        while (i < referenceAngleCount) {
            const float ref = referenceState[i];
            if (fabsf(ref) > transitionJointSignDeadband)
            {
                const float prod = q[i] * ref;
                if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; }
            }
            ++idx;
            ++i;
        }
    }

    // 5) The running angle sum must not have the opposite sign of the
    //    reference running sum, unless the latter is within transitionPrefixSignDeadband.
    if (transitionPreservePrefixSigns) {
        float phi_acc = 0.0f;
        i = 0;
        while (i < referenceAngleCount) {
            phi_acc += q[i];
            const float refPhi = referencePrefixAngles[i];
            if (fabsf(refPhi) > transitionPrefixSignDeadband) {
                const float prod = phi_acc * refPhi;
                if (prod < 0.0f) { out_index = idx; out_value = -prod; return false; }
            }
            ++idx;
            ++i;
        }
    }
    return true;
}

// Checks |q[i]| <= minTheta for i in [0, n) and reports the first violation as
// slot base_index + i with value |q[i]| - minTheta. Returns true when the
// limits are inactive or all joints pass.
__declspec(noalias) __forceinline bool evaluate_joint_angle_limits_only(
    const float* __restrict q,
    unsigned base_index,
    unsigned& out_index,
    float& out_value) const noexcept {
    if (!use_joint_angle_limit_constraints()) return true;
    const __m256 vlimit = _mm256_set1_ps(minTheta);
    const __m256 vzero = _mm256_setzero_ps();
    // Clearing the sign bit yields |q|.
    const __m256 vabsmsk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
    int i = 0;
    while (i + 8 <= n) {
        const __m256 vq = _mm256_loadu_ps(q + i);
        const __m256 vabs = _mm256_and_ps(vq, vabsmsk);
        const __m256 vviol = _mm256_sub_ps(vabs, vlimit);
        const __m256 vcmp = _mm256_cmp_ps(vviol, vzero, _CMP_GT_OQ);
        const int mask = _mm256_movemask_ps(vcmp);
        if (mask) {
            // Lowest set bit = first violating lane of this group of eight.
            unsigned first = 0u;
            while (((mask >> first) & 1) == 0) ++first;
            alignas(32) float tmp[8];
            _mm256_store_ps(tmp, vviol);
            out_index = base_index + static_cast<unsigned>(i) + first;
            out_value = tmp[first];
            return false;
        }
        i += 8;
    }
    while (i < n) {
        const float viol = fabsf(q[i]) - minTheta;
        if (viol > 0.0f) {
            out_index = base_index + static_cast<unsigned>(i);
            out_value = viol;
            return false;
        }
        ++i;
    }
    return true;
}

// Checks the pose-dependent constraints, starting at slot base_index:
// one slot per obstacle (polyline_square_violation, defined elsewhere), then -
// when nonlinear constraints are active - the optional link-vs-inner-circle
// check followed by the two end-point circle constraints.
// On the first violation, writes its slot and value, stores that constraint's
// clearance in *out_geom_clearance (if non-null) and returns false. Otherwise
// stores the smallest clearance seen (FLT_MAX if nothing was checked) and
// returns true.
__declspec(noalias) __forceinline bool evaluate_pose_constraints_from_arrays(
    const float* __restrict px,
    const float* __restrict py,
    float x,
    float y,
    unsigned base_index,
    unsigned& out_index,
    float& out_value,
    float* out_geom_clearance = nullptr) const noexcept {
    float geom_clearance = FLT_MAX;
    unsigned idx = base_index;
    if (use_obstacle_constraints()) {
        unsigned j = 0u;
        while (j < obstacleCount) {
            const float viol = polyline_square_violation(px, py, n, obstacles[j], obstacleClearance);
            const float clearance = -viol;
            if (clearance < geom_clearance) geom_clearance = clearance;
            if (viol > 0.0f) {
                out_index = idx;
                out_value = viol;
                if (out_geom_clearance) *out_geom_clearance = clearance;
                return false;
            }
            ++idx;
            ++j;
        }
    }
    if (use_nonlinear_constraints()) {
        if (transitionCheckInnerCircleAgainstLinks) {
            // Same circle as nonlinear_constraint_1, tested against every link segment.
            const float viol = polyline_circle_violation(px, py, n, 8.0f, -3.0f, sqrtf(7.7f), 0.0f);
            const float clearance = -viol;
            if (clearance < geom_clearance) geom_clearance = clearance;
            if (viol > 0.0f) {
                out_index = idx;
                out_value = viol;
                if (out_geom_clearance) *out_geom_clearance = clearance;
                return false;
            }
            ++idx;
        }
        const float c0 = nonlinear_constraint_0(x, y);
        const float c0_clearance = outer_circle_clearance(x, y);
        if (c0_clearance < geom_clearance) geom_clearance = c0_clearance;
        if (c0 > 0.0f) {
            out_index = idx;
            out_value = c0;
            if (out_geom_clearance) *out_geom_clearance = c0_clearance;
            return false;
        }
        ++idx;
        const float c1 = nonlinear_constraint_1(x, y);
        const float c1_clearance = inner_circle_clearance(x, y);
        if (c1_clearance < geom_clearance) geom_clearance = c1_clearance;
        if (c1 > 0.0f) {
            out_index = idx;
            out_value = c1;
            if (out_geom_clearance) *out_geom_clearance = c1_clearance;
            return false;
        }
        ++idx;
    }
    if (out_geom_clearance) *out_geom_clearance = geom_clearance;
    return true;
}

// Evaluates joint-angle limits and then the pose constraints for state q,
// skipping the transition-continuity checks. out_x / out_y receive the end
// point from compute_pose. When the joint limits fail, the pose is not
// computed and *out_geom_clearance (if non-null) is set to -FLT_MAX.
// NOTE(review): px/py are sized MAX_FIXED_N + 1 while compute_pose writes
// indices up to n, and n may be as large as MAX_VARLEN_N in variable-length
// mode - confirm MAX_VARLEN_N <= MAX_FIXED_N.
__declspec(noalias) __forceinline bool evaluate_state_without_transition_continuity(
    const float* __restrict q,
    float& out_x,
    float& out_y,
    unsigned& out_index,
    float& out_value,
    float* out_geom_clearance = nullptr) const noexcept {
    if (!evaluate_joint_angle_limits_only(q, transition_continuity_constraint_count(), out_index, out_value)) {
        if (out_geom_clearance) *out_geom_clearance = -FLT_MAX;
        return false;
    }
    alignas(32) float px[MAX_FIXED_N + 1];
    alignas(32) float py[MAX_FIXED_N + 1];
    compute_pose(q, out_x, out_y, px, py);
    if (!evaluate_pose_constraints_from_arrays(
        px, py, out_x, out_y,
        pose_constraint_base_index(),
        out_index, out_value, out_geom_clearance))
        return false;
    return true;
}

// Walks the joints from last to first, accumulating tail_reach (sum over the
// links from i to the end of the larger of the two link lengths) and adding
// tail_reach * |qb[i] - qa[i]| to the result; in variable-length mode also
// adds |length difference| per link. Angle differences are not wrapped.
// Presumably an upper bound on how far any point of the chain moves between
// qa and qb - the name suggests this; confirm.
__declspec(noalias) __forceinline float transition_segment_motion_bound(
    const float* __restrict qa,
    const float* __restrict qb) const noexcept {
    float tail_reach = 0.0f;
    float bound = 0.0f;
    int i = n - 1;
    while (i >= 0) {
        tail_reach += fmaxf(link_length(qa, i), link_length(qb, i));
        bound = fmaf(tail_reach, fabsf(qb[i] - qa[i]), bound);
        --i;
    }
    if (variableLen) {
        i = 0;
        while (i < n) {
            bound = fmaf(1.0f, fabsf(qb[n + i] - qa[n + i]), bound);
            ++i;
        }
    }
    return bound;
}

// The sweep check is needed only in Transition mode, with reference angles
// present, and with obstacle or nonlinear constraints active.
__forceinline bool transition_sweep_requires_validation() const noexcept {
    return (solveMode == ManipSolveMode::Transition) &&
        has_transition_reference_angles() &&
        (use_obstacle_constraints() || use_nonlinear_constraints());
}

// Copies q into out_sample.q (zero-padded) and evaluates it, filling
// out_sample.x / y / clearance. Returns the evaluation result.
__declspec(noalias) __forceinline bool build_transition_geom_sample(
    const float* __restrict q,
    TransitionGeomSample& out_sample,
    unsigned& out_index,
    float& out_value) const noexcept {
    copy_state_full_dim(q, out_sample.q);
    return evaluate_state_without_transition_continuity(
        out_sample.q, out_sample.x, out_sample.y,
        out_index, out_value, &out_sample.clearance);
}

// Validates the linear interpolation between samples a and b.
// Accepts immediately when the motion bound is negligible or both end points
// have more clearance than the bound. Otherwise evaluates the midpoint,
// accepts when a, mid and b all exceed half the bound, and else recurses on
// both halves. When depth is exhausted or the bound is <= SWEEP_MIN_BOUND, the
// 25% and 75% points are evaluated and the segment is accepted if both pass.
// Any failed evaluation returns false with out_index / out_value set by it.
// NOTE(review): the function is recursive and marked __forceinline; the
// compiler cannot fully inline it - confirm the specifier is intended.
__declspec(noalias) __forceinline bool validate_transition_segment_recursive(
    const TransitionGeomSample& a,
    const TransitionGeomSample& b,
    int depth,
    unsigned& out_index,
    float& out_value) const noexcept {
    constexpr float SWEEP_CLEARANCE_EPS = 1e-4f;
    constexpr float SWEEP_MIN_BOUND = 5e-4f;
    const float motion_bound = transition_segment_motion_bound(a.q, b.q);
    if (motion_bound <= SWEEP_CLEARANCE_EPS) return true;
    const float required_clearance = motion_bound + SWEEP_CLEARANCE_EPS;
    if (a.clearance > required_clearance && b.clearance > required_clearance) return true;

    TransitionGeomSample mid{};
    lerp_state_full_dim(a.q, b.q, 0.5f, mid.q);
    if (!evaluate_state_without_transition_continuity(
        mid.q, mid.x, mid.y, out_index, out_value, &mid.clearance))
        return false;

    const float half_required_clearance = 0.5f * motion_bound + SWEEP_CLEARANCE_EPS;
    if (a.clearance > half_required_clearance &&
        mid.clearance > half_required_clearance &&
        b.clearance > half_required_clearance)
        return true;

    if (depth <= 0 || motion_bound <= SWEEP_MIN_BOUND) {
        // Terminal case: spot-check the quarter points instead of recursing further.
        TransitionGeomSample q25{}, q75{};
        lerp_state_full_dim(a.q, b.q, 0.25f, q25.q);
        if (!evaluate_state_without_transition_continuity(
            q25.q, q25.x, q25.y, out_index, out_value, &q25.clearance))
            return false;
        lerp_state_full_dim(a.q, b.q, 0.75f, q75.q);
        if (!evaluate_state_without_transition_continuity(
            q75.q, q75.x, q75.y, out_index, out_value, &q75.clearance))
            return false;
        return true;
    }

    return validate_transition_segment_recursive(a, mid, depth - 1, out_index, out_value) &&
        validate_transition_segment_recursive(mid, b, depth - 1, out_index, out_value);
}

// Validates the motion from the stored reference state to q. Returns true
// right away when no sweep validation is required. The start sample is built
// (and evaluated) from referenceState; the finish sample reuses the already
// computed end point and clearance of q. Recursion depth is capped at 10.
__declspec(noalias) __forceinline bool evaluate_transition_swept_motion(
    const float* __restrict q,
    float final_x,
    float final_y,
    float final_geom_clearance,
    unsigned& out_index,
    float& out_value) const noexcept {
    if (!transition_sweep_requires_validation()) return true;

    TransitionGeomSample start{};
    if (!build_transition_geom_sample(referenceState, start, out_index, out_value)) return false;

    TransitionGeomSample finish{};
    copy_state_full_dim(q, finish.q);
    finish.x = final_x;
    finish.y = final_y;
    finish.clearance = final_geom_clearance;

    constexpr int SWEEP_MAX_DEPTH = 10;
    return validate_transition_segment_recursive(start, finish, SWEEP_MAX_DEPTH, out_index, out_value);
}

// Full evaluation of state q. Returns false with the first violated
// constraint's slot and value, or true with out_index = feasible_index() and
// out_value = the objective of the current solve mode. out_x / out_y receive
// the end point whenever the pose was computed.
// Order: transition continuity, joint limits + pose constraints, then (in
// Transition mode with a reference) either the joint limits of the reference
// state or the swept-motion validation.
__declspec(noalias) __forceinline bool evaluate_indexed(
    const float* __restrict q,
    float& out_x,
    float& out_y,
    unsigned& out_index,
    float& out_value) const noexcept {
    if (!evaluate_transition_continuity_only(q, out_index, out_value)) return false;

    float final_geom_clearance = FLT_MAX;
    if (!evaluate_state_without_transition_continuity(
        q, out_x, out_y, out_index, out_value, &final_geom_clearance))
        return false;

    if (solveMode == ManipSolveMode::Transition && has_transition_reference_angles()) {
        if (!transition_sweep_requires_validation())
        {
            // No geometry to sweep against: only the reference state's joint limits are checked.
            if (!evaluate_joint_angle_limits_only(
                referenceState, transition_continuity_constraint_count(), out_index, out_value))
                return false;
        } else if (!evaluate_transition_swept_motion(
            q, out_x, out_y, final_geom_clearance, out_index, out_value)) {
            return false;
        }
    }

    out_index = feasible_index();
    if (solveMode == ManipSolveMode::Transition) {
        out_value = compute_transition_objective_from_pose(q, out_x, out_y);
        return true;
    }
    out_value = compute_positioning_objective_from_pose(q, out_x, out_y);
    return true;
}

// Unconstrained objective: computes the end point (no per-link points are
// requested) and returns the objective of the current solve mode.
__declspec(noalias) __forceinline float operator()(
    const float* __restrict q,
    float& out_x,
    float& out_y) const noexcept {
    compute_pose(q, out_x, out_y, nullptr, nullptr);
    if (solveMode == ManipSolveMode::Transition) return compute_transition_objective_from_pose(q, out_x, out_y);
    return compute_positioning_objective_from_pose(q, out_x, out_y);
}
};
Что я бы считал главным исправлением:
- заменить `_mm256_load_ps`/`_store_ps` на внешних или потенциально невыравненных данных на `loadu`/`storeu`;
- проверить выравнивание `ManipCost`, если кодовая база может выделять его через нестандартный allocator.

Если хочешь, я следующим сообщением могу дать ещё более агрессивную версию именно под твой реальный диапазон n — например, отдельно заточенную под n <= 16 или n <= 32, где можно выжать ещё больше.