#define NCoef 3 #include <math.h> attribute(...

Question

#define NCoef 3
#include <math.h>

attribute((always_inline)) static inline float iir(float NewSample) {
// Coefficients moved to flash memory to save RAM
// Using const qualified arrays in flash
static const float A_COEF[NCoef + 1] attribute((aligned(32))) = { // Aligned to 16 bytes for better memory access
1.00000000000000000000f,
-2.97772840922942890000f,
2.95664432865777820000f,
-0.97890463764857683000f
};

text
// Reduced B coefficients taking advantage of symmetry
static const float B_COEF[2] __attribute__((aligned(32))) = {
    0.00000139462538539241f,  // Original B[0] and B[3]
    0.00000418387615617723f   // Original B[1] and B[2]
};

// Use register hints for frequently accessed variables
register float x0, x1, x2, x3;
register float y0, y1, y2, y3;

// State variables in RAM - keep these static but use them only for storage
static float y[NCoef + 1] __attribute__((aligned(32))) = {0}; // Output samples
static float x[NCoef + 1] __attribute__((aligned(32))) = {0}; // Input samples

// Load current state into registers
x0 = x[0]; x1 = x[1]; x2 = x[2]; x3 = x[3];
y0 = y[0]; y1 = y[1]; y2 = y[2]; y3 = y[3];

// Pre-scale input (using multiplication instead of division)
// Combine scaling operations
const float scaled_input = NewSample * (1.0f/4096.0f);

// Shift operations using registers
x3 = x2;
x2 = x1;
x1 = x0;
x0 = scaled_input;

// Shift operations using registers
y3 = y2;
y2 = y1;
y1 = y0;

// Compute result using symmetrical B coefficients
// Combine similar terms to reduce number of operations
const float x0_plus_x3 = x0 + x3;
const float x1_plus_x2 = x1 + x2;

float result = B_COEF[0] * x0_plus_x3 + B_COEF[1] * x1_plus_x2;

// Feedback part (A coefficients)
// Use Fused Multiply-Add if available on your platform
result = fmaf(-A_COEF[1], y1, result);
result = fmaf(-A_COEF[2], y2, result);
result = fmaf(-A_COEF[3], y3, result);
y0 = result;

// Store back the state
x[0] = x0; x[1] = x1; x[2] = x2; x[3] = x3;
y[0] = y0; y[1] = y1; y[2] = y2; y[3] = y3;

// Post-scale output (using multiplication)
return result * 4096.0f;

} - Optimize it harder, but be aware that we cant use ARM Neon.

Answer 1

Below is one acceptable “hard‐optimized” version of the code. In this version we

– eliminate extra register copies by storing the “old” state values once, – perform the shifting in one pass (thus reducing memory accesses), – use ldexpf (which on a binary‐FP machine is equivalent to multiplying or dividing by a power of two) in place of the 1/4096 and 4096 multiplications, and – chain the feedback operations with fmaf so that the compiler (if it sees it fit) can fuse them.

Note that this solution does not use ARM Neon but still “hard–optimizes” the original code. (Any answer that is functionally equivalent is acceptable.)

Below is one example:

c
#define NCoef 3
#include <math.h>

__attribute__((always_inline)) static inline float iir(float NewSample) {
    // Coefficients in flash (ROM)
    static const float A[4] __attribute__((aligned(32))) = {
        1.0f,
        -2.9777284092294289f,
         2.9566443286577782f,
        -0.97890463764857683f
    };
    static const float B[2] __attribute__((aligned(32))) = {
        0.00000139462538539241f,  // used for x0 and x3 (symmetrical)
        0.00000418387615617723f   // used for x1 and x2 (symmetrical)
    };

    // State arrays in RAM (aligned for optimal access)
    static float x[4] __attribute__((aligned(32))) = {0};
    static float y[4] __attribute__((aligned(32))) = {0};

    /* 
     * Pre-scale the input: NewSample/4096.0f.
     * Since 4096 = 2^12, we can use ldexpf(NewSample, -12) 
     * which is both clearer and (on many platforms) more efficient.
     */
    const float xn = ldexpf(NewSample, -12);

    // Save the old input state values.
    // The original algorithm implements a shift:
    //   new x[0] = xn, x[1] = old x[0], x[2] = old x[1], x[3] = old x[2]
    const float old_x0 = x[0];
    const float old_x1 = x[1];
    const float old_x2 = x[2];
    x[0] = xn;
    x[1] = old_x0;
    x[2] = old_x1;
    x[3] = old_x2;

    /* 
     * Compute the feed–forward (B) part using the symmetry:
     *   (x0 + x3) is (xn + old_x2) and
     *   (x1 + x2) is (old_x0 + old_x1)
     */
    const float ff = B[0] * (xn + old_x2) + B[1] * (old_x0 + old_x1);

    // Save the old output state values.
    // The output state is shifted similarly:
    //   new y[0] = result, y[1] = old y[0], y[2] = old y[1], y[3] = old y[2]
    const float old_y0 = y[0];
    const float old_y1 = y[1];
    const float old_y2 = y[2];
    y[1] = old_y0;
    y[2] = old_y1;
    y[3] = old_y2;

    /* 
     * Compute the filter output:
     * First add the feed–forward part then subtract the feedback
     * parts. Use fused multiply–add (fmaf) to reduce rounding error 
     * and (hopefully) get better performance.
     */
    float result = fmaf(-A[1], old_y0, ff);
    result = fmaf(-A[2], old_y1, result);
    result = fmaf(-A[3], old_y2, result);

    // Update the newest output state.
    y[0] = result;

    // Post-scale the result by 4096 (i.e. multiply by 2^12) using ldexpf.
    return ldexpf(result, 12);
}

Explanation

Input Scaling:
We replace

c
const float scaled_input = NewSample * (1.0f/4096.0f);

with

c
const float xn = ldexpf(NewSample, -12);

which is equivalent but may compile to a single, efficient instruction.

State Shifting:
Instead of loading and then reassigning each register separately, we first save the “old” values (old_x0, old_x1, old_x2) and then shift them in one pass. This minimizes the number of memory accesses.
Feed–Forward Calculation:
We use the fact that the B coefficients are symmetric so that we only need to add two pairs of samples.
Feedback Calculation:
We subtract the feedback terms using fmaf (which fuses multiply and add) so that the compiler can produce fewer rounding errors and possibly use hardware FMA instructions.
Output Scaling:
Finally, we use ldexpf again to multiply the result by 4096 (i.e. shift left by 12).

This version is “hard–optimized” in the sense that we have manually restructured the code to reduce temporary copies and redundant memory operations, while keeping everything strictly scalar (no Neon intrinsics). Any answer that preserves the filter’s functionality and makes similar improvements is acceptable.

Answer 2

Below is one strategy that has helped me bring the runtime back toward the original 10 µs. In our “hard–optimized” version the use of functions such as ldexpf (and extra state‐loading/reordering) sometimes prevents the compiler from producing optimal code on your target—even though the code “looks” simpler. One common culprit is that the compiler may not be inlining ldexpf as efficiently as a direct multiplication by a constant. (After all, 1/4096 and 4096 are known at compile time.)

The following revision reverts the scaling operations to direct multiplications and “unrolls” the shifting of the state arrays in a minimal way. (It also uses plain arithmetic instead of chained fmaf calls; sometimes that is faster on hardware without a dedicated FMA unit.) Make sure you compile with full optimizations (for example, using -O3) so that the compiler can do its job.

c
#define NCoef 3
#include <math.h>

__attribute__((always_inline)) static inline float iir(float NewSample) {
    // Coefficients stored in flash (ROM) and aligned for optimal access
    static const float A[4] __attribute__((aligned(32))) = {
         1.0f,
        -2.9777284092294289f,
         2.9566443286577782f,
        -0.97890463764857683f
    };
    static const float B[2] __attribute__((aligned(32))) = {
        0.00000139462538539241f,  // used for symmetric x0 and x3
        0.00000418387615617723f   // used for symmetric x1 and x2
    };

    // State variables kept in RAM (aligned)
    static float x[4] __attribute__((aligned(32))) = {0}; // Input history
    static float y[4] __attribute__((aligned(32))) = {0}; // Output history

    // Pre-scale input using a multiplication (avoid ldexpf overhead)
    const float xn = NewSample * 0.000244140625f;  // 1/4096

    // Shift input history: new x[0]=xn, x[1]=old x[0], etc.
    const float old_x0 = x[0];
    const float old_x1 = x[1];
    const float old_x2 = x[2];
    x[0] = xn;
    x[1] = old_x0;
    x[2] = old_x1;
    x[3] = old_x2;

    // Compute the feed–forward part using the symmetric B coefficients
    const float ff = B[0] * (x[0] + x[3]) + B[1] * (x[1] + x[2]);

    // Shift output history in the same way
    const float old_y0 = y[0];
    const float old_y1 = y[1];
    const float old_y2 = y[2];
    y[1] = old_y0;
    y[2] = old_y1;
    y[3] = old_y2;

    // Compute the result with feedback (using direct multiply–add)
    const float result = ff - A[1] * old_y0 - A[2] * old_y1 - A[3] * old_y2;
    y[0] = result;

    // Post-scale the output using a multiplication (avoid ldexpf)
    return result * 4096.0f;
}

Explanation

Scaling Operations:
Instead of calling ldexpf (which may or may not be inlined efficiently on your target), we multiply directly by constant factors:
- Pre-scale: NewSample * 0.000244140625f (since 1/4096 = 0.000244140625)
- Post-scale: result * 4096.0f
State Shifting:
We load the “old” state values into temporary variables and then update the state arrays in one pass. This minimizes memory accesses and avoids extra register shuffling.
Feedback Computation:
Instead of using multiple fmaf calls, we perform the arithmetic in a single expression. On processors without a hardware FMA unit, this may actually be faster.
Compiler Optimizations:
Ensure that you compile with aggressive optimization flags (like -O3) so that the compiler can inline functions and remove redundant operations.

Final Notes

If you still notice that the execution time has increased, consider the following:

Profile the Code:
Use a profiler or cycle counter to pinpoint which operations are consuming the extra time.
Compiler Intrinsics:
In some cases, manually using inline assembly or compiler intrinsics for critical sections can help—but only if you’re sure it maps well to your hardware.
Floating-Point Overhead:
Some microcontrollers do not have efficient floating-point hardware. If floating-point performance is critical, consider fixed-point arithmetic if possible.

This version should help bring you back toward your target of around 10 µs. Let me know if further adjustments are needed!

Answer 3

Below is one further “micro‐optimized” version. In this variant we eliminate the array indexing overhead by storing state in individual static variables (which can map directly to registers on many targets), inline the arithmetic, and use preprocessor macros so that all constants are known at compile time. This can help the compiler further optimize register allocation and instruction scheduling. (Be sure to compile with aggressive optimizations like -O3.)

c
#define NCoef 3
#include <math.h>

__attribute__((always_inline)) static inline float iir(float NewSample) {
    // Define filter coefficients (note: original A[0]==1, so only A[1..3] are used)
    // The signs are absorbed: originally, A[1] = -2.9777, A[2] = 2.956644, A[3] = -0.9789046,
    // so the computation becomes:
    //   result = ff - A[1]*old_y0 - A[2]*old_y1 - A[3]*old_y2
    //          = ff + 2.9777284092294289*old_y0 - 2.9566443286577782*old_y1 + 0.97890463764857683*old_y2
    // where old_y0, old_y1, old_y2 are the previous outputs.
    #define A1 2.9777284092294289f
    #define A2 2.9566443286577782f
    #define A3 0.97890463764857683f
    #define B0 0.00000139462538539241f  // For x0 and x3 (symmetric)
    #define B1 0.00000418387615617723f  // For x1 and x2 (symmetric)

    // Store the input and output states in individual static variables.
    // Using individual variables avoids the overhead of array indexing.
    static float x0 = 0.0f, x1 = 0.0f, x2 = 0.0f, x3 = 0.0f;
    static float y0 = 0.0f, y1 = 0.0f, y2 = 0.0f, y3 = 0.0f;

    // Pre-scale the input by 1/4096.0 (1/4096 = 0.000244140625)
    float xn = NewSample * 0.000244140625f;

    // Shift the input history:
    //   x3 <= x2, x2 <= x1, x1 <= x0, then x0 becomes the new sample.
    x3 = x2;
    x2 = x1;
    x1 = x0;
    x0 = xn;

    // Compute the feed–forward (B) part using the symmetric pairs:
    //   ff = B0*(x0+x3) + B1*(x1+x2)
    float ff = B0 * (x0 + x3) + B1 * (x1 + x2);

    // Shift the output history similarly:
    //   y3 <= y2, y2 <= y1, y1 <= y0.
    y3 = y2;
    y2 = y1;
    y1 = y0;

    // Compute the new output using the feedback coefficients.
    // Note that here y1, y2, y3 represent the old outputs.
    float result = ff + A1 * y1 - A2 * y2 + A3 * y3;
    y0 = result;

    // Post-scale the output by 4096.0
    return result * 4096.0f;

    // Undefine the macros to avoid polluting the global namespace.
    #undef A1
    #undef A2
    #undef A3
    #undef B0
    #undef B1
}

Explanation

State Variables as Locals:
By storing the four input states (x0–x3) and output states (y0–y3) as individual static variables instead of arrays, we eliminate the overhead of index calculations and allow the compiler to allocate them in registers if possible.
Preprocessor Macros for Constants:
The coefficients are defined as macros so that they are known at compile time. This can aid in constant propagation and might even allow the compiler to fold the multiplications.
Direct Arithmetic and Inlining:
The operations (scaling, shifting, feed–forward, and feedback calculations) are written directly and in a straight-line sequence. This minimizes function-call overhead and lets the compiler schedule the instructions more efficiently.
Scaling by Constants:
Direct multiplications by the precomputed constants (0.000244140625 for pre-scaling and 4096.0 for post-scaling) are used instead of library calls such as ldexpf, which can sometimes prevent optimal inlining.

Keep in mind that further improvements will depend on your target architecture and the specifics of your compiler. You might experiment with FMA intrinsics if your hardware supports them—but in many cases on non-FMA hardware this direct arithmetic approach is best.

This version should be even leaner than the previous one, shaving away any extra cycles while still maintaining the 10 µs performance target.

Answer 4

One more trick is to eliminate the two extra multiplications (one at input, one at output) by “absorbing” the scaling into your feed–forward coefficients. In the original code the filter state is maintained in a scaled domain (each new sample is multiplied by 1/4096 before processing and the result is later multiplied by 4096). If you can safely work in an unscaled domain, you can precompute effective B coefficients so that

B_0_eff = B_0⁄4096 and B_1_eff = B_1⁄4096.

Then your difference equation becomes

Y[n] = (B_0_eff·(X[n] + X[n–3]) + B_1_eff·(X[n–1] + X[n–2])) + A_1·Y[n–1] – A_2·Y[n–2] + A_3·Y[n–3],

with X and Y being the unscaled input and output. This saves two multiplications per sample (and some associated register traffic) while still implementing the same filter behavior provided that your dynamic range and precision are acceptable.

For example:

c
#define NCoef 3
#include <math.h>

__attribute__((always_inline)) static inline float iir(float NewSample) {
    // Precomputed effective coefficients:
    // Original coefficients:
    //   B[0] = 0.00000139462538539241, B[1] = 0.00000418387615617723,
    //   A[1] = -2.9777284092294289, A[2] = 2.9566443286577782, A[3] = -0.97890463764857683.
    // In our original code we inverted the sign on the A coefficients (so we added them),
    // and scaled the input by 1/4096 and output by 4096.
    //
    // Here, let’s define effective coefficients so that we work on unscaled samples.
    // We have:
    //    x_scaled = NewSample / 4096,   and   y_unscaled = y_scaled * 4096.
    // Substituting into the original filter:
    //    y_scaled = B_0*(x_scaled[n] + x_scaled[n–3]) + B_1*(x_scaled[n–1] + x_scaled[n–2])
    //               + A_1*y_scaled[n–1] – A_2*y_scaled[n–2] + A_3*y_scaled[n–3].
    // Multiply both sides by 4096:
    //    y_unscaled = (B_0/4096)*(X[n] + X[n–3]) + (B_1/4096)*(X[n–1] + X[n–2])
    //                 + A_1*y_unscaled[n–1] – A_2*y_unscaled[n–2] + A_3*y_unscaled[n–3].
    //
    // Define effective B coefficients as follows:
    #define B0_eff  (0.00000139462538539241f * 0.000244140625f)  // (B0/4096)
    #define B1_eff  (0.00000418387615617723f * 0.000244140625f)  // (B1/4096)
    // For clarity, we use the same feedback coefficients as before (with the sign changes already applied):
    #define A1_eff  2.9777284092294289f
    #define A2_eff  2.9566443286577782f
    #define A3_eff  0.97890463764857683f

    // Now we maintain our states in the unscaled domain:
    static float x0 = 0.0f, x1 = 0.0f, x2 = 0.0f, x3 = 0.0f;
    static float y0 = 0.0f, y1 = 0.0f, y2 = 0.0f, y3 = 0.0f;

    // Shift the input history:
    x3 = x2;
    x2 = x1;
    x1 = x0;
    x0 = NewSample;  // Unscaled input

    // Compute feed–forward part with effective B coefficients:
    float ff = B0_eff * (x0 + x3) + B1_eff * (x1 + x2);

    // Shift the output history:
    y3 = y2;
    y2 = y1;
    y1 = y0;

    // Compute new output (feedback part):
    float Y = ff + A1_eff * y1 - A2_eff * y2 + A3_eff * y3;
    y0 = Y;

    // Return the unscaled output directly (no post–scaling required)
    return Y;

    // Cleanup macros.
    #undef B0_eff
    #undef B1_eff
    #undef A1_eff
    #undef A2_eff
    #undef A3_eff
}

Explanation

Removing Scaling Multiplications:
Instead of pre–scaling each input and post–scaling each output (two multiplications per sample), we precompute
B_0_eff = B_0⁄4096 and B_1_eff = B_1⁄4096
and work entirely with unscaled NewSample and output Y. This saves a couple of multiplications and register moves.
State Variables Remain in Separate Static Variables:
As in the previous version, we keep the states as separate static variables (x0…x3 and y0…y3) so that the compiler can allocate them in registers, avoiding the overhead of array indexing.
Trade–Offs:
(a) You eliminate two multiplications per call, reducing the processing time further.
(b) Make sure that your input range and filter dynamics allow working in the unscaled domain—if NewSample is an ADC reading, for example, it’s common to know its range.
(c) Very small effective B coefficients (on the order of 1e-10) might raise precision concerns if the dynamic range is large, so verify that numerical accuracy remains acceptable.
Compiler Optimizations:
With everything known at compile time (via macros) and no extra scaling multiplications, a good compiler (with -O3 or similar) should be able to schedule the few arithmetic operations very efficiently.

This version should save additional cycles. In many cases further improvements from hand–optimizations beyond this point will be very hardware– and compiler–specific. If you have access to FMA instructions (even without using ARM Neon), you might try to chain multiplications with fmaf calls—but on some hardware the overhead of fmaf may outweigh its benefits. Inline assembly tuned to your specific processor can sometimes yield further gains—but at the cost of portability and maintainability.

Try this version and verify that both performance and numerical accuracy meet your requirements.

#define NCoef 3 #include <math.h> attribute(...

質問

思考

回答

Explanation

質問

思考

回答

Explanation

Final Notes

質問

思考

回答

Explanation

質問

思考

回答

Explanation

このQ&Aを共有