m8ta
You are not authenticated, login. |
|
{438} | ||
I needed to evaluate different fixed-point filters & implement them in a way which would more-or-less easily transfer to hand-coded blackfin assembly. Hence, I wrote a small program in C to test two likely alternatives: a 5th order elliptic filter, with 82db stopband rejection, or a 4th order elliptic, with 72 db stopband rejection. see {421} also, and this useful reference. (also at {584}) UPDATE: this filter is not stable/ suitable for the 1.15 signed fraction format of the blackfin processor. As in the reference, you must use a non-canonic form which puts the numerator before the denominator. I had a great deal of difficulty trying to determine why the output of the Direct Form II filter was giving crap results with poles/zeros designed for lowpass operation. The reason appears to be that the Direct FormII filter requires dynamic range > +-1 for the w's (first feedback section). The numerator / b coefficients for these highpass filters is usually [1 -2 1] - a highpass - which returns 0 when passed saturated data. See the signal flowchart below. The program for implementing these filters as cascaded biquad/ triquads follows. The method of generating the filter coefficients is in the comments. Note that the filter coefficients for the triquad exceed and absolute magnitude of 2, hence the triquad implementation has a fixed-point format of s.2.13 (one bit sign, 2 bits whole part, 13 bits fractional part). The biquad has s.1.14 format - one bit for the whole part. Also note that I incorporated a scale factor of x2 into the first stage, and x4 into the second stage, as the output of the ADC is only 12 bits and we can afford to expand it to the full range. The filter responses, as designed (in floating point). 5th order is on the left, 4th on the right. The filter output to white noise, in the time domain. 5th order has, in this impementation, a 'softer shoulder', which i do not think is appropriate for this application. The same output in fourier space, confirming the softer shoulder effect. this was actually kinda unespected .. I thought an extra pole/zero would help! I guess the triquad & lower res coefficient quantization is less efficient.. ? #include <stdio.h> // gcc -Wall filter_test.c -o filter_test //need to test fixed-point filtering in C (where it is easy) //before converting to hand-coded assembly. short w1_1[2]= {0,0}; short w1_2[3]= {0,0,0}; short w2_1[2]= {0, 0}; short w2_2[2]= {0, 0}; /* filter 1: 5th order. biquad then a triquad. [B1, A1] = ellip(5,0.2,84, 6/31.25); %84db = 14 bits. rb = roots(B1); ra = roots(A1); pa1 = poly(ra(1:2)); pa2 = poly(ra(3:5)); pb1 = poly(rb(1:2)); pb2 = poly(rb(3:5)); b1_1 = round(pb1*sqrt(B1(1))*2^15) a1_1 = round(pa1* 2^14)* -1 b1_2 = round(pb2*sqrt(B1(1))*2^15) a1_2 = round(pa2*2^13)*-1 % triquad. */ //biquad1: short b1_1[3] = {1199, 87, 1199}; short a1_1[2] = {24544 -14085}; //triquad1: short b1_2[4] = {1199, 2313, 2313, 1199}; short a1_2[3] = {18090, -14160, 3893}; /*filter 2: can be implemented by 2 biquads. [B3, A3] = ellip(4,0.5,72, 6/31.25); rb = roots(B3); ra = roots(A3); pa1 = poly(ra(1:2)); pa2 = poly(ra(3:4)); pb1 = poly(rb(1:2)); pb2 = poly(rb(3:4)); b2_1 = round(pb1*sqrt(B3(1))*2^15) a2_1 = round(pa1* 2^14)* -1 b2_2 = round(pb2*sqrt(B3(1))*2^16) %total gain = 8. a2_2 = round(pa2*2^14)*-1 */ //biquad2: short b2_1[3] = {2082, 914, 2082}; short a2_1[2] = {24338, -13537}; //biquad2 (2): short b2_2[3] = {4163, 6639, 4163}; short a2_2[2] = {24260, -9677}; int madd_sat(int acc, short a, short b){ //emulate the blackfin //signed-integer mode of operating. page 567 in the programming reference. int c; long long l, lo, hi; lo = (long long)(-2147483647); hi = (long long)(2147483647); c = (int)a * (int)b; l = (long long)acc + (long long)c; if(l < lo) l = lo; if(l > hi) l = hi; acc = (int)l; return acc; } //need to deal with samples one at a time, as they come in, from different channels //need to quantize the coeficients //need to utilize a biquad topology / filter structure. // (this is simple - just segregate the poles. order 5 filter req. biquad & triquad. short filter_biquad(short in, short* a1, short* b1, short* w1){ int acc; short out, w; //in varies from 0 to 0x0fff - ADC is straight binary. (0 to 4095). unsigned. acc = madd_sat(0, in, 16384); acc = madd_sat(acc, w1[0], a1[0]); acc = madd_sat(acc, w1[1], a1[1]); w = (short)(acc >> 14); //hopefully this does sign-extended shift. acc = madd_sat(0, w, b1[0]); acc = madd_sat(acc, w1[0], b1[1]); acc = madd_sat(acc, w1[1], b1[0]); //symmetry. out = (short)(acc >> 14); w1[1] = w1[0]; w1[0] = w; return out; } short filter_triquad(short in, short* a2, short* b2, short* w2){ int acc; short out, w; //this one is a bit different, as the coefficients are > 2 in the denom. // hence have to normalize by a different fraction. //multiply the numerator by 4 for a net gain of 8. acc = madd_sat(0 , in, 8192); acc = madd_sat(acc, w2[0], a2[0]); acc = madd_sat(acc, w2[1], a2[1]); acc = madd_sat(acc, w2[2], a2[2]); w = acc >> 13; acc = madd_sat(0 , w, b2[0]); acc = madd_sat(acc, w2[0], b2[1]); acc = madd_sat(acc, w2[1], b2[1]); acc = madd_sat(acc, w2[2], b2[0]); //symmetry. out = (short)(acc >> 13); w2[2] = w2[1]; w2[1] = w2[0]; w2[0] = w; return out; } int main( int argv, char* argc[]){ //read the samples from stdin. int in, out; while(scanf("%d", &in)){ //compare 5th and 4th order filters directly. in -= 2048; out = filter_biquad(in, a1_1, b1_1, w1_1); out = filter_triquad(out, a1_2, b1_2, w1_2); printf("%d ", out); out = filter_biquad(in, a2_1, b2_1, w2_1); out = filter_biquad(out, a2_2, b2_2, w2_2); printf("%d ", out); } return 0; } // gcc -Wall filter_test.c -o filter_test // cat filter_test_in.dat | ./filter_test > filter_test_out.dat Below, some matlab code to test the filtering. % make some noisy data. x = randn(2000, 1); x = x .* 1000; x = x + 2048; i = find(x >= 4096) x(i) = 4095; i = find(x < 0); x(i) = 0; x = round(x); fid = fopen('filter_test_in.dat', 'w'); for k = 1:length(x) fprintf(fid, '%d ', x(k)); end fprintf(fid, 'quit'); fclose(fid); Here is a blackfin assembly implementation of the filter, Note the filter weights (i0) and delays (i1, i2) need to be on two different cache/sram banks, or the processor will stall. Also note that , as per the second diagram above, the delays for biquad 1 output are synonymous to the delays for biquad 2 input, hence they are only represented once in memory. /* directform 1 biquad now, form II saturates 1.15 format. operate on the two samples in parallel (both in 1 32bit reg). r0 x(n) -- the input from the serial bus. r1 x(n-1) (yn-1) -- ping-pong the delayed registers. r2 x(n-2) (yn-2) -- do this so save read cycles. r3 y(n-1) (xn-1) r4 y(n-2) (xn-2) r5 b0 b1 -- (low high) r6 a0 a1 i0 reads the coeficients into the registers; it loops every 32 bytes (16 coef, 4 biquads) i1 reads the delays. it loops every 640 bytes = 10 delays * 4 bytes/delay * 16 stereo channels. only increments. i2 writes the delays, loops every 640 bytes. also only increments. if i1 and i2 are dereferenced in the same cycle, the processor will stall -- each of the 1k SRAM memory banks has only one port. format of delays in memory: [x1(n-1) , x1(n-2) , x2(n-1) aka y1(n-1) , x2(n-2) aka y1(n-2) , x3(n-1) aka y2(n-1) , x3(n-2) aka y2(n-2) , x4(n-1) aka y3(n-1) , x4(n-2) aka y3(n-2) , y4(n-1) , y4(n-2) ] --that's 10 delays, 4 bytes each. */ r5 = [i0++] || r1 = [i1++]; a0 = r0.l * r5.l , a1 = r0.h * r5.l || r6 = [i0++] || [i2++] = r0; a0 += r1.l * r5.h, a1 += r1.h * r5.h || r2 = [i1++] ; a0 += r2.l * r5.l, a1 += r2.h * r5.l || r3 = [i1++] ; a0 += r3.l * r6.l, a1 += r3.h * r6.l || r4 = [i1++] ; r0.l = (a0 += r4.l * r6.h), r0.h = (a1 += r4.h * r6.h) (s2rnd) || [i2++] = r1; r5 = [i0++] || [i2++] = r0; a0 = r0.l * r5.l, a1 += r0.h * r5.l || r6 = [i0++] || [i2++] = r3; a0 += r3.l * r5.h, a1 += r3.h * r5.h || r1 = [i1++]; a0 += r4.l * r5.l, a1 += r4.h * r5.l || r2 = [i1++]; a0 += r1.l * r6.l, a1 += r1.h * r6.l; r0.l = (a0 += r2.l * r6.h), r0.h = (a1 += r2.h * r6.h) (s2rnd); r5 = [i0++] || [i2++] = r0; a0 = r0.l * r5.l, a1 = r0.h * r5.l || r6 = [i0++] || [i2++] = r1; a0 += r1.l * r5.h, a1 += r1.h * r5.h || r3 = [i1++]; a0 += r2.l * r5.l, a1 += r2.h * r5.l || r4 = [i1++]; a0 += r3.l * r6.l, a1 += r3.h * r6.l; r0.l = (a0 += r4.l * r6.h), r0.h = (a1 += r4.h * r6.h) (s2rnd); r5 = [i0++] || [i2++] = r0; a0 = r0.l * r5.l, a1 += r0.h * r5.l || r6 = [i0++] || [i2++] = r3; a0 += r3.l * r5.h, a1 += r3.h * r5.h || r1 = [i1++]; a0 += r4.l * r5.l, a1 += r4.h * r5.l || r2 = [i1++]; a0 += r1.l * r6.l, a1 += r1.h * r6.l; r0.l = (a0 += r2.l * r6.h), r0.h = (a1 += r2.h * r6.h) (s2rnd); [i2++] = r0; //save the delays. [i2++] = r1; //normally this would be pipelined. |