// the following were split out into separate functions while optimizing; // they could be pushed back up but eh static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A) { float *ee0 = e + i_off; float *ee2 = ee0 + k_off; int i; for (i=n-1; i >= 0; --i) { float k00_20 = ee0[ 0] - ee2[ 0]; float k01_21 = ee0[-1] - ee2[-1]; ee0[ 0] = ee0[ 0] + ee2[ 0]; ee0[-1] = ee0[-1] + ee2[-1]; ee2[ 0] = k00_20 * A[0] - k01_21 * A[1]; ee2[-1] = k01_21 * A[0] + k00_20 * A[1]; A += 8; ee0 -= 2; ee2 -= 2; } } static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1) { int i; float k00_20, k01_21; float *e0 = e + d0; float *e2 = e0 + k_off; for (i=lim; i > 0; i -= 2) { k00_20 = e0[-0] - e2[-0]; k01_21 = e0[-1] - e2[-1]; e0[-0] = e0[-0] + e2[-0]; e0[-1] = e0[-1] + e2[-1]; e2[-0] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-1] = (k01_21)*A[0] + (k00_20) * A[1]; A += k1; k00_20 = e0[-2] - e2[-2]; k01_21 = e0[-3] - e2[-3]; e0[-2] = e0[-2] + e2[-2]; e0[-3] = e0[-3] + e2[-3]; e2[-2] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-3] = (k01_21)*A[0] + (k00_20) * A[1]; e0 -= 4; e2 -= 4; A += k1; } } static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0) { int i; float A0 = A[0]; float A1 = A[0+1]; float A2 = A[0+a_off]; float A3 = A[0+a_off+1]; float k00,k11; float *ee0 = e +i_off; float *ee2 = ee0+k_off; for (i=0; i < n; ++i) { k00 = ee0[ 0] - ee2[ 0]; k11 = ee0[-1] - ee2[-1]; ee0[ 0] = ee0[ 0] + ee2[ 0]; ee0[-1] = ee0[-1] + ee2[-1]; ee2[ 0] = (k00) * A0 - (k11) * A1; ee2[-1] = (k11) * A0 + (k00) * A1; k00 = ee0[-2] - ee2[-2]; k11 = ee0[-3] - ee2[-3]; ee0[-2] = ee0[-2] + ee2[-2]; ee0[-3] = ee0[-3] + ee2[-3]; ee2[-2] = (k00) * A2 - (k11) * A3; ee2[-3] = (k11) * A2 + (k00) * A3; ee0 -= k0; ee2 -= k0; } } typedef struct { float data[4]; } f4; static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype) { int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; int n3_4 = n - n4, ld; uint16 *bitrev = f->bit_reverse[blocktype]; // @OPTIMIZE: reduce register pressure by using fewer variables? int save_point = temp_alloc_save(f); float *v = (float *) temp_alloc(f, n2 * sizeof(*v)); float *u,*z; // twiddle factors float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype]; // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio" // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old'. // copy and reflect spectral data u = buffer; #define UMIRROR(z) (-u[n-(z)-1]) // kernel from paper // step 1, combined with step 0 (omitted above) z = v+n2; for (k2=k4=0; k2 < n4; k2+=2, k4+=4) { float u01 = u[k4+0] - UMIRROR(n-k4-1); float u23 = u[k4+2] - UMIRROR(n-k4-3); z[-k2-1] = u01 * A[k2] - u23*A[k2+1]; z[-k2-2] = u01 * A[k2+1] + u23*A[k2]; } for (; k2 < n2; k2+=2, k4+=4) { float u01 = UMIRROR(k4+0) - u[n-k4-1]; float u23 = UMIRROR(k4+2) - u[n-k4-3]; z[-k2-1] = u01 * A[k2 ] - u23*A[k2+1]; z[-k2-2] = u01 * A[k2+1] + u23*A[k2]; } // step 2 (paper output is w, now u) { int n4k2 = n4; float *AA = &A[n2-4]; for (k2=0; n4k2 < n2; k2 += 2) { float v41_21 = v[n4k2+1] - v[k2+1]; float v40_20 = v[n4k2+0] - v[k2+0]; u[n4k2+1] = v[n4k2+1] + v[k2+1]; u[n4k2+0] = v[n4k2+0] + v[k2+0]; u[k2+1] = v41_21*AA[0] - v40_20*AA[1]; u[k2+0] = v40_20*AA[0] + v41_21*AA[1]; AA -= 4; n4k2 += 2; } } // step 3 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions // optimized step 3: // the original step3 loop can be nested r inside s or s inside r; // it's written originally as inside r, but this is dumb when r // iterates many times, and s few. So I have two copies of it and // switch between them halfway. // now, to optimize the s-inside loop: // 1. optimize away all the index calculations, using // two separate pointers each for d & e // 2. move off the last 'l' iteration to its own copy // 3. in the main one, rlim is now always odd; unroll r loop once // 4. move the first update of r values after second copy (by updating all references to it) // 5. rename all variables from second copy // 6. move all variables up into first loop (the point of this strategy is // that they're actually all small deltas away) // 7. condense down to the small deltas (now we're updating 8 items per iteration) // further split: we split off the first iterationsand the second-to-last iteration // separately (a la item #2 above) // this is iteration 0 of step 3 for (i=0; i < 2; ++i) imdct_step3_iter0_loop(n >> 4, u, n2-1-(n>>2)*i, -(n >> 3), A); l = 1; for (; l < (ld-3)>>1; ++l) { int k0 = n >> (l+2), k0_2 = k0>>1; int lim = 1 << (l+1); for (i=0; i < lim; ++i) imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3)); } for (; l < ld-4; ++l) { int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1; int rlim = n >> (l+4), r; int k_off = -k0_2 * sizeof(float); int lim = 1 << (l+1); for (r=0; r < rlim;) { float *A0 = &A[r*k1]; imdct_step3_inner_s_loop(lim, u, n2-1-r*2, -k0_2, A0, k1, k0); r+=2; } } // this is iteration ld-4 (the ld-3'th iteration) of step #3 //l = ld-4; { float A0 = A[0]; float A1 = A[1]; float *a = u+n2-1; while (a > u) { float x02,x13; x02 = a[-0] - a[-2]; x13 = a[-1] - a[-3]; a[-0] = a[-0] + a[-2]; a[-1] = a[-1] + a[-3]; a[-2] = x02 * A0 - x13 * A1; a[-3] = x13 * A0 + x02 * A1; x02 = a[-4] - a[-6]; x13 = a[-5] - a[-7]; a[-4] = a[-4] + a[-6]; a[-5] = a[-5] + a[-7]; a[-6] = x02 * A0 - x13 * A1; a[-7] = x13 * A0 + x02 * A1; a -= 8; } } // output is u // step 4, 5, and 6 for (i=0; i < n8; ++i) { int j = bitrev[i]; int k2 = i << 1, k4 = j << 2; v[n2-1-k2] = u[k4+0]; v[n2-2-k2] = u[k4+1]; v[n4-1-k2] = u[k4+2]; v[n4-2-k2] = u[k4+3]; } // (paper output is u, now v) // input is v // step 7 (paper output is v, now v) // this is now in place for (k2=0; k2 < n4; k2 += 2) { int pp = n2 - k2; float vk20 = v[k2+0]; float vk21 = v[k2+1]; float vpp1 = v[pp-1]; float vpp2 = v[pp-2]; float vv_20_p2 = vk20-vpp2; float vv_21_p1 = vk21+vpp1; vk20 += vpp2; vk21 -= vpp1; vpp1 = C[k2+1]*vv_20_p2 + C[k2]*vv_21_p1; vpp2 = C[k2+1]*vv_21_p1 - C[k2]*vv_20_p2; v[k2+0] = ( vk20 + vpp1); v[pp-2] = ( vk20 - vpp1); v[k2+1] = ( vk21 + vpp2); v[pp-1] = ( vpp2 - vk21); } // step 8 (paper output is X, now v) // this is in-place writing to k*2,k*2+1 instead of k,n2-1-k; // the final 'decode kernel' step then fetches from these spots // doing this lets us push steps 6+7 into step 5 for (k=0; k < n4; ++k) { float p0 = v[k*2 ]*B[k*2 ] + v[k*2+1]*B[k*2+1]; float p1 = v[k*2 ]*B[k*2+1] - v[k*2+1]*B[k*2 ]; v[k*2 ] = p0; // was v[k] v[k*2+1] = p1; // was v[n2-1-k] } #define vlow(x) v[(x)*2] #define vhi(x) v[(n2-1-(x))*2+1] // temp[k*2+1] = v[n2-1-k] // temp[y] = v[x] // y=k*2+1 // x=n2-1-k // k=n2-1-x // y=(n2-1-x)*2+1 // decode kernel to output // there's a scale value that's been hoisted into the B[] array; that // value was determined experimentally (first making inverse_mdct_slow work, // then making this match that). No factor of 1/n. for (i=0; i < n4 ; ++i) buffer[i] = vhi(i+n4); for ( ; i < n2 ; ++i) buffer[i] = -vhi(n3_4 - i - 1); for ( ; i < n3_4; ++i) buffer[i] = -vlow(n3_4 - i - 1); for ( ; i < n ; ++i) buffer[i] = -vlow(i - n3_4); temp_alloc_restore(f,save_point); #undef n_2 #undef n_4 #undef n_8 }