typedef struct { float data[4]; } f4; static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype) { int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; int n3_4 = n - n4, ld; uint16 *bitrev = f->bit_reverse[blocktype]; // @OPTIMIZE: reduce register pressure by using fewer variables? int save_point = temp_alloc_save(f); float *v = (float *) temp_alloc(f, n2 * sizeof(*v)); float *u,*d,*e,*z; // twiddle factors float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype]; // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio" // Note there are bugs in that pseudocode, presumably due to them attempting // to rename the arrays nicely rather than representing the way their actual // implementation bounces buffers back and forth. As a result, even in the // "some formulars corrected" version, a direct implementation fails. These // are noted below as "paper bug". // copy and reflect spectral data u = buffer; #define UMIRROR(z) (-u[n-(z)-1]) // kernel from paper // step 1 z = v+n2; for (k=k2=k4=0; k2 < n4; k+=1, k2+=2, k4+=4) { z[-k2-1] = (u[k4] - UMIRROR(n-k4-1)) * A[k2] - (u[k4+2] - UMIRROR(n-k4-3))*A[k2+1]; z[-k2-2] = (u[k4] - UMIRROR(n-k4-1)) * A[k2+1] + (u[k4+2] - UMIRROR(n-k4-3))*A[k2]; } for (; k < n4; k+=1, k2+=2, k4+=4) { z[-k2-1] = (UMIRROR(k4) - u[n-k4-1]) * A[k2] - (UMIRROR(k4+2) - u[n-k4-3])*A[k2+1]; z[-k2-2] = (UMIRROR(k4) - u[n-k4-1]) * A[k2+1] + (UMIRROR(k4+2) - u[n-k4-3])*A[k2]; } z = v+n4; // step 2 (paper output is w, now u) for (k=k4=k2=0; k < n8; k+=1, k2 += 2, k4+=4) { u[n4+1+k2] = z[1+k2] + v[k2+1]; u[n4+0+k2] = z[0+k2] + v[k2+0]; u[k2+1] = (z[1+k2] - v[k2+1])*A[n2-4-k4] - (z[0+k2]-v[k2+0])*A[n2-3-k4]; u[k2+0] = (z[0+k2] - v[k2+0])*A[n2-4-k4] + (z[1+k2]-v[k2+1])*A[n2-3-k4]; } // step 3 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions e = u; d = v; // now d and e are the two buffers for (l=0; l < ld-3; ++l) { int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1, k1_2=k1>>1; int rlim = n >> (l+4), r4, r,r2; int s2lim = 1 << (l+2), s2,s; for (r=r2=r4=0; r < rlim; r4+=4,++r,r2+=2) { for (s=s2=0; s2 < s2lim; s2+=2,++s) { d[n2-1-k0_2*s2-r2] = e[n2-1-k0_2*s2-r2] + e[n2-1-k0_2*(s2+1)-r2]; d[n2-2-k0_2*s2-r2] = e[n2-2-k0_2*s2-r2] + e[n2-2-k0_2*(s2+1)-r2]; d[n2-1-k0_2*(s2+1)-r2] = (e[n2-1-k0_2*s2-r2] - e[n2-1-k0_2*(s2+1)-r2]) * A[r*k1] - (e[n2-2-k0_2*s2-r2] - e[n2-2-k0_2*(s2+1)-r2]) * A[r*k1+1]; d[n2-2-k0_2*(s2+1)-r2] = (e[n2-2-k0_2*s2-r2] - e[n2-2-k0_2*(s2+1)-r2]) * A[r*k1] + (e[n2-1-k0_2*s2-r2] - e[n2-1-k0_2*(s2+1)-r2]) * A[r*k1+1]; } } // swap the buffers--missing from the paper { float *z = e; e = d; d = z; } } // now we need to bit-reverse, and leave the result in u. // so if it's currently in u, we bit-reverse in place; if // it's currently in v, we bit-reverse by copying. // step 4 (paper output is v, now u) if (e == u) { for (i=0; i < n8; ++i) { int j = bitrev[i]; assert(j < n8); if (i < j) { int j4 = j << 2, i4 = i << 2; #if 1 f4 temp; temp = *(f4 *) (u+j4); *(f4 *)(u+j4) = *(f4 *)(u+i4); *(f4 *)(u+i4) = temp; #else float w,x,y,z; w = u[j4+0]; u[j4+0] = u[i4+0]; u[i4+0] = w; x = u[j4+1]; u[j4+1] = u[i4+1]; u[i4+1] = x; y = u[j4+2]; u[j4+2] = u[i4+2]; u[i4+2] = y; z = u[j4+3]; u[j4+3] = u[i4+3]; u[i4+3] = z; #endif } } } else { for (i=0; i < n8; ++i) { int j = bitrev[i]; assert(j < n8); if (i == j) { // paper bug: this case wasn't included, despite the formulas // show this as a copy, not in-place; but if you're // not in-place, this copy is needed. int i4 = i << 2; #if 1 *(f4 *)(u+i4) = *(f4 *)(v+i4); #else u[i4+0] = v[i4+0]; u[i4+1] = v[i4+1]; u[i4+2] = v[i4+2]; u[i4+3] = v[i4+3]; #endif } else if (i < j) { int i4 = i << 2, j4 = j << 2; #if 1 *(f4 *)(u+j4) = *(f4 *)(v+i4); *(f4 *)(u+i4) = *(f4 *)(v+j4); #else u[j4+0] = v[i4+0], u[i4+0] = v[j4+0]; u[j4+1] = v[i4+1], u[i4+1] = v[j4+1]; u[j4+2] = v[i4+2], u[i4+2] = v[j4+2]; u[j4+3] = v[i4+3], u[i4+3] = v[j4+3]; #endif } } } // step 5 (folded into step 6) // step 6 (paper output is u, now v) for (k2=k4=0; k2 < n4; k2 += 2, k4 += 4) { v[n2-1-k2] = u[k4+0]; v[n2-2-k2] = u[k4+1]; v[n4 - 1 - k2] = u[k4+2]; v[n4 - 2 - k2] = u[k4+3]; } // step 7 (paper output is v, now u) for (k2=0; k2 < n4; k2 += 2) { u[ k2] = ( v[ k2] + v[n2-2-k2] + C[k2+1]*(v[ k2]-v[n2-2-k2]) + C[k2]*(v[k2+1]+v[n2-2-k2+1])); u[n2-2- k2] = ( v[ k2] + v[n2-2-k2] - C[k2+1]*(v[ k2]-v[n2-2-k2]) - C[k2]*(v[k2+1]+v[n2-2-k2+1])); u[ 1+ k2] = ( v[ 1+k2] - v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2 ]-v[n2-2-k2 ])); u[n2-1- k2] = (-v[ 1+k2] + v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2 ]-v[n2-2-k2 ])); } // step 8 (paper output is X, now v) for (k=0; k < n4; ++k) { v[k] = u[k*2 ]*B[k*2 ] + u[k*2+1 ]*B[k*2+1]; v[n2-1-k] = u[k*2 ]*B[k*2+1] - u[k*2+1 ]*B[k*2 ]; } // decode kernel to output // there's a scale value that's been hoisted into the B[] array; that // value was determined experimentally (first making inverse_mdct_slow work, // then making this match that). Weirdly, factor of n is missing! // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?) for (i=0; i < n4 ; ++i) buffer[i] = v[i+n4]; for ( ; i < n3_4; ++i) buffer[i] = -v[n3_4 - i - 1]; for ( ; i < n ; ++i) buffer[i] = -v[i - n3_4]; temp_alloc_restore(f,save_point); }