// the following were split out into separate functions while optimizing; // they could be pushed back up but eh. __forceinline showed no change; // they're probably already being inlined. static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A) { float *ee0 = e + i_off; float *ee2 = ee0 + k_off; int i; assert((n & 3) == 0); for (i=(n>>2); i > 0; --i) { float k00_20, k01_21; k00_20 = ee0[ 0] - ee2[ 0]; k01_21 = ee0[-1] - ee2[-1]; ee0[ 0] += ee2[ 0];//ee0[ 0] = ee0[ 0] + ee2[ 0]; ee0[-1] += ee2[-1];//ee0[-1] = ee0[-1] + ee2[-1]; ee2[ 0] = k00_20 * A[0] - k01_21 * A[1]; ee2[-1] = k01_21 * A[0] + k00_20 * A[1]; A += 8; k00_20 = ee0[-2] - ee2[-2]; k01_21 = ee0[-3] - ee2[-3]; ee0[-2] += ee2[-2];//ee0[-2] = ee0[-2] + ee2[-2]; ee0[-3] += ee2[-3];//ee0[-3] = ee0[-3] + ee2[-3]; ee2[-2] = k00_20 * A[0] - k01_21 * A[1]; ee2[-3] = k01_21 * A[0] + k00_20 * A[1]; A += 8; k00_20 = ee0[-4] - ee2[-4]; k01_21 = ee0[-5] - ee2[-5]; ee0[-4] += ee2[-4];//ee0[-4] = ee0[-4] + ee2[-4]; ee0[-5] += ee2[-5];//ee0[-5] = ee0[-5] + ee2[-5]; ee2[-4] = k00_20 * A[0] - k01_21 * A[1]; ee2[-5] = k01_21 * A[0] + k00_20 * A[1]; A += 8; k00_20 = ee0[-6] - ee2[-6]; k01_21 = ee0[-7] - ee2[-7]; ee0[-6] += ee2[-6];//ee0[-6] = ee0[-6] + ee2[-6]; ee0[-7] += ee2[-7];//ee0[-7] = ee0[-7] + ee2[-7]; ee2[-6] = k00_20 * A[0] - k01_21 * A[1]; ee2[-7] = k01_21 * A[0] + k00_20 * A[1]; A += 8; ee0 -= 8; ee2 -= 8; } } static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1) { int i; float k00_20, k01_21; float *e0 = e + d0; float *e2 = e0 + k_off; for (i=lim >> 2; i > 0; --i) { k00_20 = e0[-0] - e2[-0]; k01_21 = e0[-1] - e2[-1]; e0[-0] += e2[-0];//e0[-0] = e0[-0] + e2[-0]; e0[-1] += e2[-1];//e0[-1] = e0[-1] + e2[-1]; e2[-0] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-1] = (k01_21)*A[0] + (k00_20) * A[1]; A += k1; k00_20 = e0[-2] - e2[-2]; k01_21 = e0[-3] - e2[-3]; e0[-2] += e2[-2];//e0[-2] = e0[-2] + e2[-2]; e0[-3] += e2[-3];//e0[-3] = e0[-3] + e2[-3]; e2[-2] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-3] = (k01_21)*A[0] + (k00_20) * A[1]; A += k1; k00_20 = e0[-4] - e2[-4]; k01_21 = e0[-5] - e2[-5]; e0[-4] += e2[-4];//e0[-4] = e0[-4] + e2[-4]; e0[-5] += e2[-5];//e0[-5] = e0[-5] + e2[-5]; e2[-4] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-5] = (k01_21)*A[0] + (k00_20) * A[1]; A += k1; k00_20 = e0[-6] - e2[-6]; k01_21 = e0[-7] - e2[-7]; e0[-6] += e2[-6];//e0[-6] = e0[-6] + e2[-6]; e0[-7] += e2[-7];//e0[-7] = e0[-7] + e2[-7]; e2[-6] = (k00_20)*A[0] - (k01_21) * A[1]; e2[-7] = (k01_21)*A[0] + (k00_20) * A[1]; e0 -= 8; e2 -= 8; A += k1; } } static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0) { int i; float A0 = A[0]; float A1 = A[0+1]; float A2 = A[0+a_off]; float A3 = A[0+a_off+1]; float A4 = A[0+a_off*2+0]; float A5 = A[0+a_off*2+1]; float A6 = A[0+a_off*3+0]; float A7 = A[0+a_off*3+1]; float k00,k11; float *ee0 = e +i_off; float *ee2 = ee0+k_off; for (i=n; i > 0; --i) { k00 = ee0[ 0] - ee2[ 0]; k11 = ee0[-1] - ee2[-1]; ee0[ 0] = ee0[ 0] + ee2[ 0]; ee0[-1] = ee0[-1] + ee2[-1]; ee2[ 0] = (k00) * A0 - (k11) * A1; ee2[-1] = (k11) * A0 + (k00) * A1; k00 = ee0[-2] - ee2[-2]; k11 = ee0[-3] - ee2[-3]; ee0[-2] = ee0[-2] + ee2[-2]; ee0[-3] = ee0[-3] + ee2[-3]; ee2[-2] = (k00) * A2 - (k11) * A3; ee2[-3] = (k11) * A2 + (k00) * A3; k00 = ee0[-4] - ee2[-4]; k11 = ee0[-5] - ee2[-5]; ee0[-4] = ee0[-4] + ee2[-4]; ee0[-5] = ee0[-5] + ee2[-5]; ee2[-4] = (k00) * A4 - (k11) * A5; ee2[-5] = (k11) * A4 + (k00) * A5; k00 = ee0[-6] - ee2[-6]; k11 = ee0[-7] - ee2[-7]; ee0[-6] = ee0[-6] + ee2[-6]; ee0[-7] = ee0[-7] + ee2[-7]; ee2[-6] = (k00) * A6 - (k11) * A7; ee2[-7] = (k11) * A6 + (k00) * A7; ee0 -= k0; ee2 -= k0; } } static __forceinline void iter_54(float *z) { float k00,k11,k22,k33; float y0,y1,y2,y3; k00 = z[ 0] - z[-4]; y0 = z[ 0] + z[-4]; y2 = z[-2] + z[-6]; k22 = z[-2] - z[-6]; z[-0] = y0 + y2; // z0 + z4 + z2 + z6 z[-2] = y0 - y2; // z0 + z4 - z2 - z6 // done with y0,y2 k33 = z[-3] - z[-7]; z[-4] = k00 + k33; // z0 - z4 + z3 - z7 z[-6] = k00 - k33; // z0 - z4 - z3 + z7 // done with k33 k11 = z[-1] - z[-5]; y1 = z[-1] + z[-5]; y3 = z[-3] + z[-7]; z[-1] = y1 + y3; // z1 + z5 + z3 + z7 z[-3] = y1 - y3; // z1 + z5 - z3 - z7 z[-5] = k11 - k22; // z1 - z5 + z2 - z6 z[-7] = k11 + k22; // z1 - z5 - z2 + z6 } static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A, int base_n) { int k_off = -8; int a_off = base_n >> 3; float A2 = A[0+a_off]; float *z = e + i_off; float *base = z - 16 * n; while (z > base) { float k00,k11; k00 = z[-0] - z[-8]; k11 = z[-1] - z[-9]; z[-0] = z[-0] + z[-8]; z[-1] = z[-1] + z[-9]; z[-8] = k00; z[-9] = k11 ; k00 = z[ -2] - z[-10]; k11 = z[ -3] - z[-11]; z[ -2] = z[ -2] + z[-10]; z[ -3] = z[ -3] + z[-11]; z[-10] = (k00+k11) * A2; z[-11] = (k11-k00) * A2; k00 = z[-12] - z[ -4]; // reverse to avoid a unary negation k11 = z[ -5] - z[-13]; z[ -4] = z[ -4] + z[-12]; z[ -5] = z[ -5] + z[-13]; z[-12] = k11; z[-13] = k00; k00 = z[-14] - z[ -6]; // reverse to avoid a unary negation k11 = z[ -7] - z[-15]; z[ -6] = z[ -6] + z[-14]; z[ -7] = z[ -7] + z[-15]; z[-14] = (k00+k11) * A2; z[-15] = (k00-k11) * A2; iter_54(z); iter_54(z-8); z -= 16; } } typedef struct { float data[4]; } f4; static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype) { int i, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; int n3_4 = n - n4, ld; uint16 *bitrev = f->bit_reverse[blocktype]; // @OPTIMIZE: reduce register pressure by using fewer variables? int save_point = temp_alloc_save(f); float *v = (float *) temp_alloc(f, n2 * sizeof(*v)); float *u; // twiddle factors float *A = f->A[blocktype]; // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio" // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old'. // copy and reflect spectral data u = buffer; #define UMIRROR(z) (-u[n-(z)-1]) // kernel from paper // step 1, combined with step 0 (omitted above) // note that it turns out that the items added together during // this step are, in fact, being added to themselves (as reflected // by step 0). inexplicable inefficiency! // so there's a missing 'times 2' here. this propogates through // linearly to the end, where the numbers are 1/2 too small... { float *d,*e, *AA, *e_stop; d = &v[n2-2]; AA = A; e = &u[0]; e_stop = &u[n2]; while (e != e_stop) { d[1] = (e[0] * AA[0] - e[2]*AA[1]); d[0] = (e[0] * AA[1] + e[2]*AA[0]); d -= 2; AA += 2; e += 4; } e = &u[n2-3]; while (d >= v) { d[1] = (-e[2] * AA[0] - -e[0]*AA[1]); d[0] = (-e[2] * AA[1] + -e[0]*AA[0]); d -= 2; AA += 2; e -= 4; } } // step 2 (paper output is w, now u) { int n4k2 = n4; float *AA = &A[n2-4]; //float *d, *e; //int off; int k2=0; for (; n4k2 < n2;) { float v40_20, v41_21; v41_21 = v[n4k2+1] - v[k2+1]; v40_20 = v[n4k2+0] - v[k2+0]; u[n4k2+1] = v[n4k2+1] + v[k2+1]; u[n4k2+0] = v[n4k2+0] + v[k2+0]; u[k2+1] = v41_21*AA[0] - v40_20*AA[1]; u[k2+0] = v40_20*AA[0] + v41_21*AA[1]; v41_21 = v[n4k2+3] - v[k2+3]; v40_20 = v[n4k2+2] - v[k2+2]; u[n4k2+3] = v[n4k2+3] + v[k2+3]; u[n4k2+2] = v[n4k2+2] + v[k2+2]; u[k2+3] = v41_21*AA[2] - v40_20*AA[3]; u[k2+2] = v40_20*AA[2] + v41_21*AA[3]; AA -= 8; n4k2 += 4; k2 += 4; } } // step 3 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions // optimized step 3: // the original step3 loop can be nested r inside s or s inside r; // it's written originally as inside r, but this is dumb when r // iterates many times, and s few. So I have two copies of it and // switch between them halfway. // this is iteration 0 of step 3 imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*0, -(n >> 3), A); imdct_step3_iter0_loop(n >> 4, u, n2-1-n4*1, -(n >> 3), A); // this is iteration 1 of step 3 imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*0, -(n >> 4), A, 16); imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*1, -(n >> 4), A, 16); imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*2, -(n >> 4), A, 16); imdct_step3_inner_r_loop(n >> 5, u, n2-1 - n8*3, -(n >> 4), A, 16); l=2; for (; l < (ld-3)>>1; ++l) { int k0 = n >> (l+2), k0_2 = k0>>1; int lim = 1 << (l+1); for (i=0; i < lim; ++i) imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3)); } for (; l < ld-6; ++l) { int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1; int rlim = n >> (l+4), r; int lim = 1 << (l+1); assert(rlim >= 4); for (r=0; r < rlim;) { float *A0 = &A[r*k1]; imdct_step3_inner_s_loop(lim, u, n2-1-r*2, -k0_2, A0, k1, k0); r+=4; } } // iterations with count: // ld=6,5,4 all interleaved together // the big win comes from getting rid of needless flops // due to the constants on pass 5 & 4 being all 1 and 0; // combining them to be simultaneous to improve cache made little difference imdct_step3_inner_s_loop_ld654(n >> 5, u, n2-1, A, n); // output is u // step 4, 5, and 6 for (i=0; i < n8; ++i) { int j = bitrev[i]; int k2 = i << 1, k4 = j << 2; v[n2-k2-1] = u[k4+0]; v[n2-k2-2] = u[k4+1]; v[n4-k2-1] = u[k4+2]; v[n4-k2-2] = u[k4+3]; } // (paper output is u, now v) // input is v // step 7 (paper output is v, now v) // this is now in place { float *C = f->C[blocktype]; float *d, *e; d = v; e = v + n2 - 2; while (d < e) { float a02 = d[0] - e[0]; float a11 = d[1] + e[1]; float b0 = C[1]*a02 + C[0]*a11; float b1 = C[1]*a11 - C[0]*a02; float b2 = d[0] + e[ 0]; float b3 = d[1] - e[ 1]; d[ 0] = b2 + b0; d[ 1] = b3 + b1; e[ 0] = b2 - b0; e[ 1] = b1 - b3; C += 2; d += 2; e -= 2; } } // step 8 (paper output is X, now v) // this is in-place writing to k*2,k*2+1 instead of k,n2-1-k; // the final 'decode kernel' step then fetches from these spots // doing this lets us push steps 5+6 into step 4 { float *B = f->B[blocktype] + n2 - 4; float *d = v + n2 - 4; while (d >= v) { float p0,p1; p0 = d[0]*B[0] + d[1]*B[1]; p1 = d[0]*B[1] - d[1]*B[0]; d[0] = p0; d[1] = p1; p0 = d[2]*B[2] + d[3]*B[3]; p1 = d[2]*B[3] - d[3]*B[2]; d[2] = p0; d[3] = p1; B -= 4; d -= 4; } } // decode kernel to output // there's a scale value that's been hoisted into the B[] array; that // value was determined experimentally (first making inverse_mdct_slow work, // then making this match that). No factor of 1/n. It used to have a factor // of 1/2, but that cancelled out with the factor of 2 deleted from the step 1. { float *d0,*d1,*d2,*d3 = buffer; float *e; e = &v[n2-2]; d0 = &buffer[0]; d1 = &buffer[n2-1]; d2 = &buffer[n2]; d3 = &buffer[n-1]; while (e >= v) { //for (i=0; i < n4; ++i, e-=2) { *d0++ = e[1]; *d1-- = - e[1]; *d2++ = - e[0]; *d3-- = - e[0]; e -= 2; } } temp_alloc_restore(f,save_point); #undef n_2 #undef n_4 #undef n_8 }