typedef struct { float data[4]; } f4;
static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
{
   int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
   int n3_4 = n - n4, ld;
   uint16 *bitrev = f->bit_reverse[blocktype];
   // @OPTIMIZE: reduce register pressure by using fewer variables?
   int save_point = temp_alloc_save(f);
   float *v = (float *) temp_alloc(f, n2 * sizeof(*v));
   float *u,*d,*e,*z;
   // twiddle factors
   float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype];

   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
   // Note there are bugs in that pseudocode, presumably due to them attempting
   // to rename the arrays nicely rather than representing the way their actual
   // implementation bounces buffers back and forth. As a result, even in the
   // "some formulars corrected" version, a direct implementation fails. These
   // are noted below as "paper bug".

   // copy and reflect spectral data
   u = buffer;
   #define UMIRROR(z) (-u[n-(z)-1])

   // kernel from paper
   // step 1
   z = v+n2;
   for (k=k2=k4=0; k2 < n4; k+=1, k2+=2, k4+=4) {
      z[-k2-1] = (u[k4] - UMIRROR(n-k4-1)) * A[k2]   - (u[k4+2] - UMIRROR(n-k4-3))*A[k2+1];
      z[-k2-2] = (u[k4] - UMIRROR(n-k4-1)) * A[k2+1] + (u[k4+2] - UMIRROR(n-k4-3))*A[k2];
   }
   for (; k < n4; k+=1, k2+=2, k4+=4) {
      z[-k2-1] = (UMIRROR(k4) - u[n-k4-1]) * A[k2]   - (UMIRROR(k4+2) - u[n-k4-3])*A[k2+1];
      z[-k2-2] = (UMIRROR(k4) - u[n-k4-1]) * A[k2+1] + (UMIRROR(k4+2) - u[n-k4-3])*A[k2];
   }

   z = v+n4;
   // step 2    (paper output is w, now u)
   for (k=k4=k2=0; k < n8; k+=1, k2 += 2, k4+=4) {
      u[n4+1+k2] = z[1+k2] + v[k2+1];
      u[n4+0+k2] = z[0+k2] + v[k2+0];
      u[k2+1]    = (z[1+k2] - v[k2+1])*A[n2-4-k4] - (z[0+k2]-v[k2+0])*A[n2-3-k4];
      u[k2+0]    = (z[0+k2] - v[k2+0])*A[n2-4-k4] + (z[1+k2]-v[k2+1])*A[n2-3-k4];
   }
   // step 3
   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
   e = u;
   d = v;

   // now d and e are the two buffers
   for (l=0; l < ld-3; ++l) {
      int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1, k1_2=k1>>1;
      int rlim = n >> (l+4), r4, r,r2;
      int s2lim = 1 << (l+2), s2,s;
      for (r=r2=r4=0; r < rlim; r4+=4,++r,r2+=2) {
         for (s=s2=0; s2 < s2lim; s2+=2,++s) {
            d[n2-1-k0_2*s2-r2] = e[n2-1-k0_2*s2-r2] + e[n2-1-k0_2*(s2+1)-r2];
            d[n2-2-k0_2*s2-r2] = e[n2-2-k0_2*s2-r2] + e[n2-2-k0_2*(s2+1)-r2];
            d[n2-1-k0_2*(s2+1)-r2] = (e[n2-1-k0_2*s2-r2] - e[n2-1-k0_2*(s2+1)-r2]) * A[r*k1]
                                - (e[n2-2-k0_2*s2-r2] - e[n2-2-k0_2*(s2+1)-r2]) * A[r*k1+1];
            d[n2-2-k0_2*(s2+1)-r2] = (e[n2-2-k0_2*s2-r2] - e[n2-2-k0_2*(s2+1)-r2]) * A[r*k1]
                                + (e[n2-1-k0_2*s2-r2] - e[n2-1-k0_2*(s2+1)-r2]) * A[r*k1+1];
         }
      }

      // swap the buffers--missing from the paper
      {
         float *z = e;
         e = d;
         d = z;
      }
   }

   // now we need to bit-reverse, and leave the result in u.
   // so if it's currently in u, we bit-reverse in place; if
   // it's currently in v, we bit-reverse by copying.

   // step 4   (paper output is v, now u)
   if (e == u) {
      for (i=0; i < n8; ++i) {
         int j = bitrev[i];
         assert(j < n8);
         if (i < j) {
            int j4 = j << 2, i4 = i << 2;
            #if 1
            f4 temp;
            temp = *(f4 *) (u+j4);
            *(f4 *)(u+j4) = *(f4 *)(u+i4);
            *(f4 *)(u+i4) = temp;
            #else
            float w,x,y,z;
            w = u[j4+0]; u[j4+0] = u[i4+0]; u[i4+0] = w;
            x = u[j4+1]; u[j4+1] = u[i4+1]; u[i4+1] = x;
            y = u[j4+2]; u[j4+2] = u[i4+2]; u[i4+2] = y;
            z = u[j4+3]; u[j4+3] = u[i4+3]; u[i4+3] = z;
            #endif
         }
      }
   } else {
      for (i=0; i < n8; ++i) {
         int j = bitrev[i];
         assert(j < n8);
         if (i == j) {
            // paper bug: this case wasn't included, despite the formulas
            //            show this as a copy, not in-place; but if you're
            //            not in-place, this copy is needed.
            int i4 = i << 2;
            #if 1
            *(f4 *)(u+i4) = *(f4 *)(v+i4);
            #else
            u[i4+0] = v[i4+0];
            u[i4+1] = v[i4+1];
            u[i4+2] = v[i4+2];
            u[i4+3] = v[i4+3];
            #endif
         } else if (i < j) {
            int i4 = i << 2, j4 = j << 2;
            #if 1
            *(f4 *)(u+j4) = *(f4 *)(v+i4);
            *(f4 *)(u+i4) = *(f4 *)(v+j4);
            #else
            u[j4+0] = v[i4+0], u[i4+0] = v[j4+0];
            u[j4+1] = v[i4+1], u[i4+1] = v[j4+1];
            u[j4+2] = v[i4+2], u[i4+2] = v[j4+2];
            u[j4+3] = v[i4+3], u[i4+3] = v[j4+3];
            #endif
         }
      }
   }

   // step 5   (folded into step 6)
   // step 6   (paper output is u, now v)
   for (k2=k4=0; k2 < n4; k2 += 2, k4 += 4) {
      v[n2-1-k2] = u[k4+0];
      v[n2-2-k2] = u[k4+1];
      v[n4 - 1 - k2] = u[k4+2];
      v[n4 - 2 - k2] = u[k4+3];
   }
   // step 7   (paper output is v, now u)
   for (k2=0; k2 < n4; k2 += 2) {
      u[      k2] = ( v[     k2] + v[n2-2-k2] + C[k2+1]*(v[  k2]-v[n2-2-k2]) + C[k2]*(v[k2+1]+v[n2-2-k2+1]));
      u[n2-2- k2] = ( v[     k2] + v[n2-2-k2] - C[k2+1]*(v[  k2]-v[n2-2-k2]) - C[k2]*(v[k2+1]+v[n2-2-k2+1]));
      u[   1+ k2] = ( v[   1+k2] - v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2  ]-v[n2-2-k2  ]));
      u[n2-1- k2] = (-v[   1+k2] + v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2  ]-v[n2-2-k2  ]));
   }
   // step 8   (paper output is X, now v)
   for (k=0; k < n4; ++k) {
      v[k]      = u[k*2   ]*B[k*2  ] + u[k*2+1   ]*B[k*2+1];
      v[n2-1-k] = u[k*2   ]*B[k*2+1] - u[k*2+1   ]*B[k*2  ];
   }

   // decode kernel to output
   // there's a scale value that's been hoisted into the B[] array; that
   // value was determined experimentally (first making inverse_mdct_slow work,
   // then making this match that). Weirdly, factor of n is missing!
   // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?)
   for (i=0; i < n4  ; ++i) buffer[i] = v[i+n4];
   for (   ; i < n3_4; ++i) buffer[i] = -v[n3_4 - i - 1];
   for (   ; i < n   ; ++i) buffer[i] = -v[i - n3_4];

   temp_alloc_restore(f,save_point);
}