// the following were split out into separate functions while optimizing;
// they could be pushed back up but eh
static void imdct_step3_iter0_loop(int n, float *e, int i_off, int k_off, float *A)
{
   float *ee0 = e + i_off;
   float *ee2 = ee0 + k_off;
   int i;

   for (i=n-1; i >= 0; --i) {
      float k00_20 = ee0[ 0] - ee2[ 0];
      float k01_21 = ee0[-1] - ee2[-1];
      ee0[ 0] = ee0[ 0] + ee2[ 0];
      ee0[-1] = ee0[-1] + ee2[-1];
      ee2[ 0] = k00_20 * A[0] - k01_21 * A[1];
      ee2[-1] = k01_21 * A[0] + k00_20 * A[1];
      A += 8;
      ee0 -= 2;
      ee2 -= 2;
   }
}

static void imdct_step3_inner_r_loop(int lim, float *e, int d0, int k_off, float *A, int k1)
{
   int i;
   float k00_20, k01_21;

   float *e0 = e + d0;
   float *e2 = e0 + k_off;

   for (i=lim; i > 0; i -= 2) {
      k00_20 = e0[-0] - e2[-0];
      k01_21 = e0[-1] - e2[-1];
      e0[-0] = e0[-0] + e2[-0];
      e0[-1] = e0[-1] + e2[-1];
      e2[-0] = (k00_20)*A[0] - (k01_21) * A[1];
      e2[-1] = (k01_21)*A[0] + (k00_20) * A[1];

      A += k1;

      k00_20 = e0[-2] - e2[-2];
      k01_21 = e0[-3] - e2[-3];
      e0[-2] = e0[-2] + e2[-2];
      e0[-3] = e0[-3] + e2[-3];
      e2[-2] = (k00_20)*A[0] - (k01_21) * A[1];
      e2[-3] = (k01_21)*A[0] + (k00_20) * A[1];

      e0 -= 4;
      e2 -= 4;

      A += k1;
   }
}

static void imdct_step3_inner_s_loop(int n, float *e, int i_off, int k_off, float *A, int a_off, int k0)
{
   int i;
   float A0 = A[0];
   float A1 = A[0+1];
   float A2 = A[0+a_off];
   float A3 = A[0+a_off+1];

   float k00,k11;

   float *ee0 = e  +i_off;
   float *ee2 = ee0+k_off;

   for (i=0; i < n; ++i) {
      k00 = ee0[ 0] - ee2[ 0];
      k11 = ee0[-1] - ee2[-1];
      ee0[ 0] =  ee0[ 0] + ee2[ 0];
      ee0[-1] =  ee0[-1] + ee2[-1];
      ee2[ 0] = (k00) * A0 - (k11) * A1;
      ee2[-1] = (k11) * A0 + (k00) * A1;

      k00 = ee0[-2] - ee2[-2];
      k11 = ee0[-3] - ee2[-3];
      ee0[-2] =  ee0[-2] + ee2[-2];
      ee0[-3] =  ee0[-3] + ee2[-3];
      ee2[-2] = (k00) * A2 - (k11) * A3;
      ee2[-3] = (k11) * A2 + (k00) * A3;

      ee0 -= k0;
      ee2 -= k0;
   }
}

typedef struct { float data[4]; } f4;
static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
{
   int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
   int n3_4 = n - n4, ld;
   uint16 *bitrev = f->bit_reverse[blocktype];
   // @OPTIMIZE: reduce register pressure by using fewer variables?
   int save_point = temp_alloc_save(f);
   float *v = (float *) temp_alloc(f, n2 * sizeof(*v));
   float *u,*z;
   // twiddle factors
   float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype];

   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
   // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old'.

   // copy and reflect spectral data
   u = buffer;
   #define UMIRROR(z) (-u[n-(z)-1])

   // kernel from paper
   // step 1, combined with step 0 (omitted above)
   z = v+n2;

   for (k2=k4=0; k2 < n4; k2+=2, k4+=4) {
      float u01 = u[k4+0] - UMIRROR(n-k4-1);
      float u23 = u[k4+2] - UMIRROR(n-k4-3);
      z[-k2-1] = u01 * A[k2]   - u23*A[k2+1];
      z[-k2-2] = u01 * A[k2+1] + u23*A[k2];
   }
   for (; k2 < n2; k2+=2, k4+=4) {
      float u01 = UMIRROR(k4+0) - u[n-k4-1];
      float u23 = UMIRROR(k4+2) - u[n-k4-3];
      z[-k2-1] = u01 * A[k2  ]  - u23*A[k2+1];
      z[-k2-2] = u01 * A[k2+1]  + u23*A[k2];
   }

   // step 2    (paper output is w, now u)
   {
      int n4k2 = n4;
      float *AA = &A[n2-4];
      for (k2=0; n4k2 < n2; k2 += 2) {
         float v41_21 = v[n4k2+1] - v[k2+1];
         float v40_20 = v[n4k2+0] - v[k2+0];
         u[n4k2+1]  = v[n4k2+1] + v[k2+1];
         u[n4k2+0]  = v[n4k2+0] + v[k2+0];
         u[k2+1]    = v41_21*AA[0] - v40_20*AA[1];
         u[k2+0]    = v40_20*AA[0] + v41_21*AA[1];
         AA -= 4;
         n4k2 += 2;
      }
   }

   // step 3
   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions

   // optimized step 3:

   // the original step3 loop can be nested r inside s or s inside r;
   // it's written originally as inside r, but this is dumb when r
   // iterates many times, and s few. So I have two copies of it and
   // switch between them halfway.

   // now, to optimize the s-inside loop:
   //    1. optimize away all the index calculations, using
   //         two separate pointers each for d & e
   //    2. move off the last 'l' iteration to its own copy
   //    3. in the main one, rlim is now always odd; unroll r loop once
   //    4. move the first update of r values after second copy (by updating all references to it)
   //    5. rename all variables from second copy
   //    6. move all variables up into first loop (the point of this strategy is
   //           that they're actually all small deltas away)
   //    7. condense down to the small deltas (now we're updating 8 items per iteration)

   // further split: we split off the first iterationsand the second-to-last iteration
   //    separately (a la item #2 above)

   // this is iteration 0 of step 3
   for (i=0; i < 2; ++i)
      imdct_step3_iter0_loop(n >> 4, u, n2-1-(n>>2)*i, -(n >> 3), A);

   l = 1;
   for (; l < (ld-3)>>1; ++l) {
      int k0 = n >> (l+2), k0_2 = k0>>1;
      int lim = 1 << (l+1);
      for (i=0; i < lim; ++i)
         imdct_step3_inner_r_loop(n >> (l+4), u, n2-1 - k0*i, -k0_2, A, 1 << (l+3));
   }

   for (; l < ld-4; ++l) {
      int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1;
      int rlim = n >> (l+4), r;
      int k_off = -k0_2 * sizeof(float);
      int lim = 1 << (l+1);
      for (r=0; r < rlim;) {
         float *A0 = &A[r*k1];
         imdct_step3_inner_s_loop(lim, u, n2-1-r*2, -k0_2, A0, k1, k0);
         r+=2;
      }
   }

   // this is iteration ld-4 (the ld-3'th iteration) of step #3
   //l = ld-4;
   {
      float A0 = A[0];
      float A1 = A[1];
      float *a = u+n2-1;

      while (a > u) {
         float x02,x13;

         x02 = a[-0] - a[-2];
         x13 = a[-1] - a[-3];
         a[-0] = a[-0] + a[-2];
         a[-1] = a[-1] + a[-3];
         a[-2] = x02 * A0 - x13 * A1;
         a[-3] = x13 * A0 + x02 * A1;

         x02 = a[-4] - a[-6];
         x13 = a[-5] - a[-7];
         a[-4] = a[-4] + a[-6];
         a[-5] = a[-5] + a[-7];
         a[-6] = x02 * A0 - x13 * A1;
         a[-7] = x13 * A0 + x02 * A1;

         a -= 8;
      }
   }

   // output is u

   // step 4, 5, and 6
   for (i=0; i < n8; ++i) {
      int j = bitrev[i];
      int k2 = i << 1, k4 = j << 2;
      v[n2-1-k2] = u[k4+0];
      v[n2-2-k2] = u[k4+1];
      v[n4-1-k2] = u[k4+2];
      v[n4-2-k2] = u[k4+3];
   }
   // (paper output is u, now v)

   // input is v

   // step 7   (paper output is v, now v)
   // this is now in place
   for (k2=0; k2 < n4; k2 += 2) {
      int pp = n2 - k2;
      float vk20 = v[k2+0];
      float vk21 = v[k2+1];
      float vpp1 = v[pp-1];
      float vpp2 = v[pp-2];
      float vv_20_p2 = vk20-vpp2;
      float vv_21_p1 = vk21+vpp1;

      vk20 += vpp2;
      vk21 -= vpp1;

      vpp1 = C[k2+1]*vv_20_p2 + C[k2]*vv_21_p1;
      vpp2 = C[k2+1]*vv_21_p1 - C[k2]*vv_20_p2;

      v[k2+0] = ( vk20 + vpp1);
      v[pp-2] = ( vk20 - vpp1);
      v[k2+1] = ( vk21 + vpp2);
      v[pp-1] = ( vpp2 - vk21);
   }

   // step 8   (paper output is X, now v)
   // this is in-place writing to k*2,k*2+1 instead of k,n2-1-k;
   // the final 'decode kernel' step then fetches from these spots
   // doing this lets us push steps 6+7 into step 5

   for (k=0; k < n4; ++k) {
      float p0 = v[k*2   ]*B[k*2  ] + v[k*2+1]*B[k*2+1]; 
      float p1 = v[k*2   ]*B[k*2+1] - v[k*2+1]*B[k*2  ];
      v[k*2  ] = p0; // was v[k]
      v[k*2+1] = p1; // was v[n2-1-k]
   }

   #define vlow(x)  v[(x)*2]
   #define vhi(x)   v[(n2-1-(x))*2+1]
   //   temp[k*2+1] = v[n2-1-k]
   //   temp[y] = v[x]
   //     y=k*2+1
   //     x=n2-1-k
   //     k=n2-1-x
   //     y=(n2-1-x)*2+1

   // decode kernel to output
   // there's a scale value that's been hoisted into the B[] array; that
   // value was determined experimentally (first making inverse_mdct_slow work,
   // then making this match that). No factor of 1/n.
   for (i=0; i < n4  ; ++i) buffer[i] =  vhi(i+n4);
   for (   ; i < n2  ; ++i) buffer[i] = -vhi(n3_4 - i - 1);
   for (   ; i < n3_4; ++i) buffer[i] = -vlow(n3_4 - i - 1);
   for (   ; i < n   ; ++i) buffer[i] = -vlow(i - n3_4);

   temp_alloc_restore(f,save_point);
   #undef n_2
   #undef n_4
   #undef n_8
}