static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype)
{
   int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l;
   int n3_4 = n - n4, ld;
   // @OPTIMIZE: reduce register pressure by using fewer variables
   int save_point = temp_alloc_save(f);
   float *v = (float *) temp_alloc(f, n * sizeof(*v));
   float *u,*d,*e;
   // twiddle factors
   float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype];

   // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio"
   // Note there are bugs in that pseudocode, presumably due to them attempting
   // to rename the arrays nicely rather than representing the way their actual
   // implementation bounces buffers back and forth. As a result, even in the
   // "some formulars corrected" version, a direct implementation fails. These
   // are noted below as "paper bug".

   // copy and reflect spectral data
   //   paper's u = buffer
   u = buffer;
   for (k = n2; k < n ; ++k) u[k] = -u[n - k - 1];
   // kernel from paper
   // step 1
   for (k=k2=k4=0; k < n4; k+=1, k2+=2, k4+=4) {
      v[n-k4-1] = (u[k4] - u[n-k4-1]) * A[k2]   - (u[k4+2] - u[n-k4-3])*A[k2+1];
      v[n-k4-3] = (u[k4] - u[n-k4-1]) * A[k2+1] + (u[k4+2] - u[n-k4-3])*A[k2];
   }
   // step 2    (paper output is w, now u)
   for (k=k4=0; k < n8; k+=1, k4+=4) {
      u[n2+3+k4] = v[n2+3+k4] + v[k4+3];
      u[n2+1+k4] = v[n2+1+k4] + v[k4+1];
      u[k4+3]    = (v[n2+3+k4] - v[k4+3])*A[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*A[n2-3-k4];
      u[k4+1]    = (v[n2+1+k4] - v[k4+1])*A[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*A[n2-3-k4];
   }
   // step 3
   ld = ilog(n) - 1; // ilog is off-by-one from normal definitions
   e = u;
   d = v;
   // now d and e are the two buffers
   for (l=0; l < ld-3; ++l) {
      int k0 = n >> (l+2), k1 = 1 << (l+3);
      int rlim = n >> (l+4), r4, r;
      int s2lim = 1 << (l+2), s2;
      for (r=r4=0; r < rlim; r4+=4,++r) {
         for (s2=0; s2 < s2lim; s2+=2) {
            d[n-1-k0*s2-r4] = e[n-1-k0*s2-r4] + e[n-1-k0*(s2+1)-r4];
            d[n-3-k0*s2-r4] = e[n-3-k0*s2-r4] + e[n-3-k0*(s2+1)-r4];
            d[n-1-k0*(s2+1)-r4] = (e[n-1-k0*s2-r4] - e[n-1-k0*(s2+1)-r4]) * A[r*k1]
                                - (e[n-3-k0*s2-r4] - e[n-3-k0*(s2+1)-r4]) * A[r*k1+1];
            d[n-3-k0*(s2+1)-r4] = (e[n-3-k0*s2-r4] - e[n-3-k0*(s2+1)-r4]) * A[r*k1]
                                + (e[n-1-k0*s2-r4] - e[n-1-k0*(s2+1)-r4]) * A[r*k1+1];
         }
      }

      // swap the buffers--missing from the paper
      {
         float *z = e;
         e = d;
         d = z;
      }
   }

   // now we need to bit-reverse, and leave the result in u.
   // so if it's currently in u, we bit-reverse in place; if
   // it's currently in v, we bit-reverse by copying.

   // step 4   (paper output is v, now u)
   if (e == u) {
      for (i=0; i < n8; ++i) {
         int j = bit_reverse(i) >> (32-ld+3);
         assert(j < n8);
         if (i < j) {
            int i8 = i << 3, j8 = j << 3;
            float w,x,y,z;
            w = u[j8+1]; u[j8+1] = u[i8+1]; u[i8+1] = w;
            x = u[j8+3]; u[j8+3] = u[i8+3]; u[i8+3] = x;
            y = u[j8+5]; u[j8+5] = u[i8+5]; u[i8+5] = y;
            z = u[j8+7]; u[j8+7] = u[i8+7]; u[i8+7] = z;
         }
      }
   } else {
      for (i=0; i < n8; ++i) {
         int j = bit_reverse(i) >> (32-ld+3);
         assert(j < n8);
         if (i == j) {
            // paper bug: this case wasn't included, despite the formulas
            //            show this as a copy, not in-place; but if you're
            //            not in-place, this copy is needed.
            int i8 = i << 3;
            u[i8+1] = v[i8+1];
            u[i8+3] = v[i8+3];
            u[i8+5] = v[i8+5];
            u[i8+7] = v[i8+7];
         } else if (i < j) {
            int i8 = i << 3, j8 = j << 3;
            u[j8+1] = v[i8+1], u[i8+1] = v[j8 + 1];
            u[j8+3] = v[i8+3], u[i8+3] = v[j8 + 3];
            u[j8+5] = v[i8+5], u[i8+5] = v[j8 + 5];
            u[j8+7] = v[i8+7], u[i8+7] = v[j8 + 7];
         }
      }
   }

   // step 5   (folded into step 6)
   // step 6   (paper output is u, now v)
   for (k=k2=k4=0; k < n8; ++k, k2 += 2, k4 += 4) {
      v[n-1-k2] = u[k4*2+1];
      v[n-2-k2] = u[k4*2+3];
      v[n3_4 - 1 - k2] = u[k4*2+5];
      v[n3_4 - 2 - k2] = u[k4*2+7];
   }
   // step 7   (paper output is v, now u)
   for (k=k2=0; k < n8; ++k, k2 += 2) {
      u[n2 + k2 ] = ( v[n2 + k2] + v[n-2-k2] + C[k2+1]*(v[n2+k2]-v[n-2-k2]) + C[k2]*(v[n2+k2+1]+v[n-2-k2+1]))/2;
      u[n-2 - k2] = ( v[n2 + k2] + v[n-2-k2] - C[k2+1]*(v[n2+k2]-v[n-2-k2]) - C[k2]*(v[n2+k2+1]+v[n-2-k2+1]))/2;
      u[n2+1+ k2] = ( v[n2+1+k2] - v[n-1-k2] + C[k2+1]*(v[n2+1+k2]+v[n-1-k2]) - C[k2]*(v[n2+k2]-v[n-2-k2]))/2;
      u[n-1 - k2] = (-v[n2+1+k2] + v[n-1-k2] + C[k2+1]*(v[n2+1+k2]+v[n-1-k2]) - C[k2]*(v[n2+k2]-v[n-2-k2]))/2;
   }
   // step 8   (paper output is X, now v)
   for (k=k2=0; k < n4; ++k,k2 += 2) {
      v[k]      = u[k2+n2]*B[k2  ] + u[k2+1+n2]*B[k2+1];
      v[n2-1-k] = u[k2+n2]*B[k2+1] - u[k2+1+n2]*B[k2  ];
   }

   // decode kernel to output
   // there's a scale value that's been hoisted into the B[] array; that
   // value was determined experimentally (first making inverse_mdct_slow work,
   // then making this match that). Weirdly, factor of n is missing!
   // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?)
   for (i=0; i < n4  ; ++i) buffer[i] = v[i+n4];
   for (   ; i < n3_4; ++i) buffer[i] = -v[n3_4 - i - 1];
   for (   ; i < n   ; ++i) buffer[i] = -v[i - n3_4];

   temp_alloc_restore(f,save_point);
}