static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype) { int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; int n3_4 = n - n4, ld; // @OPTIMIZE: reduce register pressure by using fewer variables int save_point = temp_alloc_save(f); float *v = (float *) temp_alloc(f, n * sizeof(*v)); float *u,*d,*e; // twiddle factors float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype]; // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio" // Note there are bugs in that pseudocode, presumably due to them attempting // to rename the arrays nicely rather than representing the way their actual // implementation bounces buffers back and forth. As a result, even in the // "some formulars corrected" version, a direct implementation fails. These // are noted below as "paper bug". // copy and reflect spectral data // paper's u = buffer u = buffer; for (k = n2; k < n ; ++k) u[k] = -u[n - k - 1]; // kernel from paper // step 1 for (k=k2=k4=0; k < n4; k+=1, k2+=2, k4+=4) { v[n-k4-1] = (u[k4] - u[n-k4-1]) * A[k2] - (u[k4+2] - u[n-k4-3])*A[k2+1]; v[n-k4-3] = (u[k4] - u[n-k4-1]) * A[k2+1] + (u[k4+2] - u[n-k4-3])*A[k2]; } // step 2 (paper output is w, now u) for (k=k4=0; k < n8; k+=1, k4+=4) { u[n2+3+k4] = v[n2+3+k4] + v[k4+3]; u[n2+1+k4] = v[n2+1+k4] + v[k4+1]; u[k4+3] = (v[n2+3+k4] - v[k4+3])*A[n2-4-k4] - (v[n2+1+k4]-v[k4+1])*A[n2-3-k4]; u[k4+1] = (v[n2+1+k4] - v[k4+1])*A[n2-4-k4] + (v[n2+3+k4]-v[k4+3])*A[n2-3-k4]; } // step 3 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions e = u; d = v; // now d and e are the two buffers for (l=0; l < ld-3; ++l) { int k0 = n >> (l+2), k1 = 1 << (l+3); int rlim = n >> (l+4), r4, r; int s2lim = 1 << (l+2), s2; for (r=r4=0; r < rlim; r4+=4,++r) { for (s2=0; s2 < s2lim; s2+=2) { d[n-1-k0*s2-r4] = e[n-1-k0*s2-r4] + e[n-1-k0*(s2+1)-r4]; d[n-3-k0*s2-r4] = e[n-3-k0*s2-r4] + e[n-3-k0*(s2+1)-r4]; d[n-1-k0*(s2+1)-r4] = (e[n-1-k0*s2-r4] - e[n-1-k0*(s2+1)-r4]) * A[r*k1] - (e[n-3-k0*s2-r4] - e[n-3-k0*(s2+1)-r4]) * A[r*k1+1]; d[n-3-k0*(s2+1)-r4] = (e[n-3-k0*s2-r4] - e[n-3-k0*(s2+1)-r4]) * A[r*k1] + (e[n-1-k0*s2-r4] - e[n-1-k0*(s2+1)-r4]) * A[r*k1+1]; } } // swap the buffers--missing from the paper { float *z = e; e = d; d = z; } } // now we need to bit-reverse, and leave the result in u. // so if it's currently in u, we bit-reverse in place; if // it's currently in v, we bit-reverse by copying. // step 4 (paper output is v, now u) if (e == u) { for (i=0; i < n8; ++i) { int j = bit_reverse(i) >> (32-ld+3); assert(j < n8); if (i < j) { int i8 = i << 3, j8 = j << 3; float w,x,y,z; w = u[j8+1]; u[j8+1] = u[i8+1]; u[i8+1] = w; x = u[j8+3]; u[j8+3] = u[i8+3]; u[i8+3] = x; y = u[j8+5]; u[j8+5] = u[i8+5]; u[i8+5] = y; z = u[j8+7]; u[j8+7] = u[i8+7]; u[i8+7] = z; } } } else { for (i=0; i < n8; ++i) { int j = bit_reverse(i) >> (32-ld+3); assert(j < n8); if (i == j) { // paper bug: this case wasn't included, despite the formulas // show this as a copy, not in-place; but if you're // not in-place, this copy is needed. int i8 = i << 3; u[i8+1] = v[i8+1]; u[i8+3] = v[i8+3]; u[i8+5] = v[i8+5]; u[i8+7] = v[i8+7]; } else if (i < j) { int i8 = i << 3, j8 = j << 3; u[j8+1] = v[i8+1], u[i8+1] = v[j8 + 1]; u[j8+3] = v[i8+3], u[i8+3] = v[j8 + 3]; u[j8+5] = v[i8+5], u[i8+5] = v[j8 + 5]; u[j8+7] = v[i8+7], u[i8+7] = v[j8 + 7]; } } } // step 5 (folded into step 6) // step 6 (paper output is u, now v) for (k=k2=k4=0; k < n8; ++k, k2 += 2, k4 += 4) { v[n-1-k2] = u[k4*2+1]; v[n-2-k2] = u[k4*2+3]; v[n3_4 - 1 - k2] = u[k4*2+5]; v[n3_4 - 2 - k2] = u[k4*2+7]; } // step 7 (paper output is v, now u) for (k=k2=0; k < n8; ++k, k2 += 2) { u[n2 + k2 ] = ( v[n2 + k2] + v[n-2-k2] + C[k2+1]*(v[n2+k2]-v[n-2-k2]) + C[k2]*(v[n2+k2+1]+v[n-2-k2+1]))/2; u[n-2 - k2] = ( v[n2 + k2] + v[n-2-k2] - C[k2+1]*(v[n2+k2]-v[n-2-k2]) - C[k2]*(v[n2+k2+1]+v[n-2-k2+1]))/2; u[n2+1+ k2] = ( v[n2+1+k2] - v[n-1-k2] + C[k2+1]*(v[n2+1+k2]+v[n-1-k2]) - C[k2]*(v[n2+k2]-v[n-2-k2]))/2; u[n-1 - k2] = (-v[n2+1+k2] + v[n-1-k2] + C[k2+1]*(v[n2+1+k2]+v[n-1-k2]) - C[k2]*(v[n2+k2]-v[n-2-k2]))/2; } // step 8 (paper output is X, now v) for (k=k2=0; k < n4; ++k,k2 += 2) { v[k] = u[k2+n2]*B[k2 ] + u[k2+1+n2]*B[k2+1]; v[n2-1-k] = u[k2+n2]*B[k2+1] - u[k2+1+n2]*B[k2 ]; } // decode kernel to output // there's a scale value that's been hoisted into the B[] array; that // value was determined experimentally (first making inverse_mdct_slow work, // then making this match that). Weirdly, factor of n is missing! // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?) for (i=0; i < n4 ; ++i) buffer[i] = v[i+n4]; for ( ; i < n3_4; ++i) buffer[i] = -v[n3_4 - i - 1]; for ( ; i < n ; ++i) buffer[i] = -v[i - n3_4]; temp_alloc_restore(f,save_point); }