// the following were split out into separate functions while debugging; // they could be pushed back up but eh static void imdct_step3_iter0_loop(int n, float *d, float *e, int i_off, int k_off, float *A) { float *dd0 = d + i_off; float *dd2 = dd0 + k_off; float *ee0 = e + i_off; float *ee2 = ee0 + k_off; int i; for (i=n-1; i >= 0; --i) { float k00_20 = ee0[ 0] - ee2[ 0]; float k01_21 = ee0[-1] - ee2[-1]; dd0[ 0] = ee0[ 0] + ee2[ 0]; dd0[-1] = ee0[-1] + ee2[-1]; dd2[ 0] = k00_20 * A[0] - k01_21 * A[1]; dd2[-1] = k01_21 * A[0] + k00_20 * A[1]; A += 8; dd0 -= 2; dd2 -= 2; ee0 -= 2; ee2 -= 2; } } static void imdct_step3_inner_r_loop(int lim, float *d, float *e, int d0, int k_off, float *AA, int k1) { int i; int d2 = d0 + k_off; float k00_20, k01_21; for (i=lim; i > 0; i -= 2) { k00_20 = e[d0-0] - e[d2-0]; k01_21 = e[d0-1] - e[d2-1]; d[d0 ] = e[d0-0] + e[d2-0]; d[d0-1] = e[d0-1] + e[d2-1]; d[d2 ] = (k00_20)*AA[0] - (k01_21) * AA[1]; d[d2-1] = (k01_21)*AA[0] + (k00_20) * AA[1]; AA += k1; k00_20 = e[d0-2] - e[d2-2]; k01_21 = e[d0-3] - e[d2-3]; d[d0-2] = e[d0-2] + e[d2-2]; d[d0-3] = e[d0-3] + e[d2-3]; d[d2-2] = (k00_20)*AA[0] - (k01_21) * AA[1]; d[d2-3] = (k01_21)*AA[0] + (k00_20) * AA[1]; d0 -= 4; d2 -= 4; AA += k1; } } static void imdct_step3_inner_s_loop(int n, float *d, float *e, int i_off, int k_off, float *A, int a_off, int k0) { int i; float A0 = A[0]; float A1 = A[0+1]; float A2 = A[0+a_off]; float A3 = A[0+a_off+1]; float k00,k11; // with VC6, it didn't work to try to just use two pointers // plus one offset, whether I stored the offset premultiplied // by sizeof(float) or not; the best result was using four // pointers, from which VC6 notices the parallelism and reduces // it to only 3(!) plus an offset. float *dd0 = d +i_off; float *ee0 = e +i_off; float *dd2 = dd0+k_off; float *ee2 = ee0+k_off; for (i=0; i < n; ++i) { k00 = ee0[ 0] - ee2[ 0]; k11 = ee0[-1] - ee2[-1]; dd0[ 0] = ee0[ 0] + ee2[ 0]; dd0[-1] = ee0[-1] + ee2[-1]; dd2[ 0] = (k00) * A0 - (k11) * A1; dd2[-1] = (k11) * A0 + (k00) * A1; k00 = ee0[-2] - ee2[-2]; k11 = ee0[-3] - ee2[-3]; dd0[-2] = ee0[-2] + ee2[-2]; dd0[-3] = ee0[-3] + ee2[-3]; dd2[-2] = (k00) * A2 - (k11) * A3; dd2[-3] = (k11) * A2 + (k00) * A3; dd0 -= k0; dd2 -= k0; ee0 -= k0; ee2 -= k0; } } typedef struct { float data[4]; } f4; static void inverse_mdct(float *buffer, int n, vorb *f, int blocktype) { int i,k,k2,k4, n2 = n >> 1, n4 = n >> 2, n8 = n >> 3, l; int n3_4 = n - n4, ld; uint16 *bitrev = f->bit_reverse[blocktype]; // @OPTIMIZE: reduce register pressure by using fewer variables? int save_point = temp_alloc_save(f); float *v = (float *) temp_alloc(f, n2 * sizeof(*v)); float *u,*d,*e,*z; // twiddle factors float *A = f->A[blocktype], *B = f->B[blocktype], *C = f->C[blocktype]; // IMDCT algorithm from "The use of multirate filter banks for coding of high quality digital audio" // See notes about bugs in that paper in less-optimal implementation 'inverse_mdct_old'. // copy and reflect spectral data u = buffer; #define UMIRROR(z) (-u[n-(z)-1]) // kernel from paper // step 1 z = v+n2; #define n_2 n2 #define n_4 n4 #define n_8 n8 for (k2=k4=0; k2 < n4; k2+=2, k4+=4) { z[-k2-1] = (u[k4] - UMIRROR(n-k4-1)) * A[k2] - (u[k4+2] - UMIRROR(n-k4-3))*A[k2+1]; z[-k2-2] = (u[k4] - UMIRROR(n-k4-1)) * A[k2+1] + (u[k4+2] - UMIRROR(n-k4-3))*A[k2]; } for (; k2 < n_2; k2+=2, k4+=4) { z[-k2-1] = (UMIRROR(k4) - u[n-k4-1]) * A[k2] - (UMIRROR(k4+2) - u[n-k4-3])*A[k2+1]; z[-k2-2] = (UMIRROR(k4) - u[n-k4-1]) * A[k2+1] + (UMIRROR(k4+2) - u[n-k4-3])*A[k2]; } z = v+n4; // step 2 (paper output is w, now u) for (k4=k2=0; k2 < n_4; k2 += 2, k4+=4) { u[n4+1+k2] = z[1+k2] + v[k2+1]; u[n4+0+k2] = z[0+k2] + v[k2+0]; u[k2+1] = (z[1+k2] - v[k2+1])*A[n2-4-k4] - (z[0+k2]-v[k2+0])*A[n2-3-k4]; u[k2+0] = (z[0+k2] - v[k2+0])*A[n2-4-k4] + (z[1+k2]-v[k2+1])*A[n2-3-k4]; } // step 3 ld = ilog(n) - 1; // ilog is off-by-one from normal definitions e = u; d = v; // now d and e are the two buffers #define SWAP_d_AND_e() { float *z = e; e = d; d = z; } // optimized step 3: // the original step3 loop can be nested r inside s or s inside r; // it's written originally as inside r, but this is dumb when r // iterates many times, and s few. So I have two copies of it and // switch between them halfway. // now, to optimize the s-inside loop: // 1. optimize away all the index calculations, using // two separate pointers each for d & e // 2. move off the last 'l' iteration to its own copy // 3. in the main one, rlim is now always odd; unroll r loop once // 4. move the first update of r values after second copy (by updating all references to it) // 5. rename all variables from second copy // 6. move all variables up into first loop (the point of this strategy is // that they're actually all small deltas away) // 7. condense down to the small deltas (now we're updating 8 items per iteration) // further split: we split off the first iterationsand the second-to-last iteration // separately (a la item #2 above) // this is iteration 0 of step 3 for (i=0; i < 2; ++i) imdct_step3_iter0_loop(n >> 4, d, e, n2-1-(n>>2)*i, -(n >> 3), A); SWAP_d_AND_e(); l = 1; for (; l < (ld-3)>>1; ++l) { int k0 = n >> (l+2), k0_2 = k0>>1; int lim = 1 << (l+1); for (i=0; i < lim; ++i) imdct_step3_inner_r_loop(n >> (l+4), d, e, n2-1 - k0*i, -k0_2, A, 1 << (l+3)); SWAP_d_AND_e(); } for (; l < ld-4; ++l) { int k0 = n >> (l+2), k1 = 1 << (l+3), k0_2 = k0>>1; int rlim = n >> (l+4), r; int k_off = -k0_2 * sizeof(float); int lim = 1 << (l+1); for (r=0; r < rlim;) { float *A0 = &A[r*k1]; imdct_step3_inner_s_loop(lim, d, e, n2-1-r*2, -k0_2, A0, k1, k0); r+=2; } SWAP_d_AND_e(); } // this is iteration ld-4 (the ld-3'th iteration) of step #3 l = ld-4; { float A0 = A[0]; float A1 = A[1]; float *b = d+n2-1; float *a = e+n2-1; while (b > d) { b[ 0] = a[ 0] + a[-2]; b[-1] = a[-1] + a[-3]; b[-2] = (a[ 0] - a[-2]) * A0 - (a[-1] - a[-3]) * A1; b[-3] = (a[-1] - a[-3]) * A0 + (a[ 0] - a[-2]) * A1; b[-4] = a[-4] + a[-6]; b[-5] = a[-5] + a[-7]; b[-6] = (a[-4] - a[-6]) * A0 - (a[-5] - a[-7]) * A1; b[-7] = (a[-5] - a[-7]) * A0 + (a[-4] - a[-6]) * A1; b -= 8; a -= 8; } } SWAP_d_AND_e(); // now we need to bit-reverse, and leave the result in u. // so if it's currently in u, we bit-reverse in place; if // it's currently in v, we bit-reverse by copying. // step 4 (paper output is v, now u) if (e == u) { for (i=0; i < n8; ++i) { int j = bitrev[i]; assert(j < n8); if (i < j) { int j4 = j << 2, i4 = i << 2; f4 temp; temp = *(f4 *) (u+j4); *(f4 *)(u+j4) = *(f4 *)(u+i4); *(f4 *)(u+i4) = temp; } } } else { for (i=0; i < n8; ++i) { int j = bitrev[i]; assert(j < n8); if (i == j) { // paper bug: this case wasn't included, despite the formulas // show this as a copy, not in-place; but if you're // not in-place, this copy is needed. int i4 = i << 2; *(f4 *)(u+i4) = *(f4 *)(v+i4); } else if (i < j) { int i4 = i << 2, j4 = j << 2; *(f4 *)(u+j4) = *(f4 *)(v+i4); *(f4 *)(u+i4) = *(f4 *)(v+j4); } } } // step 5 (folded into step 6) // step 6 (paper output is u, now v) for (k2=k4=0; k2 < n4; k2 += 2, k4 += 4) { v[n2-1-k2] = u[k4+0]; v[n2-2-k2] = u[k4+1]; v[n4-1-k2] = u[k4+2]; v[n4-2-k2] = u[k4+3]; } // step 7 (paper output is v, now u) for (k2=0; k2 < n4; k2 += 2) { u[ k2] = ( v[ k2] + v[n2-2-k2] + C[k2+1]*(v[ k2]-v[n2-2-k2]) + C[k2]*(v[k2+1]+v[n2-2-k2+1])); u[n2-2- k2] = ( v[ k2] + v[n2-2-k2] - C[k2+1]*(v[ k2]-v[n2-2-k2]) - C[k2]*(v[k2+1]+v[n2-2-k2+1])); u[ 1+ k2] = ( v[ 1+k2] - v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2 ]-v[n2-2-k2 ])); u[n2-1- k2] = (-v[ 1+k2] + v[n2-1-k2] + C[k2+1]*(v[1+k2]+v[n2-1-k2]) - C[k2]*(v[k2 ]-v[n2-2-k2 ])); } // step 8 (paper output is X, now v) for (k=0; k < n4; ++k) { v[k] = u[k*2 ]*B[k*2 ] + u[k*2+1 ]*B[k*2+1]; v[n2-1-k] = u[k*2 ]*B[k*2+1] - u[k*2+1 ]*B[k*2 ]; } // decode kernel to output // there's a scale value that's been hoisted into the B[] array; that // value was determined experimentally (first making inverse_mdct_slow work, // then making this match that). Weirdly, factor of n is missing! // (probably vorbis encoder premultiplies by n or n/2, to save it on the decoder?) for (i=0; i < n4 ; ++i) buffer[i] = v[i+n4]; for ( ; i < n3_4; ++i) buffer[i] = -v[n3_4 - i - 1]; for ( ; i < n ; ++i) buffer[i] = -v[i - n3_4]; temp_alloc_restore(f,save_point); #undef n_2 #undef n_4 #undef n_8 }