	/*
	 * Butterfly pass over data[], read here as interleaved
	 * (real, imaginary) pairs: data[k] is combined with data[k + 1]
	 * in a complex multiply.
	 *
	 * For each even offset m in [0, mmax) and each i = m, m + 2*mmax,
	 * ..., with j = i + mmax:
	 *     temp    = (wr + I*wi) * (data[j] + I*data[j+1])
	 *     data[j] = data[i] - temp
	 *     data[i] = data[i] + temp
	 * After all i for a given m, (wr, wi) is multiplied by (wpr, wpi).
	 *
	 * NOTE(review): this looks like one stage of a radix-2 FFT with
	 * (wr, wi) as the twiddle factor; confirm against the enclosing
	 * function, which also sets up n, mmax, wr, wi, wpr and wpi.
	 */
	for (m = 0; m < mmax; m += 2) {
	    j = m + mmax;
	    /* Prime the pipeline: fetch the first data[j] pair and start
	     * its product with (wr, wi); the product is finished at the
	     * top of the loop body (or in the epilogue below). */
	    tempr = (float) wr *(data1r = data[j]);
	    tempi = (float) wi *(data1i = data[j + 1]);
	    for (i = m; i < n - mmax * 2; i += mmax * 2) {
		/* mixed precision not significantly more
		 * accurate here; if removing float casts,
		 * tempr and tempi should be double */
		/* Finish the complex product for the current j:
		 * tempr = wr*re - wi*im, tempi = wr*im + wi*re. */
		tempr -= tempi;
		tempi = (float) wr *data1i + (float) wi *data1r;
		/* don't expect compiler to analyze j > i+1 */
		/* Hence the next pair is loaded by hand before the
		 * stores to data[] below. */
		data1r = data[j + mmax * 2];
		data1i = data[j + mmax * 2 + 1];
		/* Butterfly: data[i] gets old + temp, data[j] gets
		 * old - temp, where old is data[i] before this update
		 * (saved in datar/datai). */
		data[i] = (datar = data[i]) + tempr;
		data[i + 1] = (datai = data[i + 1]) + tempi;
		data[j] = datar - tempr;
		data[j + 1] = datai - tempi;
		/* Start the product for the pair just loaded. */
		tempr = (float) wr *data1r;
		tempi = (float) wi *data1i;
		j += mmax * 2;
	    }
	    /* Epilogue: the last butterfly for this m, identical to the
	     * loop body but without the look-ahead load of
	     * data[j + mmax * 2].  Presumably that index would lie past
	     * the end of the data for the final pair -- confirm against
	     * how the caller sizes data[] relative to n. */
	    tempr -= tempi;
	    tempi = (float) wr *data1i + (float) wi *data1r;
	    data[i] = (datar = data[i]) + tempr;
	    data[i + 1] = (datai = data[i + 1]) + tempi;
	    data[j] = datar - tempr;
	    data[j + 1] = datai - tempi;
	    /* Advance the factor: (wr, wi) *= (wpr, wpi) as a complex
	     * multiply; wtemp keeps the old wr for the wi update. */
	    wr = (wtemp = wr) * wpr - wi * wpi;
	    wi = wtemp * wpi + wi * wpr;
	}
