Rewrite of fixed_compute_best_predictor functions

The code of fixed_compute_best_predictor was presumably once optimized,
but it appears a much more readable form is now equally fast or even faster;
see https://github.com/xiph/flac/pull/337 for details.
diff --git a/src/libFLAC/fixed.c b/src/libFLAC/fixed.c
index f6dc9ac..9e53f29 100644
--- a/src/libFLAC/fixed.c
+++ b/src/libFLAC/fixed.c
@@ -219,21 +219,39 @@
 uint32_t FLAC__fixed_compute_best_predictor(const FLAC__int32 data[], uint32_t data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #endif
 {
+	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
+	uint32_t order;
+#if 0
+	/* This code has been around a long time, and was written when compilers weren't able
+	 * to vectorize code. These days, compilers are better in optimizing the next block
+	 * which is also much more readable
+	 */
 	FLAC__int32 last_error_0 = data[-1];
 	FLAC__int32 last_error_1 = data[-1] - data[-2];
 	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
 	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
 	FLAC__int32 error, save;
-	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
-	uint32_t i, order;
-
-	for(i = 0; i < data_len; i++) {
+	/* total_error_* are 64-bits to avoid overflow when encoding
+	 * erratic signals when the bits-per-sample and blocksize are
+	 * large.
+	 */
+	for(uint32_t i = 0; i < data_len; i++) {
 		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
 		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
 		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
 		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
 	}
+#else
+	for(int i = 0; i < (int)data_len; i++) {
+		total_error_0 += local_abs(data[i]);
+		total_error_1 += local_abs(data[i] - data[i-1]);
+		total_error_2 += local_abs(data[i] - 2 * data[i-1] + data[i-2]);
+		total_error_3 += local_abs(data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3]);
+		total_error_4 += local_abs(data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4]);
+	}
+#endif
+
 
 	/* prefer lower order */
 	if(total_error_0 <= flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
@@ -278,24 +296,15 @@
 uint32_t FLAC__fixed_compute_best_predictor_wide(const FLAC__int32 data[], uint32_t data_len, FLAC__fixedpoint residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
 #endif
 {
-	FLAC__int32 last_error_0 = data[-1];
-	FLAC__int32 last_error_1 = data[-1] - data[-2];
-	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
-	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
-	FLAC__int32 error, save;
-	/* total_error_* are 64-bits to avoid overflow when encoding
-	 * erratic signals when the bits-per-sample and blocksize are
-	 * large.
-	 */
 	FLAC__uint64 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
-	uint32_t i, order;
+	uint32_t order;
 
-	for(i = 0; i < data_len; i++) {
-		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
-		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
-		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
-		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
-		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
+	for(int i = 0; i < (int)data_len; i++) {
+		total_error_0 += local_abs(data[i]);
+		total_error_1 += local_abs(data[i] - data[i-1]);
+		total_error_2 += local_abs(data[i] - 2 * data[i-1] + data[i-2]);
+		total_error_3 += local_abs(data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3]);
+		total_error_4 += local_abs(data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4]);
 	}
 
 	/* prefer lower order */