From f3070d2e718e2b1687abfaba2f167978e88e01cc Mon Sep 17 00:00:00 2001
From: Thomas Schmitt <scdbackup@gmx.net>
Date: Mon, 19 Oct 2009 11:56:25 +0000
Subject: [PATCH] Optimizations with parity computation, clarification about
 nature of logarithms

---
 cdrskin/cdrskin_timestamp.h |   2 +-
 libburn/ecma130ab.c         | 147 ++++++++++++++++++++++++++----------
 2 files changed, 108 insertions(+), 41 deletions(-)

diff --git a/cdrskin/cdrskin_timestamp.h b/cdrskin/cdrskin_timestamp.h
index 71a0247..deeee30 100644
--- a/cdrskin/cdrskin_timestamp.h
+++ b/cdrskin/cdrskin_timestamp.h
@@ -1 +1 @@
-#define Cdrskin_timestamP "2009.10.17.131852"
+#define Cdrskin_timestamP "2009.10.19.115722"
diff --git a/libburn/ecma130ab.c b/libburn/ecma130ab.c
index 1368c13..ed80e19 100644
--- a/libburn/ecma130ab.c
+++ b/libburn/ecma130ab.c
@@ -76,7 +76,9 @@
    Multiplication and division would demand polynomial division, e.g. by the
    euclidian algorithm. The computing path over logarithms and powers follows
    algebra and allows to reduce the arithmetic task to table lookups, additions
-   modulo 255, and exor operations.
+   modulo 255, and exor operations. Note that the logarithms are natural
+   numbers, not polynomials. They get added or subtracted by the usual addition
+   (not by exor) and their polynomial power depends on their value modulo 255.
 
    Needed are a logarithm table and a power table (or inverse logarithm table)
    for Galois Field GF(2^8) which will serve to perform the peculiar
@@ -136,9 +138,15 @@
 
 /* Power and logarithm tables for GF(2^8).
    Generated by burn_rspc_setup_tables() and burn_rspc_print_tables().
+
+   The highest possible sum of gflog[] values is 510. So the table gfpow[]
+   with period 255 was manually unrolled to 511 elements to avoid one modulo
+   255 operation in burn_rspc_mult().
+   Idea by D. Hugh Redelmeier.
+   
 */
 
-static unsigned char gfpow[256] = {
+static unsigned char gfpow[511] = {
 	  1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 
 	116, 232, 205, 135,  19,  38,  76, 152,  45,  90, 
 	180, 117, 234, 201, 143,   3,   6,  12,  24,  48, 
@@ -164,7 +172,34 @@ static unsigned char gfpow[256] = {
 	172,  69, 138,   9,  18,  36,  72, 144,  61, 122, 
 	244, 245, 247, 243, 251, 235, 203, 139,  11,  22, 
 	 44,  88, 176, 125, 250, 233, 207, 131,  27,  54, 
-	108, 216, 173,  71, 142
+	108, 216, 173,  71, 142,
+	  1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 
+	116, 232, 205, 135,  19,  38,  76, 152,  45,  90, 
+	180, 117, 234, 201, 143,   3,   6,  12,  24,  48, 
+	 96, 192, 157,  39,  78, 156,  37,  74, 148,  53, 
+	106, 212, 181, 119, 238, 193, 159,  35,  70, 140, 
+	  5,  10,  20,  40,  80, 160,  93, 186, 105, 210, 
+	185, 111, 222, 161,  95, 190,  97, 194, 153,  47, 
+	 94, 188, 101, 202, 137,  15,  30,  60, 120, 240, 
+	253, 231, 211, 187, 107, 214, 177, 127, 254, 225, 
+	223, 163,  91, 182, 113, 226, 217, 175,  67, 134, 
+	 17,  34,  68, 136,  13,  26,  52, 104, 208, 189, 
+	103, 206, 129,  31,  62, 124, 248, 237, 199, 147, 
+	 59, 118, 236, 197, 151,  51, 102, 204, 133,  23, 
+	 46,  92, 184, 109, 218, 169,  79, 158,  33,  66, 
+	132,  21,  42,  84, 168,  77, 154,  41,  82, 164, 
+	 85, 170,  73, 146,  57, 114, 228, 213, 183, 115, 
+	230, 209, 191,  99, 198, 145,  63, 126, 252, 229, 
+	215, 179, 123, 246, 241, 255, 227, 219, 171,  75, 
+	150,  49,  98, 196, 149,  55, 110, 220, 165,  87, 
+	174,  65, 130,  25,  50, 100, 200, 141,   7,  14, 
+	 28,  56, 112, 224, 221, 167,  83, 166,  81, 162, 
+	 89, 178, 121, 242, 249, 239, 195, 155,  43,  86, 
+	172,  69, 138,   9,  18,  36,  72, 144,  61, 122, 
+	244, 245, 247, 243, 251, 235, 203, 139,  11,  22, 
+	 44,  88, 176, 125, 250, 233, 207, 131,  27,  54, 
+	108, 216, 173,  71, 142,
+	  1
 };
 
 static unsigned char gflog[256] = {
@@ -443,49 +478,55 @@ static unsigned char ecma_130_annex_b[2340] = {
 
 
 /* This is the new implementation of P- and Q-parity generation.
-   It is totally unoptimized and thus needs about 50 percent more time than the
-   old implementation (both with gcc -O2 on AMD 64 bit). Measurements indicate
-   that about 400 MIPS are needed for 48x CD speed (7.1 MB/s).
+   It needs about the same computing time as the old implementation (both
+   with gcc -O2 on AMD 64 bit). Measurements indicate that about 280 MIPS
+   are needed for 48x CD speed (7.1 MB/s).
 */
 
 static unsigned char burn_rspc_mult(unsigned char a, unsigned char b)
 {
 	if (a == 0 || b == 0)
 		return 0;
-        return gfpow[(gflog[a] + gflog[b]) % 255];
+        return gfpow[gflog[a] + gflog[b]];
+	/* % 255 not necessary because gfpow is unrolled up to index 510 */
 }
 
 
-static unsigned char burn_rspc_div(unsigned char a, unsigned char b)
+/* Divides by polynomial 0x03. Derived from burn_rspc_div() */
+static unsigned char burn_rspc_div_3(unsigned char a)
 {
-	int d;
-
 	if (a == 0)
 		return 0;
-	if (b == 0)
-		return -1;
-	d = gflog[a] - gflog[b];
-	if (d < 0)
-		d += 255;
-	return gfpow[d];
+	if (gflog[a] >= 25)
+		return gfpow[gflog[a] - 25];
+	else
+		return gfpow[230 + gflog[a]];
 }
 
 
-static int burn_rspc_p0p1(unsigned char *sector, int col, int msb,
-                          unsigned char *p0, unsigned char *p1)
+static int burn_rspc_p0p1(unsigned char *sector, int col, 
+                          unsigned char *p0_lsb, unsigned char *p0_msb,
+                          unsigned char *p1_lsb, unsigned char *p1_msb)
 {
 	unsigned char *start, b;
-	unsigned int i, sum_v = 0, hxv = 0;
+	unsigned int i, sum_v_lsb = 0, sum_v_msb = 0;
+	unsigned int hxv_lsb = 0, hxv_msb = 0;
 
-	start = sector + 12 + 2 * col + !!msb;
+	start = sector + 12 + 2 * col;
 	for(i = 0; i < 24; i++) {
-		b = start[i * 86];
-		sum_v ^= b;
-		hxv ^= burn_rspc_mult(b, gfpow[25 - i]);
+		b = *start;
+		sum_v_lsb ^= b;
+		hxv_lsb ^= burn_rspc_mult(b, gfpow[25 - i]);
+		b = *(start + 1);
+		sum_v_msb ^= b;
+		hxv_msb ^= burn_rspc_mult(b, gfpow[25 - i]);
+		start += 86;
 	}
-	*p0 = burn_rspc_div(burn_rspc_mult(gfpow[1], sum_v) ^ hxv,
-					   3); /* gfpow[1] ^ gfpow[0]); */
-	*p1 = sum_v ^ *p0;
+	*p0_lsb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_lsb) ^ hxv_lsb);
+				/* 2 = gfpow[1] , 3 = gfpow[1] ^ gfpow[0] */
+	*p1_lsb = sum_v_lsb ^ *p0_lsb;
+	*p0_msb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_msb) ^ hxv_msb);
+	*p1_msb = sum_v_msb ^ *p0_msb;
 	return 1;
 }
 
@@ -497,8 +538,7 @@ int burn_rspc_parity_p(unsigned char *sector)
 
 	/* Loop over P columns */
 	for(i = 0; i < 43; i++) {
-		burn_rspc_p0p1(sector, i, 0, &p0_lsb, &p1_lsb);
-		burn_rspc_p0p1(sector, i, 1, &p0_msb, &p1_msb);
+		burn_rspc_p0p1(sector, i, &p0_lsb, &p0_msb, &p1_lsb, &p1_msb);
 		sector[2162 + 2 * i]     =  p0_lsb;
 		sector[2162 + 2 * i + 1] =  p0_msb;
 		sector[2076 + 2 * i]     =  p1_lsb;
@@ -519,21 +559,30 @@ int burn_rspc_parity_p(unsigned char *sector)
 }
 
 
-static int burn_rspc_q0q1(unsigned char *sector, int diag, int msb,
-                          unsigned char *q0, unsigned char *q1)
+static int burn_rspc_q0q1(unsigned char *sector, int diag,
+                          unsigned char *q0_lsb, unsigned char *q0_msb,
+                          unsigned char *q1_lsb, unsigned char *q1_msb)
 {
 	unsigned char *start, b;
-	unsigned int i, sum_v = 0, hxv = 0;
+	unsigned int i, idx, sum_v_lsb = 0, sum_v_msb = 0;
+	unsigned int hxv_lsb = 0, hxv_msb = 0, lsb_start;
 
 	start = sector + 12;
+	lsb_start = 2 * 43 * diag;
 	for(i = 0; i < 43; i++) {
-		b = start[(2 * 43 * diag + i * 88 + !!msb) % 2236];
-		sum_v ^= b;
-		hxv ^= burn_rspc_mult(b, gfpow[44 - i]);
+		idx = (lsb_start + i * 88) % 2236;
+		b = start[idx];
+		sum_v_lsb ^= b;
+		hxv_lsb ^= burn_rspc_mult(b, gfpow[44 - i]);
+		b = start[idx + 1];
+		sum_v_msb ^= b;
+		hxv_msb ^= burn_rspc_mult(b, gfpow[44 - i]);
 	}
-	*q0 = burn_rspc_div(burn_rspc_mult(gfpow[1], sum_v) ^ hxv,
-					   3); /* gfpow[1] ^ gfpow[0]); */
-	*q1 = sum_v ^ *q0;
+	*q0_lsb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_lsb) ^ hxv_lsb);
+				/* 2 = gfpow[1] ; 3 = gfpow[1] ^ gfpow[0] */
+	*q1_lsb = sum_v_lsb ^ *q0_lsb;
+	*q0_msb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_msb) ^ hxv_msb);
+	*q1_msb = sum_v_msb ^ *q0_msb;
 	return 1;
 }
 
@@ -545,8 +594,7 @@ int burn_rspc_parity_q(unsigned char *sector)
 
 	/* Loop over Q diagonals */
 	for(i = 0; i < 26; i++) {
-		burn_rspc_q0q1(sector, i, 0, &q0_lsb, &q1_lsb);
-		burn_rspc_q0q1(sector, i, 1, &q0_msb, &q1_msb);
+		burn_rspc_q0q1(sector, i, &q0_lsb, &q0_msb, &q1_lsb, &q1_msb);
 		sector[2300 + 2 * i]     =  q0_lsb;
 		sector[2300 + 2 * i + 1] =  q0_msb;
 		sector[2248 + 2 * i]     =  q1_lsb;
@@ -624,7 +672,7 @@ static int burn_rspc_print_tables(void)
 {
  int i;
 
- printf("static unsigned char gfpow[256] = {");
+ printf("static unsigned char gfpow[255] = {");
  printf("\n\t"); 
  for(i= 0; i < 255; i++) {
    printf("%3u, ", gfpow[i]);
@@ -697,5 +745,24 @@ static int print_ecma_130_scrambler(void)
 	return 1;
 }
 
+
+/* This is a general polynomial division function.
+   burn_rspc_div_3() has been derived from this by setting b to constant 3.
+*/
+static unsigned char burn_rspc_div(unsigned char a, unsigned char b)
+{
+	int d;
+
+	if (a == 0)
+		return 0;
+	if (b == 0)
+		return -1;
+	d = gflog[a] - gflog[b];
+	if (d < 0)
+		d += 255;
+	return gfpow[d];
+}
+
+
 #endif /* Libburn_with_lec_generatoR */