More optimizations with parity computation

2009-10-20 16:14:14 +00:00
parent f3070d2e71
commit e646686145
4 changed files with 121 additions and 39 deletions
--- a/cdrskin/cdrskin_timestamp.h
+++ b/cdrskin/cdrskin_timestamp.h
@@ -1 +1 @@
-#define Cdrskin_timestamP "2009.10.19.115722"
+#define Cdrskin_timestamP "2009.10.20.160131"
--- a/libburn/ecma130ab.c
+++ b/libburn/ecma130ab.c
@@ -116,6 +116,10 @@
   Comparing the output of both alternatives with the old scrambler output
   lets 15 bit win for now.

+   So the prescription is to start with 15 bit value 1, to use the lowest bit
+   as output, to shift the bits down by one, to exor the output bit with the
+   next lowest bit, and to put that exor result into bit 14 of the register.
+
   -------------------------------------------------------------------------


@@ -136,17 +140,17 @@
 /* ------------------------------------------------------------------------- */


-/* Power and logarithm tables for GF(2^8).
+/* Power and logarithm tables for GF(2^8), parity matrices for ECMA-130.
   Generated by burn_rspc_setup_tables() and burn_rspc_print_tables().

-   The highest possible sum of gflog[] values is is 510. So the table gfpow[]
-   with period 255 was manually unrolled to 511 elements to avoid one modulo
+   The highest possible sum of gflog[] values is 508. So the table gfpow[]
+   with period 255 was manually unrolled to 509 elements to avoid one modulo
   255 operation in burn_rspc_mult().
-   Idea by D. Hugh Redelmeier.
+   Proposed by D. Hugh Redelmeier.
   
 */

-static unsigned char gfpow[511] = {
+static unsigned char gfpow[509] = {
 	  1,   2,   4,   8,  16,  32,  64, 128,  29,  58, 
 	116, 232, 205, 135,  19,  38,  76, 152,  45,  90, 
 	180, 117, 234, 201, 143,   3,   6,  12,  24,  48, 
@@ -198,8 +202,7 @@ static unsigned char gfpow[511] = {
 	172,  69, 138,   9,  18,  36,  72, 144,  61, 122, 
 	244, 245, 247, 243, 251, 235, 203, 139,  11,  22, 
 	 44,  88, 176, 125, 250, 233, 207, 131,  27,  54, 
-	108, 216, 173,  71, 142,
-	  1
+	108, 216, 173,  71,
 };

 static unsigned char gflog[256] = {
@@ -232,6 +235,39 @@ static unsigned char gflog[256] = {
 };


+#define Libburn_use_h_matriceS 1
+
+#ifdef Libburn_use_h_matriceS
+
+/* On my AMD 2x64 bit 3000 MHz processor h[i] costs about 7 % more time
+   than using gfpow[25-i] resp. gfpow[44-i]. I blame this on the more
+   condensed data representation which slightly increases the rate of cache
+   hits.
+   Nevertheless this effect is very likely depending on the exact cache
+   size and architecture. In general, using h[] saves more than 8000
+   subtractions per sector.
+*/
+
+/* Parity matrices H as prescribed by ECMA-130 Annex A.
+   Actually just the start pieces of gfpow[] in reversed order.
+*/
+static unsigned char h26[26] = {
+           3, 143, 201, 234, 117, 180,  90,  45, 152,  76,
+          38,  19, 135, 205, 232, 116,  58,  29, 128,  64,
+          32,  16,   8,   4,   2,   1,
+};
+
+static unsigned char h45[45] = {
+         238, 119, 181, 212, 106,  53, 148,  74,  37, 156,
+          78,  39, 157, 192,  96,  48,  24,  12,   6,   3,
+         143, 201, 234, 117, 180,  90,  45, 152,  76,  38,
+          19, 135, 205, 232, 116,  58,  29, 128,  64,  32,
+          16,   8,   4,   2,   1,
+};
+
+#endif /* Libburn_use_h_matriceS */
+
+
 /* Pseudo-random bytes which of course are exactly the same as with the
   previously used code.
   Generated by function print_ecma_130_scrambler().
@@ -487,24 +523,28 @@ static unsigned char burn_rspc_mult(unsigned char a, unsigned char b)
 {
 	if (a == 0 || b == 0)
 		return 0;
+	/* Optimization of (a == 0 || b == 0) by D. Hugh Redelmeier
+	if((((int)a - 1) | ((int)b - 1)) < 0)
+		return 0;
+	*/
+
        return gfpow[gflog[a] + gflog[b]];
 	/* % 255 not necessary because gfpow is unrolled up to index 510 */
 }


-/* Divides by polynomial 0x03. Derived from burn_rspc_div() */
+/* Divide by polynomial 0x03. Derived from burn_rspc_div() and using the
+   unrolled size of the gfpow[] array.
+*/
 static unsigned char burn_rspc_div_3(unsigned char a)
 {
 	if (a == 0)
 		return 0;
-	if (gflog[a] >= 25)
-		return gfpow[gflog[a] - 25];
-	else
-		return gfpow[230 + gflog[a]];
+	return gfpow[230 + gflog[a]];
 }


-static int burn_rspc_p0p1(unsigned char *sector, int col, 
+static void burn_rspc_p0p1(unsigned char *sector, int col, 
                          unsigned char *p0_lsb, unsigned char *p0_msb,
                          unsigned char *p1_lsb, unsigned char *p1_msb)
 {
@@ -516,22 +556,34 @@ static int burn_rspc_p0p1(unsigned char *sector, int col,
 	for(i = 0; i < 24; i++) {
 		b = *start;
 		sum_v_lsb ^= b;
+
+#ifdef Libburn_use_h_matriceS
+		hxv_lsb ^= burn_rspc_mult(b, h26[i]);
+#else
 		hxv_lsb ^= burn_rspc_mult(b, gfpow[25 - i]);
+#endif
+
 		b = *(start + 1);
 		sum_v_msb ^= b;
+
+#ifdef Libburn_use_h_matriceS
+		hxv_msb ^= burn_rspc_mult(b, h26[i]);
+#else
 		hxv_msb ^= burn_rspc_mult(b, gfpow[25 - i]);
+#endif
+
 		start += 86;
 	}
+
+				/* 3 = gfpow[1] ^ gfpow[0] , 2 = gfpow[1] */
 	*p0_lsb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_lsb) ^ hxv_lsb);
-				/* 2 = gfpow[1] , 3 = gfpow[1] ^ gfpow[0]); */
-	*p1_lsb = sum_v_lsb ^ *p0_lsb;
 	*p0_msb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_msb) ^ hxv_msb);
+	*p1_lsb = sum_v_lsb ^ *p0_lsb;
 	*p1_msb = sum_v_msb ^ *p0_msb;
-	return 1;
 }


-int burn_rspc_parity_p(unsigned char *sector)
+void burn_rspc_parity_p(unsigned char *sector)
 {
 	int i;
 	unsigned char p0_lsb, p0_msb, p1_lsb, p1_msb;
@@ -555,39 +607,51 @@ int burn_rspc_parity_p(unsigned char *sector)
 #endif /* Libburn_with_lec_generatoR */

 	}
-	return 1 ;
 }


-static int burn_rspc_q0q1(unsigned char *sector, int diag,
+static void burn_rspc_q0q1(unsigned char *sector, int diag,
                          unsigned char *q0_lsb, unsigned char *q0_msb,
                          unsigned char *q1_lsb, unsigned char *q1_msb)
 {
 	unsigned char *start, b;
 	unsigned int i, idx, sum_v_lsb = 0, sum_v_msb = 0;
-	unsigned int hxv_lsb = 0, hxv_msb = 0, lsb_start;
+	unsigned int hxv_lsb = 0, hxv_msb = 0;

 	start = sector + 12;
-	lsb_start = 2 * 43 * diag;
+	idx = 2 * 43 * diag;
 	for(i = 0; i < 43; i++) {
-		idx = (lsb_start + i * 88) % 2236;
+		if (idx >= 2236)
+			idx -= 2236;
 		b = start[idx];
 		sum_v_lsb ^= b;
+
+#ifdef Libburn_use_h_matriceS
+		hxv_lsb ^= burn_rspc_mult(b, h45[i]);
+#else
 		hxv_lsb ^= burn_rspc_mult(b, gfpow[44 - i]);
+#endif
+
 		b = start[idx + 1];
 		sum_v_msb ^= b;
+
+#ifdef Libburn_use_h_matriceS
+		hxv_msb ^= burn_rspc_mult(b, h45[i]);
+#else
 		hxv_msb ^= burn_rspc_mult(b, gfpow[44 - i]);
+#endif
+
+		idx += 88;
 	}
+				/* 3 = gfpow[1] ^ gfpow[0] , 2 = gfpow[1] */
 	*q0_lsb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_lsb) ^ hxv_lsb);
-				/* 2 = gfpow[1] ; 3 = gfpow[1] ^ gfpow[0]); */
-	*q1_lsb = sum_v_lsb ^ *q0_lsb;
 	*q0_msb = burn_rspc_div_3(burn_rspc_mult(2, sum_v_msb) ^ hxv_msb);
+	*q1_lsb = sum_v_lsb ^ *q0_lsb;
 	*q1_msb = sum_v_msb ^ *q0_msb;
-	return 1;
 }


-int burn_rspc_parity_q(unsigned char *sector)
+void burn_rspc_parity_q(unsigned char *sector)
 {
 	int i;
 	unsigned char q0_lsb, q0_msb, q1_lsb, q1_msb;
@@ -611,7 +675,6 @@ int burn_rspc_parity_q(unsigned char *sector)
 #endif /* Libburn_with_lec_generatoR */

 	}
-	return 1;
 }

 /* ------------------------------------------------------------------------- */
@@ -622,7 +685,7 @@ int burn_rspc_parity_q(unsigned char *sector)
   Measurements indicate that about 50 MIPS are needed for 48x CD speed.
 */

-int burn_ecma130_scramble(unsigned char *sector) 
+void burn_ecma130_scramble(unsigned char *sector) 
 {
        int i;
 	unsigned char *s;
@@ -630,7 +693,6 @@ int burn_ecma130_scramble(unsigned char *sector)
 	s = sector + 12;
        for (i = 0; i < 2340; i++)
                s[i] ^= ecma_130_annex_b[i];
-	return 1;
 }


@@ -667,6 +729,7 @@ static int burn_rspc_setup_tables(void)

 /* This function printed the content of gflog[] and gfpow[] as C code
   and compared the content with the tables of the old implementation.
+   h26[] and h45[] are reversed order copies of the first 26 and 45 elements of gfpow[].
 */
 static int burn_rspc_print_tables(void)
 {
@@ -700,16 +763,31 @@ static int burn_rspc_print_tables(void)
   if((i % 10) == 9)
     printf("\n\t"); 
 }
- printf("\n};\n");
+ printf("\n};\n\n");
+
+ printf("static unsigned char h26[26] = {");
+ printf("\n\t");
+ for(i= 0; i < 26; i++) {
+   printf(" %3u,", gfpow[25 - i]);
+   if((i % 10) == 9)
+     printf("\n\t"); 
+ }
+ printf("\n};\n\n");
+
+ printf("static unsigned char h45[45] = {");
+ printf("\n\t");
+ for(i= 0; i < 45; i++) {
+   printf(" %3u,",gfpow[44 - i]);
+   if((i % 10) == 9)
+     printf("\n\t"); 
+ }
+ printf("\n};\n\n");

 return 0;
 }


-/* This code was used to generate the content of array ecma_130_annex_b[]
-   It implements the prescription to use the lowest bit as output, to shift
-   the bits down by one, to exor the output bit with the next lowest bit,
-   and to put that exor result into bit 14 of the register.
+/* This code was used to generate the content of array ecma_130_annex_b[].
 */
 static unsigned short ecma_130_fsr = 1;

@@ -746,6 +824,8 @@ static int print_ecma_130_scrambler(void)
 }


+#ifdef Libburn_with_general_rspc_diV
+
 /* This is a general polynomial division function.
   burn_rspc_div_3() has been derived from this by setting b to constant 3.
 */
@@ -763,6 +843,8 @@ static unsigned char burn_rspc_div(unsigned char a, unsigned char b)
 	return gfpow[d];
 }

+#endif /* Libburn_with_general_rspc_diV */
+

 #endif /* Libburn_with_lec_generatoR */

--- a/libburn/ecma130ab.h
+++ b/libburn/ecma130ab.h
@@ -13,11 +13,11 @@
 #ifndef Libburn_ecma130ab_includeD
 #define Libburn_ecma130ab_includeD 1

-int burn_rspc_parity_p(unsigned char *sector);
+void burn_rspc_parity_p(unsigned char *sector);

-int burn_rspc_parity_q(unsigned char *sector);
+void burn_rspc_parity_q(unsigned char *sector);

-int burn_ecma130_scramble(unsigned char *sector);
+void burn_ecma130_scramble(unsigned char *sector);

 #endif /* ! Libburn_ecma130ab_includeD */

--- a/libburn/libburn.h
+++ b/libburn/libburn.h
@@ -148,7 +148,7 @@ enum burn_write_types
 	               if this mode is attempted.
 	               @since 0.7.2
 	    ts A91016: Re-implemented according to ECMA-130 Annex A and B.
-	               Slower but understood and explained.
+	               Now understood, explained and not stemming from cdrdao.
 	               @since 0.7.4
 	*/
 	BURN_WRITE_RAW,