From: "H. Peter Anvin" <hpa@zytor.com>

As expected, when it hit mainline I started getting real bug reports... 
the attached patch does the following:

- Fixes a few x86-64 specific bugs;

- Removes MMX and SSE-1 from x86-64 (if we have x86-64 we have SSE-2);

- Slightly astracts the integer code to make it easier to add
  architecture-specific optimizations later (e.g.  IA64 once gcc gets
  better IA64 intrinsics support);

- Resurrects the user-space testbench, and makes it not output the known
  false positive of the D+Q case (D+Q is equivalent to a RAID-5 recovery,
  so I didn't implement it in the user-space testbench.)



---

 drivers/md/raid6.h            |   27 +++--------
 drivers/md/raid6algos.c       |    4 +
 drivers/md/raid6int.uc        |   61 ++++++++++++++++++++++++--
 drivers/md/raid6mmx.c         |    2 
 drivers/md/raid6recov.c       |    2 
 drivers/md/raid6sse1.c        |    2 
 drivers/md/raid6test/Makefile |   36 ++++++++-------
 drivers/md/raid6test/test.c   |   21 +++++----
 drivers/md/raid6x86.h         |   97 +++++++++++++++++++++---------------------
 9 files changed, 152 insertions(+), 100 deletions(-)

diff -puN drivers/md/raid6algos.c~raid6-3 drivers/md/raid6algos.c
--- 25/drivers/md/raid6algos.c~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6algos.c	2004-01-21 21:56:17.000000000 -0800
@@ -46,7 +46,7 @@ const struct raid6_calls * const raid6_a
 	&raid6_intx16,
 	&raid6_intx32,
 #endif
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
 	&raid6_mmxx1,
 	&raid6_mmxx2,
 	&raid6_sse1x1,
@@ -55,6 +55,8 @@ const struct raid6_calls * const raid6_a
 	&raid6_sse2x2,
 #endif
 #if defined(__x86_64__)
+	&raid6_sse2x1,
+	&raid6_sse2x2,
 	&raid6_sse2x4,
 #endif
 	NULL
diff -puN drivers/md/raid6.h~raid6-3 drivers/md/raid6.h
--- 25/drivers/md/raid6.h~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6.h	2004-01-21 21:56:17.000000000 -0800
@@ -20,6 +20,7 @@
 
 #include <linux/module.h>
 #include <linux/stddef.h>
+#include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -45,11 +46,15 @@ extern const char raid6_empty_zero_page[
 #else /* ! __KERNEL__ */
 /* Used for testing in user space */
 
-#include <stddef.h>
-#include <sys/types.h>
-#include <inttypes.h>
 #include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stddef.h>
 #include <sys/mman.h>
+#include <sys/types.h>
+
+/* Not standard, but glibc defines it */
+#define BITS_PER_LONG __WORDSIZE
 
 typedef uint8_t  u8;
 typedef uint16_t u16;
@@ -63,27 +68,13 @@ extern const char raid6_empty_zero_page[
 
 #define __init
 #define __exit
+#define __attribute_const__ __attribute__((const))
 
 #define preempt_enable()
 #define preempt_disable()
 
 #endif /* __KERNEL__ */
 
-/* Change this from BITS_PER_LONG if there is something better... */
-#if BITS_PER_LONG == 64
-# define NBYTES(x) ((x) * 0x0101010101010101UL)
-# define NSIZE  8
-# define NSHIFT 3
-# define NSTRING "64"
-typedef u64 unative_t;
-#else
-# define NBYTES(x) ((x) * 0x01010101U)
-# define NSIZE  4
-# define NSHIFT 2
-# define NSTRING "32"
-typedef u32 unative_t;
-#endif
-
 /* Routine choices */
 struct raid6_calls {
 	void (*gen_syndrome)(int, size_t, void **);
diff -puN drivers/md/raid6int.uc~raid6-3 drivers/md/raid6int.uc
--- 25/drivers/md/raid6int.uc~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6int.uc	2004-01-21 21:56:17.000000000 -0800
@@ -1,6 +1,6 @@
 /* -*- linux-c -*- ------------------------------------------------------- *
  *
- *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
@@ -21,11 +21,63 @@
 #include "raid6.h"
 
 /*
+ * This is the C data type to use
+ */
+
+/* Change this from BITS_PER_LONG if there is something better... */
+#if BITS_PER_LONG == 64
+# define NBYTES(x) ((x) * 0x0101010101010101UL)
+# define NSIZE  8
+# define NSHIFT 3
+# define NSTRING "64"
+typedef u64 unative_t;
+#else
+# define NBYTES(x) ((x) * 0x01010101U)
+# define NSIZE  4
+# define NSHIFT 2
+# define NSTRING "32"
+typedef u32 unative_t;
+#endif
+
+
+
+/*
  * IA-64 wants insane amounts of unrolling.  On other architectures that
  * is just a waste of space.
  */
+#if ($# <= 8) || defined(__ia64__)
+
+
+/*
+ * These sub-operations are separate inlines since they can sometimes be
+ * specially optimized using architecture-specific hacks.
+ */
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+	unative_t vv;
+
+	vv = (v << 1) & NBYTES(0xfe);
+	return vv;
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+	unative_t vv;
+
+	vv = v & NBYTES(0x80);
+	vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
+	return vv;
+}
 
-#if ($# <= 8) || defined(_ia64__)
 
 static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 {
@@ -44,9 +96,8 @@ static void raid6_int$#_gen_syndrome(int
 		for ( z = z0-1 ; z >= 0 ; z-- ) {
 			wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
 			wp$$ ^= wd$$;
-			w2$$ = wq$$ & NBYTES(0x80);
-			w1$$ = (wq$$ << 1) & NBYTES(0xfe);
-			w2$$ = (w2$$ << 1) - (w2$$ >> 7);
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
 			w2$$ &= NBYTES(0x1d);
 			w1$$ ^= w2$$;
 			wq$$ = w1$$ ^ wd$$;
diff -puN drivers/md/raid6mmx.c~raid6-3 drivers/md/raid6mmx.c
--- 25/drivers/md/raid6mmx.c~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6mmx.c	2004-01-21 21:56:17.000000000 -0800
@@ -16,7 +16,7 @@
  * MMX implementation of RAID-6 syndrome functions
  */
 
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
 
 #include "raid6.h"
 #include "raid6x86.h"
diff -puN drivers/md/raid6recov.c~raid6-3 drivers/md/raid6recov.c
--- 25/drivers/md/raid6recov.c~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6recov.c	2004-01-21 21:56:17.000000000 -0800
@@ -117,7 +117,7 @@ void raid6_dual_recov(int disks, size_t 
 		} else {
 			/* data+Q failure.  Reconstruct data from P,
 			   then rebuild syndrome. */
-			/* FIX */
+			/* NOT IMPLEMENTED - equivalent to RAID-5 */
 		}
 	} else {
 		if ( failb == disks-2 ) {
diff -puN drivers/md/raid6sse1.c~raid6-3 drivers/md/raid6sse1.c
--- 25/drivers/md/raid6sse1.c~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6sse1.c	2004-01-21 21:56:17.000000000 -0800
@@ -21,7 +21,7 @@
  * worthwhile as a separate implementation.
  */
 
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
 
 #include "raid6.h"
 #include "raid6x86.h"
diff -puN drivers/md/raid6test/Makefile~raid6-3 drivers/md/raid6test/Makefile
--- 25/drivers/md/raid6test/Makefile~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6test/Makefile	2004-01-21 21:56:17.000000000 -0800
@@ -3,10 +3,11 @@
 # from userspace.
 #
 
-CC	= gcc
-CFLAGS	= -I.. -O2 -g -march=i686
-LD	= ld
-PERL	= perl
+CC	 = gcc
+OPTFLAGS = -O2			# Adjust as desired
+CFLAGS	 = -I.. -g $(OPTFLAGS)
+LD	 = ld
+PERL	 = perl
 
 .c.o:
 	$(CC) $(CFLAGS) -c -o $@ $<
@@ -17,12 +18,10 @@ PERL	= perl
 %.uc: ../%.uc
 	cp -f $< $@
 
-%.pl: ../%.pl
-	cp -f $< $@
-
 all:	raid6.o raid6test
 
 raid6.o: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
+	 raid6int32.o \
 	 raid6mmx.o raid6sse1.o raid6sse2.o \
 	 raid6recov.o raid6algos.o \
 	 raid6tables.o
@@ -31,20 +30,23 @@ raid6.o: raid6int1.o raid6int2.o raid6in
 raid6test: raid6.o test.c
 	$(CC) $(CFLAGS) -o raid6test $^
 
-raid6int1.c: raid6int.uc unroller.pl
-	$(PERL) ./unroller.pl 1 < raid6int.uc > $@
+raid6int1.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 1 < raid6int.uc > $@
+
+raid6int2.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 2 < raid6int.uc > $@
 
-raid6int2.c: raid6int.uc unroller.pl
-	$(PERL) ./unroller.pl 2 < raid6int.uc > $@
+raid6int4.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 4 < raid6int.uc > $@
 
-raid6int4.c: raid6int.uc unroller.pl
-	$(PERL) ./unroller.pl 4 < raid6int.uc > $@
+raid6int8.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 8 < raid6int.uc > $@
 
-raid6int8.c: raid6int.uc unroller.pl
-	$(PERL) ./unroller.pl 8 < raid6int.uc > $@
+raid6int16.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 16 < raid6int.uc > $@
 
-raid6int16.c: raid6int.uc unroller.pl
-	$(PERL) ./unroller.pl 16 < raid6int.uc > $@
+raid6int32.c: raid6int.uc ../unroll.pl
+	$(PERL) ../unroll.pl 32 < raid6int.uc > $@
 
 raid6tables.c: mktables
 	./mktables > raid6tables.c
diff -puN drivers/md/raid6test/test.c~raid6-3 drivers/md/raid6test/test.c
--- 25/drivers/md/raid6test/test.c~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6test/test.c	2004-01-21 21:56:17.000000000 -0800
@@ -73,14 +73,19 @@ int main(int argc, char *argv[])
 					erra = memcmp(data[i], recovi, PAGE_SIZE);
 					errb = memcmp(data[j], recovj, PAGE_SIZE);
 
-					printf("algo=%-8s  faila=%3d(%c)  failb=%3d(%c)  %s\n",
-					       raid6_call.name,
-					       i, (i==NDISKS-2)?'P':'D',
-					       j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D',
-					       (!erra && !errb) ? "OK" :
-					       !erra ? "ERRB" :
-					       !errb ? "ERRA" :
-					       "ERRAB");
+					if ( i < NDISKS-2 && j == NDISKS-1 ) {
+						/* We don't implement the DQ failure scenario, since it's
+						   equivalent to a RAID-5 failure (XOR, then recompute Q) */
+					} else {
+						printf("algo=%-8s  faila=%3d(%c)  failb=%3d(%c)  %s\n",
+						       raid6_call.name,
+						       i, (i==NDISKS-2)?'P':'D',
+						       j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D',
+						       (!erra && !errb) ? "OK" :
+						       !erra ? "ERRB" :
+						       !errb ? "ERRA" :
+						       "ERRAB");
+					}
 
 					dataptrs[i] = data[i];
 					dataptrs[j] = data[j];
diff -puN drivers/md/raid6x86.h~raid6-3 drivers/md/raid6x86.h
--- 25/drivers/md/raid6x86.h~raid6-3	2004-01-21 21:56:17.000000000 -0800
+++ 25-akpm/drivers/md/raid6x86.h	2004-01-21 21:56:17.000000000 -0800
@@ -1,7 +1,7 @@
 #ident "$Id: raid6x86.h,v 1.3 2002/12/12 22:41:27 hpa Exp $"
 /* ----------------------------------------------------------------------- *
  *
- *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *   Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
@@ -22,54 +22,75 @@
 
 #if defined(__i386__) || defined(__x86_64__)
 
+#ifdef __x86_64__
+
 typedef struct {
 	unsigned int fsave[27];
-	unsigned int cr0;
-} raid6_mmx_save_t;
+	unsigned long cr0;
+} raid6_mmx_save_t __attribute__((aligned(16)));
 
 /* N.B.: For SSE we only save %xmm0-%xmm7 even for x86-64, since
    the code doesn't know about the additional x86-64 registers */
-/* The +3 is so we can make sure the area is aligned properly */
 typedef struct {
-	unsigned int sarea[8*4+3];
+	unsigned int sarea[8*4];
 	unsigned int cr0;
 } raid6_sse_save_t __attribute__((aligned(16)));
 
-#ifdef __x86_64__
-
 /* This is for x86-64-specific code which uses all 16 XMM registers */
 typedef struct {
-	unsigned int sarea[16*4+3];
-	unsigned int cr0;
+	unsigned int sarea[16*4];
+	unsigned long cr0;
 } raid6_sse16_save_t __attribute__((aligned(16)));
 
+/* On x86-64 the stack is 16-byte aligned */
+#define SAREA(x) (x->sarea)
+
+#else /* __i386__ */
+
+typedef struct {
+	unsigned int fsave[27];
+	unsigned long cr0;
+} raid6_mmx_save_t;
+
+/* On i386, the stack is only 8-byte aligned, but SSE requires 16-byte
+   alignment.  The +3 is so we have the slack space to manually align
+   a properly-sized area correctly.  */
+typedef struct {
+	unsigned int sarea[8*4+3];
+	unsigned long cr0;
+} raid6_sse_save_t;
+
+#define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15))
+
 #endif
 
 #ifdef __KERNEL__ /* Real code */
 
- static inline u32 raid6_get_fpu(void)
+/* Note: %cr0 is 32 bits on i386 and 64 bits on x86-64 */
+
+static inline unsigned long raid6_get_fpu(void)
 {
-	u32 cr0;
+	unsigned long cr0;
 
 	preempt_disable();
-	asm volatile("movl %%cr0,%0 ; clts" : "=r" (cr0));
+	asm volatile("mov %%cr0,%0 ; clts" : "=r" (cr0));
 	return cr0;
 }
 
-static inline void raid6_put_fpu(u32 cr0)
+static inline void raid6_put_fpu(unsigned long cr0)
 {
-	asm volatile("movl %0,%%cr0" : : "r" (cr0));
+	asm volatile("mov %0,%%cr0" : : "r" (cr0));
 	preempt_enable();
 }
 
 #else /* Dummy code for user space testing */
 
-static inline u32 raid6_get_fpu(void)
+static inline unsigned long raid6_get_fpu(void)
 {
 	return 0xf00ba6;
 }
 
-static inline void raid6_put_fpu(u32 cr0)
+static inline void raid6_put_fpu(unsigned long cr0)
 {
 	(void)cr0;
 }
@@ -90,13 +111,8 @@ static inline void raid6_after_mmx(raid6
 
 static inline void raid6_before_sse(raid6_sse_save_t *s)
 {
-#ifdef __x86_64__
-	unsigned int *rsa = s->sarea;
-#else
-	/* On i386 the save area may not be aligned */
-	unsigned int *rsa =
-		(unsigned int *)((((unsigned long)&s->sarea)+15) & ~15);
-#endif
+	unsigned int *rsa = SAREA(s);
+
 	s->cr0 = raid6_get_fpu();
 
 	asm volatile("movaps %%xmm0,%0" : "=m" (rsa[0]));
@@ -111,13 +127,8 @@ static inline void raid6_before_sse(raid
 
 static inline void raid6_after_sse(raid6_sse_save_t *s)
 {
-#ifdef __x86_64__
-	unsigned int *rsa = s->sarea;
-#else
-	/* On i386 the save area may not be aligned */
-	unsigned int *rsa =
-		(unsigned int *)((((unsigned long)&s->sarea)+15) & ~15);
-#endif
+	unsigned int *rsa = SAREA(s);
+
 	asm volatile("movaps %0,%%xmm0" : : "m" (rsa[0]));
 	asm volatile("movaps %0,%%xmm1" : : "m" (rsa[4]));
 	asm volatile("movaps %0,%%xmm2" : : "m" (rsa[8]));
@@ -132,13 +143,8 @@ static inline void raid6_after_sse(raid6
 
 static inline void raid6_before_sse2(raid6_sse_save_t *s)
 {
-#ifdef __x86_64__
-	unsigned int *rsa = &s->sarea;
-#else
-	/* On i386 the save area may not be aligned */
-	unsigned int *rsa =
-		(unsigned int *)((((unsigned long)&s->sarea)+15) & ~15);
-#endif
+	unsigned int *rsa = SAREA(s);
+
 	s->cr0 = raid6_get_fpu();
 
 	asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0]));
@@ -153,13 +159,8 @@ static inline void raid6_before_sse2(rai
 
 static inline void raid6_after_sse2(raid6_sse_save_t *s)
 {
-#ifdef __x86_64__
-	unsigned int *rsa = s->sarea;
-#else
-	/* On i386 the save area may not be aligned */
-	unsigned int *rsa =
-		(unsigned int *)((((unsigned long)&s->sarea)+15) & ~15);
-#endif
+	unsigned int *rsa = SAREA(s);
+
 	asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0]));
 	asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4]));
 	asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8]));
@@ -174,9 +175,9 @@ static inline void raid6_after_sse2(raid
 
 #ifdef __x86_64__
 
-static inline raid6_before_sse16(raid6_sse16_save_t *s)
+static inline void raid6_before_sse16(raid6_sse16_save_t *s)
 {
-	unsigned int *rsa = s->sarea;
+	unsigned int *rsa = SAREA(s);
 
 	s->cr0 = raid6_get_fpu();
 
@@ -198,9 +199,9 @@ static inline raid6_before_sse16(raid6_s
 	asm volatile("movdqa %%xmm15,%0" : "=m" (rsa[60]));
 }
 
-static inline raid6_after_sse16(raid6_sse16_save_t *s)
+static inline void raid6_after_sse16(raid6_sse16_save_t *s)
 {
-	unsigned int *rsa = s->sarea;
+	unsigned int *rsa = SAREA(s);
 
 	asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0]));
 	asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4]));

_