Attempt to do something intelligent with IRQ handlers which don't return
IRQ_HANDLED.

- If they return neither IRQ_HANDLED nor IRQ_NONE, complain.

- If they return IRQ_NONE more than 99900 times in 100000 interrupts, complain
  and disable the IRQ.

  I did have it at 750-in-1000, but someone had an otherwise-functioning
  system which triggered it.

  The 99.9% ratio is designed to address the problem wherein the babbling
  device shares an IRQ with a good device.  We don't want the good device's
  trickle of IRQ_HANDLED callouts to defeat the lockup detector.  (fat chance
  os this working right).

- Add a kernel boot parameter `noirqdebug' to turn the whole thing off.




 25-akpm/Documentation/kernel-parameters.txt |    3 
 25-akpm/arch/i386/kernel/irq.c              |  118 +++++++++++++++++++++-------
 25-akpm/include/linux/irq.h                 |    2 
 3 files changed, 96 insertions(+), 27 deletions(-)

diff -puN arch/i386/kernel/irq.c~irq-check-rate-limit arch/i386/kernel/irq.c
--- 25/arch/i386/kernel/irq.c~irq-check-rate-limit	Tue May 27 14:42:35 2003
+++ 25-akpm/arch/i386/kernel/irq.c	Tue May 27 14:55:57 2003
@@ -66,8 +66,12 @@
 /*
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned =
-	{ [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
+irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+	[0 ... NR_IRQS-1] = {
+		.handler = &no_irq_type,
+		.lock = SPIN_LOCK_UNLOCKED
+	}
+};
 
 static void register_irq_proc (unsigned int irq);
 
@@ -209,7 +213,6 @@ int handle_IRQ_event(unsigned int irq,
 {
 	int status = 1;	/* Force the "do bottom halves" bit */
 	int retval = 0;
-	struct irqaction *first_action = action;
 
 	if (!(action->flags & SA_INTERRUPT))
 		local_irq_enable();
@@ -222,30 +225,88 @@ int handle_IRQ_event(unsigned int irq,
 	if (status & SA_SAMPLE_RANDOM)
 		add_interrupt_randomness(irq);
 	local_irq_disable();
-	if (retval != 1) {
-		static int count = 100;
-		if (count) {
-			count--;
-			if (retval) {
-				printk("irq event %d: bogus retval mask %x\n",
-					irq, retval);
-			} else {
-				printk("irq %d: nobody cared!\n", irq);
-			}
-			dump_stack();
-			printk("handlers:\n");
-			action = first_action;
-			do {
-				printk("[<%p>]", action->handler);
-				print_symbol(" (%s)",
-					(unsigned long)action->handler);
-				printk("\n");
-				action = action->next;
-			} while (action);
-		}
+	return retval;
+}
+
+static void __report_bad_irq(int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	struct irqaction *action;
+
+	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+		printk(KERN_ERR "irq event %d: bogus return value %x\n",
+				irq, action_ret);
+	} else {
+		printk(KERN_ERR "irq %d: nobody cared!\n", irq);
 	}
+	dump_stack();
+	printk(KERN_ERR "handlers:\n");
+	action = desc->action;
+	do {
+		printk(KERN_ERR "[<%p>]", action->handler);
+		print_symbol(" (%s)",
+			(unsigned long)action->handler);
+		printk("\n");
+		action = action->next;
+	} while (action);
+}
+
+static void report_bad_irq(int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	static int count = 100;
 
-	return status;
+	if (count) {
+		count--;
+		__report_bad_irq(irq, desc, action_ret);
+	}
+}
+
+static int noirqdebug;
+
+static int __init noirqdebug_setup(char *str)
+{
+	noirqdebug = 1;
+	printk("IRQ lockup detection disabled\n");
+	return 1;
+}
+
+__setup("noirqdebug", noirqdebug_setup);
+
+/*
+ * If 99,900 of the previous 100,000 interrupts have not been handled then
+ * assume that the IRQ is stuck in some manner.  Drop a diagnostic and try to
+ * turn the IRQ off.
+ *
+ * (The other 100-of-100,000 interrupts may have been a correctly-functioning
+ *  device sharing an IRQ with the failing one)
+ *
+ * Called under desc->lock
+ */
+static void note_interrupt(int irq, irq_desc_t *desc, irqreturn_t action_ret)
+{
+	if (action_ret != IRQ_HANDLED) {
+		desc->irqs_unhandled++;
+		if (action_ret != IRQ_NONE)
+			report_bad_irq(irq, desc, action_ret);
+	}
+
+	desc->irq_count++;
+	if (desc->irq_count < 100000)
+		return;
+
+	desc->irq_count = 0;
+	if (desc->irqs_unhandled > 99900) {
+		/*
+		 * The interrupt is stuck
+		 */
+		__report_bad_irq(irq, desc, action_ret);
+		/*
+		 * Now kill the IRQ
+		 */
+		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
+		desc->status |= IRQ_DISABLED;
+		desc->handler->disable(irq);
+	}
+	desc->irqs_unhandled = 0;
 }
 
 /*
@@ -418,10 +479,13 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 * SMP environment.
 	 */
 	for (;;) {
+		irqreturn_t action_ret;
+
 		spin_unlock(&desc->lock);
-		handle_IRQ_event(irq, &regs, action);
+		action_ret = handle_IRQ_event(irq, &regs, action);
 		spin_lock(&desc->lock);
-		
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
 		if (likely(!(desc->status & IRQ_PENDING)))
 			break;
 		desc->status &= ~IRQ_PENDING;
diff -puN include/linux/irq.h~irq-check-rate-limit include/linux/irq.h
--- 25/include/linux/irq.h~irq-check-rate-limit	Tue May 27 14:42:35 2003
+++ 25-akpm/include/linux/irq.h	Tue May 27 14:42:35 2003
@@ -61,6 +61,8 @@ typedef struct {
 	hw_irq_controller *handler;
 	struct irqaction *action;	/* IRQ action list */
 	unsigned int depth;		/* nested irq disables */
+	unsigned int irq_count;		/* For detecting broken interrupts */
+	unsigned int irqs_unhandled;
 	spinlock_t lock;
 } ____cacheline_aligned irq_desc_t;
 
diff -puN Documentation/kernel-parameters.txt~irq-check-rate-limit Documentation/kernel-parameters.txt
--- 25/Documentation/kernel-parameters.txt~irq-check-rate-limit	Tue May 27 14:53:34 2003
+++ 25-akpm/Documentation/kernel-parameters.txt	Tue May 27 14:54:51 2003
@@ -617,6 +617,9 @@ running once the system is up.
 
 	noht		[SMP,IA-32] Disables P4 Xeon(tm) HyperThreading.
 
+	noirqdebug	[IA-32] Disables the code which attempts to detect and
+			disable unhandled interrupt sources.
+
 	noisapnp	[ISAPNP] Disables ISA PnP code.
 
 	noinitrd	[RAM] Tells the kernel not to load any configured

_