From: Martin Schwidefsky <schwidefsky@de.ibm.com>

This patch adds a system control that allows switching off the jiffies timer
interrupts while a cpu sleeps in idle.  This is useful for a system running
with virtual cpus under z/VM.


---

 25-akpm/arch/s390/Kconfig          |   19 ++++
 25-akpm/arch/s390/defconfig        |    1 
 25-akpm/arch/s390/kernel/process.c |   10 +-
 25-akpm/arch/s390/kernel/time.c    |  156 +++++++++++++++++++++++++++++++------
 25-akpm/arch/s390/kernel/traps.c   |    4 
 25-akpm/include/linux/sched.h      |    2 
 25-akpm/include/linux/sysctl.h     |    1 
 25-akpm/include/linux/timer.h      |    2 
 25-akpm/kernel/rcupdate.c          |    6 +
 25-akpm/kernel/sched.c             |    9 ++
 25-akpm/kernel/sysctl.c            |   12 ++
 25-akpm/kernel/timer.c             |   69 ++++++++++++++++
 12 files changed, 262 insertions(+), 29 deletions(-)

diff -puN arch/s390/defconfig~s390-9-9-no-timer-interrupts-in-idle arch/s390/defconfig
--- 25/arch/s390/defconfig~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/arch/s390/defconfig	Fri Apr 23 13:09:54 2004
@@ -83,6 +83,7 @@ CONFIG_PFAULT=y
 # CONFIG_SHARED_KERNEL is not set
 # CONFIG_CMM is not set
 # CONFIG_VIRT_TIMER is not set
+# CONFIG_NO_IDLE_HZ is not set
 # CONFIG_PCMCIA is not set
 
 #
diff -puN arch/s390/Kconfig~s390-9-9-no-timer-interrupts-in-idle arch/s390/Kconfig
--- 25/arch/s390/Kconfig~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/arch/s390/Kconfig	Fri Apr 23 13:09:54 2004
@@ -333,6 +333,25 @@ config APPLDATA_NET_SUM
 	  This can also be compiled as a module, which will be called
 	  appldata_net_sum.o.
 
+config NO_IDLE_HZ
+	bool "No HZ timer ticks in idle"
+	help
+	  Switches the regular HZ timer off when the system is going idle.
+	  This helps z/VM to detect that the Linux system is idle. VM can
+	  then "swap-out" this guest which reduces memory usage. It also
+	  reduces the overhead of idle systems.
+
+	  The HZ timer can be switched on/off via /proc/sys/kernel/hz_timer.
+	  hz_timer=0 means HZ timer is disabled. hz_timer=1 means HZ
+	  timer is active.
+
+config NO_IDLE_HZ_INIT
+	bool "HZ timer in idle off by default"
+	depends on NO_IDLE_HZ
+	help
+	  The HZ timer is switched off in idle by default. That means the
+	  HZ timer is already disabled at boot time.
+
 endmenu
 
 config PCMCIA
diff -puN arch/s390/kernel/process.c~s390-9-9-no-timer-interrupts-in-idle arch/s390/kernel/process.c
--- 25/arch/s390/kernel/process.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/arch/s390/kernel/process.c	Fri Apr 23 13:09:54 2004
@@ -40,7 +40,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/irq.h>
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
 #include <asm/timer.h>
 #endif
 
@@ -75,17 +75,21 @@ void default_idle(void)
 	psw_t wait_psw;
 	unsigned long reg;
 
+	local_irq_disable();
         if (need_resched()) {
+		local_irq_enable();
                 schedule();
                 return;
         }
 
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined (CONFIG_NO_IDLE_HZ)
 	/*
 	 * hook to stop timers that should not tick while CPU is idle
 	 */
-	if (stop_timers())
+	if (stop_timers()) {
+		local_irq_enable();
 		return;
+	}
 #endif
 
 	/* 
diff -puN arch/s390/kernel/time.c~s390-9-9-no-timer-interrupts-in-idle arch/s390/kernel/time.c
--- 25/arch/s390/kernel/time.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/arch/s390/kernel/time.c	Fri Apr 23 13:09:54 2004
@@ -281,29 +281,6 @@ int stop_cpu_timer(void)
 	return 0;
 }
 
-void do_monitor_call(struct pt_regs *regs, long interruption_code)
-{
-	/* disable monitor call class 0 */
-	__ctl_clear_bit(8, 15);
-
-	start_cpu_timer();
-}
-
-/*
- * called from cpu_idle to stop any timers
- * returns 1 if CPU should not be stopped
- */
-int stop_timers(void)
-{
-	if (stop_cpu_timer())
-		return 1;
-
-	/* enable monitor call class 0 */
-	__ctl_set_bit(8, 15);
-
-	return 0;
-}
-
 void set_vtimer(__u64 expires)
 {
 	asm volatile ("SPT %0" : : "m" (expires));
@@ -424,6 +401,139 @@ static void do_cpu_timer_interrupt(struc
 }
 #endif
 
+#ifdef CONFIG_NO_IDLE_HZ
+
+#ifdef CONFIG_NO_IDLE_HZ_INIT
+int sysctl_hz_timer = 0;
+#else
+int sysctl_hz_timer = 1;
+#endif
+
+/*
+ * Start the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+void start_hz_timer(struct pt_regs *regs)
+{
+	__u64 tmp;
+	__u32 ticks;
+
+	if (!cpu_isset(smp_processor_id(), idle_cpu_mask))
+		return;
+
+	/* Calculate how many ticks have passed */
+	asm volatile ("STCK 0(%0)" : : "a" (&tmp) : "memory", "cc");
+	tmp = tmp + CLK_TICKS_PER_JIFFY - S390_lowcore.jiffy_timer;
+	ticks = __calculate_ticks(tmp);
+	S390_lowcore.jiffy_timer += CLK_TICKS_PER_JIFFY * (__u64) ticks;
+
+	/* Set the clock comparator to the next tick. */
+	tmp = S390_lowcore.jiffy_timer + CPU_DEVIATION;
+	asm volatile ("SCKC %0" : : "m" (tmp));
+
+	/* Charge the ticks. */
+	if (ticks > 0) {
+#ifdef CONFIG_SMP
+		/*
+		 * Do not rely on the boot cpu to do the calls to do_timer.
+		 * Spread it over all cpus instead.
+		 */
+		write_seqlock(&xtime_lock);
+		if (S390_lowcore.jiffy_timer > xtime_cc) {
+			__u32 xticks;
+
+			tmp = S390_lowcore.jiffy_timer - xtime_cc;
+			if (tmp >= 2*CLK_TICKS_PER_JIFFY) {
+				xticks = __calculate_ticks(tmp);
+				xtime_cc += (__u64) xticks*CLK_TICKS_PER_JIFFY;
+			} else {
+				xticks = 1;
+				xtime_cc += CLK_TICKS_PER_JIFFY;
+			}
+			while (xticks--)
+				do_timer(regs);
+		}
+		write_sequnlock(&xtime_lock);
+		while (ticks--)
+			update_process_times(user_mode(regs));
+#else
+		while (ticks--)
+			do_timer(regs);
+#endif
+	}
+	cpu_clear(smp_processor_id(), idle_cpu_mask);
+}
+
+/*
+ * Stop the HZ tick on the current CPU.
+ * Only cpu_idle may call this function.
+ */
+int stop_hz_timer(void)
+{
+	__u64 timer;
+
+	if (sysctl_hz_timer != 0)
+		return 1;
+
+	/*
+	 * Leave the clock comparator set up for the next timer
+	 * tick if either rcu or a softirq is pending.
+	 */
+	if (rcu_pending(smp_processor_id()) || local_softirq_pending())
+		return 1;
+
+	/*
+	 * This cpu is going really idle. Set up the clock comparator
+	 * for the next event.
+	 */
+	cpu_set(smp_processor_id(), idle_cpu_mask);
+	timer = (__u64) (next_timer_interrupt() - jiffies) + jiffies_64;
+	timer = jiffies_timer_cc + timer * CLK_TICKS_PER_JIFFY;
+	asm volatile ("SCKC %0" : : "m" (timer));
+
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
+
+void do_monitor_call(struct pt_regs *regs, long interruption_code)
+{
+	/* disable monitor call class 0 */
+	__ctl_clear_bit(8, 15);
+
+#ifdef CONFIG_VIRT_TIMER
+	start_cpu_timer();
+#endif
+#ifdef CONFIG_NO_IDLE_HZ
+	start_hz_timer(regs);
+#endif
+}
+
+/*
+ * called from cpu_idle to stop any timers
+ * returns 1 if CPU should not be stopped
+ */
+int stop_timers(void)
+{
+#ifdef CONFIG_VIRT_TIMER
+	if (stop_cpu_timer())
+		return 1;
+#endif
+
+#ifdef CONFIG_NO_IDLE_HZ
+	if (stop_hz_timer())
+		return 1;
+#endif
+
+	/* enable monitor call class 0 */
+	__ctl_set_bit(8, 15);
+
+	return 0;
+}
+
+#endif
+
 /*
  * Start the clock comparator and the virtual CPU timer
  * on the current CPU.
diff -puN arch/s390/kernel/traps.c~s390-9-9-no-timer-interrupts-in-idle arch/s390/kernel/traps.c
--- 25/arch/s390/kernel/traps.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/arch/s390/kernel/traps.c	Fri Apr 23 13:09:55 2004
@@ -64,7 +64,7 @@ extern void pfault_fini(void);
 extern void pfault_interrupt(struct pt_regs *regs, __u16 error_code);
 static ext_int_info_t ext_int_pfault;
 #endif
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_VIRT_TIMER)
 extern pgm_check_handler_t do_monitor_call;
 #endif
 
@@ -620,7 +620,7 @@ void __init trap_init(void)
 #endif /* CONFIG_ARCH_S390X */
         pgm_check_table[0x15] = &operand_exception;
         pgm_check_table[0x1C] = &privileged_op;
-#ifdef CONFIG_VIRT_TIMER
+#if defined(CONFIG_VIRT_TIMER) || defined(CONFIG_NO_IDLE_HZ)
 	pgm_check_table[0x40] = &do_monitor_call;
 #endif
 	if (MACHINE_IS_VM) {
diff -puN include/linux/sched.h~s390-9-9-no-timer-interrupts-in-idle include/linux/sched.h
--- 25/include/linux/sched.h~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/include/linux/sched.h	Fri Apr 23 13:09:55 2004
@@ -159,6 +159,8 @@ extern void sched_init(void);
 extern void sched_init_smp(void);
 extern void init_idle(task_t *idle, int cpu);
 
+extern cpumask_t idle_cpu_mask;
+
 extern void show_state(void);
 extern void show_regs(struct pt_regs *);
 
diff -puN include/linux/sysctl.h~s390-9-9-no-timer-interrupts-in-idle include/linux/sysctl.h
--- 25/include/linux/sysctl.h~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/include/linux/sysctl.h	Fri Apr 23 13:09:55 2004
@@ -132,6 +132,7 @@ enum
 	KERN_PTY=62,		/* dir: pty driver */
 	KERN_NGROUPS_MAX=63,	/* int: NGROUPS_MAX */
 	KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */
+	KERN_HZ_TIMER=65,	/* int: hz timer on or off */
 };
 
 
diff -puN include/linux/timer.h~s390-9-9-no-timer-interrupts-in-idle include/linux/timer.h
--- 25/include/linux/timer.h~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/include/linux/timer.h	Fri Apr 23 13:09:55 2004
@@ -65,6 +65,8 @@ extern int del_timer(struct timer_list *
 extern int __mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 
+extern unsigned long next_timer_interrupt(void);
+
 /***
  * add_timer - start a timer
  * @timer: the timer to be added
diff -puN kernel/rcupdate.c~s390-9-9-no-timer-interrupts-in-idle kernel/rcupdate.c
--- 25/kernel/rcupdate.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/kernel/rcupdate.c	Fri Apr 23 13:09:55 2004
@@ -103,6 +103,8 @@ static void rcu_do_batch(struct list_hea
  */
 static void rcu_start_batch(long newbatch)
 {
+	cpumask_t active;
+
 	if (rcu_batch_before(rcu_ctrlblk.maxbatch, newbatch)) {
 		rcu_ctrlblk.maxbatch = newbatch;
 	}
@@ -111,7 +113,9 @@ static void rcu_start_batch(long newbatc
 		return;
 	}
 	/* Can't change, since spin lock held. */
-	rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
+	active = idle_cpu_mask;
+	cpus_complement(active);
+	cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map, active);
 }
 
 /*
diff -puN kernel/sched.c~s390-9-9-no-timer-interrupts-in-idle kernel/sched.c
--- 25/kernel/sched.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/kernel/sched.c	Fri Apr 23 13:09:55 2004
@@ -3415,6 +3415,15 @@ void __init init_idle(task_t *idle, int 
 #endif
 }
 
+/*
+ * In a system that switches off the HZ timer, idle_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For systems
+ * which do not switch off the HZ timer, idle_cpu_mask should
+ * always be CPU_MASK_NONE.
+ */
+cpumask_t idle_cpu_mask = CPU_MASK_NONE;
+
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
diff -puN kernel/sysctl.c~s390-9-9-no-timer-interrupts-in-idle kernel/sysctl.c
--- 25/kernel/sysctl.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/kernel/sysctl.c	Fri Apr 23 13:09:55 2004
@@ -108,6 +108,8 @@ extern int sysctl_ieee_emulation_warning
 extern int sysctl_userprocess_debug;
 #endif
 
+extern int sysctl_hz_timer;
+
 #if defined(CONFIG_PPC32) && defined(CONFIG_6xx)
 extern unsigned long powersave_nap;
 int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
@@ -574,6 +576,16 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_NO_IDLE_HZ
+	{
+		.ctl_name       = KERN_HZ_TIMER,
+		.procname       = "hz_timer",
+		.data           = &sysctl_hz_timer,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 	{
 		.ctl_name	= KERN_S390_USER_DEBUG_LOGGING,
 		.procname	= "userprocess_debug",
diff -puN kernel/timer.c~s390-9-9-no-timer-interrupts-in-idle kernel/timer.c
--- 25/kernel/timer.c~s390-9-9-no-timer-interrupts-in-idle	Fri Apr 23 13:09:54 2004
+++ 25-akpm/kernel/timer.c	Fri Apr 23 13:09:55 2004
@@ -428,6 +428,75 @@ repeat:
 	spin_unlock_irq(&base->lock);
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Find out when the next timer event is due to happen. This
+ * is used on S/390 to stop all activity when a cpu is idle.
+ * This function needs to be called with interrupts disabled.
+ */
+unsigned long next_timer_interrupt(void)
+{
+	tvec_base_t *base;
+	struct list_head *list;
+	struct timer_list *nte;
+	unsigned long expires;
+	tvec_t *varray[4];
+	int i, j;
+
+	base = &__get_cpu_var(tvec_bases);
+	spin_lock(&base->lock);
+	expires = base->timer_jiffies + (LONG_MAX >> 1);
+	list = 0;
+
+	/* Look for timer events in tv1. */
+	j = base->timer_jiffies & TVR_MASK;
+	do {
+		list_for_each_entry(nte, base->tv1.vec + j, entry) {
+			expires = nte->expires;
+			if (j < (base->timer_jiffies & TVR_MASK))
+				list = base->tv2.vec + (INDEX(0));
+			goto found;
+		}
+		j = (j + 1) & TVR_MASK;
+	} while (j != (base->timer_jiffies & TVR_MASK));
+
+	/* Check tv2-tv5. */
+	varray[0] = &base->tv2;
+	varray[1] = &base->tv3;
+	varray[2] = &base->tv4;
+	varray[3] = &base->tv5;
+	for (i = 0; i < 4; i++) {
+		j = INDEX(i);
+		do {
+			if (list_empty(varray[i]->vec + j)) {
+				j = (j + 1) & TVN_MASK;
+				continue;
+			}
+			list_for_each_entry(nte, varray[i]->vec + j, entry)
+				if (time_before(nte->expires, expires))
+					expires = nte->expires;
+			if (j < (INDEX(i)) && i < 3)
+				list = varray[i + 1]->vec + (INDEX(i + 1));
+			goto found;
+		} while (j != (INDEX(i)));
+	}
+found:
+	if (list) {
+		/*
+		 * The search wrapped. We need to look at the next list
+		 * from the next tv element that would cascade into the tv
+		 * element where we found the timer element.
+		 */
+		list_for_each_entry(nte, list, entry) {
+			if (time_before(nte->expires, expires))
+				expires = nte->expires;
+		}
+	}
+	spin_unlock(&base->lock);
+	return expires;
+}
+#endif
+
 /******************************************************************/
 
 /*

_