From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>,
      "Seth, Rohit" <rohit.seth@intel.com>

This patch addresses the longstanding problem wherein Oracle needs
CAP_IPC_LOCK to allocate SHM_HUGETLB shm memory, but people don't want to run
Oracle as root, and capabilities are busted.

Various ideas with rlimits didn't work out, mainly because these objects live
beyond the lifetime of the user processes which establish them.

What we do is to create root-writeable /proc/sys/vm/hugetlb_shm_group which
specifies a single group ID.  Users who belong to that group may allocate
hugepages for SHM_HUGETLB shm segments.

So the sysadmin will create a new group, say `hugepageusers', will add the
oracle user to that group and will write that group's ID into
/proc/sys/vm/hugetlb_shm_group.


---

 25-akpm/Documentation/filesystems/proc.txt |   15 ++++++++-------
 25-akpm/Documentation/vm/hugetlbpage.txt   |    9 ++++++---
 25-akpm/fs/hugetlbfs/inode.c               |   10 +++++++++-
 25-akpm/include/linux/hugetlb.h            |    1 +
 25-akpm/include/linux/sysctl.h             |    1 +
 25-akpm/kernel/sysctl.c                    |    8 ++++++++
 6 files changed, 33 insertions(+), 11 deletions(-)

diff -puN Documentation/filesystems/proc.txt~hugetlb_shm_group-sysctl-patch Documentation/filesystems/proc.txt
--- 25/Documentation/filesystems/proc.txt~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.105028424 -0700
+++ 25-akpm/Documentation/filesystems/proc.txt	2004-05-07 18:13:13.374009504 -0700
@@ -1208,6 +1208,14 @@ On the  other  hand,  enabling this feat
 and thrash the system to death, so large and/or important servers will want to
 set this value to 0.
 
+nr_hugepages and hugetlb_shm_group
+----------------------------------
+
+nr_hugepages configures the number of hugetlb pages reserved for the system.
+
+hugetlb_shm_group contains the group id that is allowed to create SysV shared
+memory segments using hugetlb pages.
+
 2.5 /proc/sys/dev - Device specific parameters
 ----------------------------------------------
 
@@ -1848,10 +1856,3 @@ need to  recompile  the kernel, or even 
 command to write value into these files, thereby changing the default settings
 of the kernel.
 ------------------------------------------------------------------------------
-
-
-
-
-
-
-
diff -puN Documentation/vm/hugetlbpage.txt~hugetlb_shm_group-sysctl-patch Documentation/vm/hugetlbpage.txt
--- 25/Documentation/vm/hugetlbpage.txt~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.106028272 -0700
+++ 25-akpm/Documentation/vm/hugetlbpage.txt	2004-05-07 18:05:59.117026600 -0700
@@ -91,9 +91,12 @@ A regular chown, chgrp and chmod command
 used to change the file attributes on hugetlbfs.
 
 Also, it is important to note that no such mount command is required if the
-applications are going to use only shmat/shmget system calls.  It is possible
-for same or different applications to use any combination of mmaps and shm*
-calls.  Though the mount of filesystem will be required for using mmaps.
+applications are going to use only shmat/shmget system calls.  Users who
+wish to use hugetlb pages via shared memory segments should be members of
+a supplementary group, and the system admin needs to write that gid into
+/proc/sys/vm/hugetlb_shm_group.  It is possible for same or different
+applications to use any combination of mmaps and shm* calls.  Though the
+mount of filesystem will be required for using mmaps.
 
 /* Example of using hugepage in user application using Sys V shared memory
  * system calls.  In this example, app is requesting memory of size 256MB that
diff -puN fs/hugetlbfs/inode.c~hugetlb_shm_group-sysctl-patch fs/hugetlbfs/inode.c
--- 25/fs/hugetlbfs/inode.c~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.108027968 -0700
+++ 25-akpm/fs/hugetlbfs/inode.c	2004-05-07 18:13:29.788514120 -0700
@@ -43,6 +43,8 @@ static struct backing_dev_info hugetlbfs
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
 };
 
+int sysctl_hugetlb_shm_group;
+
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -718,6 +720,12 @@ static unsigned long hugetlbfs_counter(v
 	return ret;
 }
 
+static int can_do_hugetlb_shm(void) /* nonzero: caller may use hugetlb shm */
+{
+	return likely(capable(CAP_IPC_LOCK) ||	/* has cap, or in sysctl gid */
+			in_group_p(sysctl_hugetlb_shm_group));
+}
+
 struct file *hugetlb_zero_setup(size_t size)
 {
 	int error;
@@ -727,7 +735,7 @@ struct file *hugetlb_zero_setup(size_t s
 	struct qstr quick_string;
 	char buf[16];
 
-	if (!capable(CAP_IPC_LOCK))
+	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
 	if (!is_hugepage_mem_enough(size))
diff -puN include/linux/hugetlb.h~hugetlb_shm_group-sysctl-patch include/linux/hugetlb.h
--- 25/include/linux/hugetlb.h~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.109027816 -0700
+++ 25-akpm/include/linux/hugetlb.h	2004-05-07 18:05:59.119026296 -0700
@@ -32,6 +32,7 @@ void free_huge_page(struct page *);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
+extern int sysctl_hugetlb_shm_group;
 
 static inline void
 mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
diff -puN include/linux/sysctl.h~hugetlb_shm_group-sysctl-patch include/linux/sysctl.h
--- 25/include/linux/sysctl.h~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.111027512 -0700
+++ 25-akpm/include/linux/sysctl.h	2004-05-07 18:13:13.367010568 -0700
@@ -163,6 +163,7 @@ enum
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
 	VM_LAPTOP_MODE=23,	/* vm laptop mode */
 	VM_BLOCK_DUMP=24,	/* block dump mode */
+	VM_HUGETLB_GROUP=25,	/* permitted hugetlb group */
 };
 
 
diff -puN kernel/sysctl.c~hugetlb_shm_group-sysctl-patch kernel/sysctl.c
--- 25/kernel/sysctl.c~hugetlb_shm_group-sysctl-patch	2004-05-07 18:05:59.112027360 -0700
+++ 25-akpm/kernel/sysctl.c	2004-05-07 18:13:13.371009960 -0700
@@ -738,6 +738,14 @@ static ctl_table vm_table[] = {
 		.extra1		= (void *)&hugetlb_zero,
 		.extra2		= (void *)&hugetlb_infinity,
 	 },
+	 {
+		.ctl_name	= VM_HUGETLB_GROUP,
+		.procname	= "hugetlb_shm_group",
+		.data		= &sysctl_hugetlb_shm_group,
+		.maxlen		= sizeof(gid_t),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	 },
 #endif
 	{
 		.ctl_name	= VM_LOWER_ZONE_PROTECTION,

_