Discussion:
[PATCH 0/4] ksm - dynamic page sharing driver for linux
Izik Eidus
2008-11-11 13:21:37 UTC
KSM is a Linux driver that allows dynamically sharing identical memory
pages between one or more processes.

Unlike traditional page sharing, which is established when the memory
is allocated, KSM does it dynamically after the memory has been created.
Memory is periodically scanned; identical pages are identified and merged.
The sharing is unnoticeable to the processes that use this memory:
the shared pages are marked read-only, and in case of a write,
do_wp_page() takes care of creating a new copy of the page.

This driver is very useful for KVM: when running multiple guest
operating systems of the same type, many pages are sharable.
The driver can be useful for OpenVZ as well.

Right now KSM scans just the memory that was registered with it; it
does not scan the whole of system memory. (This could be changed, but
the chances of finding identical pages in a normal Linux system that
does not run multiple guests are low.)

KSM can run as a kernel thread or as a userspace application (or both;
more than one scanner is allowed to run at a time).
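
For the userspace-scanner path, here is a minimal sketch of driving one
scan cycle (illustrative only, assuming the KSM_CREATE_SCAN and KSM_SCAN
ioctls declared in include/linux/ksm.h later in this series; error
handling omitted):

#include <sys/ioctl.h>
#include <fcntl.h>
#include "ksm.h"

/* sketch: one scan pass driven from userspace */
int run_scan_once(void)
{
	struct ksm_user_scan scan;
	int fd = open("/dev/ksm", O_RDWR);
	int scan_fd = ioctl(fd, KSM_CREATE_SCAN); /* returns a SCAN fd */

	scan.pages_to_scan = 256;	/* arbitrary example values */
	scan.max_pages_to_merge = 1024;
	return ioctl(scan_fd, KSM_SCAN, &scan);	/* scan registered regions */
}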

An example of how to control the kernel thread:


ksmctl.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include "ksm.h"

int main(int argc, char *argv[])
{
	int fd;
	int used = 0;
	int fd_start;
	struct ksm_kthread_info info;

	if (argc < 2) {
		fprintf(stderr, "usage: %s {start npages sleep | stop | info}\n",
			argv[0]);
		exit(1);
	}

	fd = open("/dev/ksm", O_RDWR | O_TRUNC, (mode_t)0600);
	if (fd == -1) {
		fprintf(stderr, "could not open /dev/ksm\n");
		exit(1);
	}

	if (!strncmp(argv[1], "start", strlen(argv[1]))) {
		used = 1;
		if (argc < 5) {
			fprintf(stderr, "usage: %s start npages_to_scan ",
				argv[0]);
			fprintf(stderr, "npages_max_merge sleep\n");
			exit(1);
		}
		info.pages_to_scan = atoi(argv[2]);
		info.max_pages_to_merge = atoi(argv[3]);
		info.sleep = atoi(argv[4]);
		info.running = 1;

		fd_start = ioctl(fd, KSM_START_STOP_KTHREAD, &info);
		if (fd_start == -1) {
			fprintf(stderr, "KSM_START_STOP_KTHREAD failed\n");
			exit(1);
		}
		printf("created scanner\n");
	}

	if (!strncmp(argv[1], "stop", strlen(argv[1]))) {
		used = 1;
		info.running = 0;
		fd_start = ioctl(fd, KSM_START_STOP_KTHREAD, &info);
		if (fd_start == -1) {
			fprintf(stderr, "KSM_START_STOP_KTHREAD failed\n");
			exit(1);
		}
		printf("stopped scanner\n");
	}

	if (!strncmp(argv[1], "info", strlen(argv[1]))) {
		used = 1;
		fd_start = ioctl(fd, KSM_GET_INFO_KTHREAD, &info);
		if (fd_start == -1) {
			fprintf(stderr, "KSM_GET_INFO_KTHREAD failed\n");
			exit(1);
		}
		printf("running %d, pages_to_scan %d pages_max_merge %d ",
		       info.running, info.pages_to_scan,
		       info.max_pages_to_merge);
		printf("sleep_time %d\n", info.sleep);
	}

	if (!used)
		fprintf(stderr, "unknown command %s\n", argv[1]);

	return 0;
}
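
A hypothetical invocation (the numbers are arbitrary example values;
per ksm.h the sleep value is in microseconds):

./ksmctl start 512 2048 1000	# scan 512 pages, merge up to 2048, sleep 1000us
./ksmctl info			# show the current scanner parameters
./ksmctl stop			# stop the kernel thread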


An example of how to register qemu (or any other userspace application) with KSM:

diff --git a/qemu/vl.c b/qemu/vl.c
index 4721fdd..7785bf9 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
+#include "ksm.h"
#include "hw/hw.h"
#include "hw/boards.h"
#include "hw/usb.h"
@@ -5799,6 +5800,37 @@ static void termsig_setup(void)

#endif

+int ksm_register_memory(void)
+{
+ int fd;
+ int ksm_fd;
+ int r = 1;
+ struct ksm_memory_region ksm_region;
+
+ fd = open("/dev/ksm", O_RDWR | O_TRUNC, (mode_t)0600);
+ if (fd == -1)
+ goto out;
+
+ ksm_fd = ioctl(fd, KSM_CREATE_SHARED_MEMORY_AREA);
+ if (ksm_fd == -1)
+ goto out_free;
+
+ ksm_region.npages = phys_ram_size / TARGET_PAGE_SIZE;
+ ksm_region.addr = phys_ram_base;
+ r = ioctl(ksm_fd, KSM_REGISTER_MEMORY_REGION, &ksm_region);
+ if (r)
+ goto out_free1;
+
+ return r;
+
+out_free1:
+ close(ksm_fd);
+out_free:
+ close(fd);
+out:
+ return r;
+}
+
int main(int argc, char **argv)
{
#ifdef CONFIG_GDBSTUB
@@ -6735,6 +6767,8 @@ int main(int argc, char **argv)
/* init the dynamic translator */
cpu_exec_init_all(tb_size * 1024 * 1024);

+ ksm_register_memory();
+
bdrv_init();

/* we always create the cdrom drive, even if no disk is there */
Izik Eidus
2008-11-11 13:21:39 UTC
From: Izik Eidus <***@qumranet.com>

This function is needed in cases where you want to change the
userspace virtual mapping to a different physical page;
KSM needs this for merging identical pages.

The function works by removing the oldpage from the rmap and
calling put_page() on it, and by setting the pte of the virtual
address to point to the new page.
(Note that the new page, i.e. the page the pte is changed to map to,
cannot be an anonymous page.)
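
For illustration, a caller could look like the following sketch
(hypothetical, not the patch's code: merge_one_page() is an invented
name, and clearing the write bit via _PAGE_RW is x86-specific):

/*
 * sketch: remap oldpage's pte to an existing shared, non-anonymous page,
 * read-only so that a later write is COWed through do_wp_page()
 */
static int merge_one_page(struct vm_area_struct *vma, struct page *oldpage,
			  struct page *shared_page, pte_t orig_pte)
{
	pgprot_t prot = vma->vm_page_prot;

	pgprot_val(prot) &= ~_PAGE_RW;	/* x86 write-permission bit */
	return replace_page(vma, oldpage, shared_page, orig_pte, prot);
}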

Signed-off-by: Izik Eidus <***@qumranet.com>
---
include/linux/mm.h | 3 ++
mm/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..4da7fa8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1207,6 +1207,9 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);

+int replace_page(struct vm_area_struct *vma, struct page *oldpage,
+ struct page *newpage, pte_t orig_pte, pgprot_t prot);
+
struct page *follow_page(struct vm_area_struct *, unsigned long address,
unsigned int foll_flags);
#define FOLL_WRITE 0x01 /* check pte is writable */
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..b2c542c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1472,6 +1472,74 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_mixed);

+/**
+ * replace_page - replace the pte mapping related to a vm area between two
+ * pages (from oldpage to newpage)
+ * NOTE: you should take into consideration the impact on the VM when replacing
+ * anonymous pages with kernel non-swappable pages.
+ */
+int replace_page(struct vm_area_struct *vma, struct page *oldpage,
+ struct page *newpage, pte_t orig_pte, pgprot_t prot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *ptep;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int ret;
+
+ BUG_ON(PageAnon(newpage));
+
+ ret = -EFAULT;
+ addr = page_address_in_vma(oldpage, vma);
+ if (addr == -EFAULT)
+ goto out;
+
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ goto out;
+
+ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!ptep)
+ goto out;
+
+ if (!pte_same(*ptep, orig_pte)) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ ret = 0;
+ get_page(newpage);
+ page_add_file_rmap(newpage);
+
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+ set_pte_at(mm, addr, ptep, mk_pte(newpage, prot));
+
+ page_remove_rmap(oldpage, vma);
+ if (PageAnon(oldpage)) {
+ dec_mm_counter(mm, anon_rss);
+ inc_mm_counter(mm, file_rss);
+ }
+ put_page(oldpage);
+
+ pte_unmap_unlock(ptep, ptl);
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(replace_page);
+
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
--
1.6.0.3

Izik Eidus
2008-11-11 13:21:41 UTC
From: Izik Eidus <***@qumranet.com>

This function is an optimization for KVM and other users of
mmu_notifiers for COW pages. It is useful for KVM when KSM is used,
because it allows KVM not to have to take a VMEXIT before mapping the
shared page into the MMU shadow pages; instead the page is mapped
directly at the same time Linux maps it into the host page table.

This mmu notifier macro works by calling a callback that maps the
physical page directly into the shadow page tables.

(Users of mmu_notifiers that did not implement the set_pte_at_notify()
callback will just receive the mmu_notifier_invalidate_page callback.)
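
As a sketch, a secondary-MMU driver would hook the new callback roughly
like this (the my_* names are hypothetical, not from the patch):

/* drivers that leave .change_pte unset keep getting invalidate_page */
static void my_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
			  unsigned long address, pte_t pte)
{
	/* retarget the secondary translation for 'address' to the new
	 * physical page instead of just dropping it */
}

static const struct mmu_notifier_ops my_notifier_ops = {
	.change_pte	= my_change_pte,
};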

Signed-off-by: Izik Eidus <***@qumranet.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 55 ++++++++++++++++++++++++++++++++++-----
include/linux/mmu_notifier.h | 33 +++++++++++++++++++++++
mm/memory.c | 12 ++++++--
mm/mmu_notifier.c | 20 ++++++++++++++
virt/kvm/kvm_main.c | 14 ++++++++++
6 files changed, 125 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 65679d0..a5d01d4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -748,5 +748,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);

#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 99c239c..652a51c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -663,7 +663,8 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
kvm_flush_remote_tlbs(kvm);
}

-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int need_tlb_flush = 0;
@@ -678,8 +679,41 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return need_tlb_flush;
}

+static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte, new_spte;
+ u64 *cur_spte;
+ pte_t *ptep = (pte_t *)data;
+ pte_t pte;
+ struct page *new_page;
+ struct page *old_page;
+
+ pte = *ptep;
+ new_page = pfn_to_page(pte_pfn(pte));
+ cur_spte = rmap_next(kvm, rmapp, NULL);
+ while (cur_spte) {
+ spte = cur_spte;
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+ new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte |= pte_pfn(pte);
+ if (!pte_write(pte))
+ new_spte &= ~PT_WRITABLE_MASK;
+ old_page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ get_page(new_page);
+ cur_spte = rmap_next(kvm, rmapp, spte);
+ set_shadow_pte(spte, new_spte);
+ kvm_flush_remote_tlbs(kvm);
+ put_page(old_page);
+ }
+ return 0;
+}
+
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp))
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data))
{
int i;
int retval = 0;
@@ -700,11 +734,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- retval |= handler(kvm, &memslot->rmap[gfn_offset]);
+ retval |= handler(kvm, &memslot->rmap[gfn_offset], data);
retval |= handler(kvm,
&memslot->lpage_info[
gfn_offset /
- KVM_PAGES_PER_HPAGE].rmap_pde);
+ KVM_PAGES_PER_HPAGE].rmap_pde,
+ data);
}
}

@@ -713,10 +748,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,

int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}

-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
{
u64 *spte;
int young = 0;
@@ -742,7 +783,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)

int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}

#ifdef MMU_DEBUG
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b77486d..c2effe2 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -61,6 +61,15 @@ struct mmu_notifier_ops {
struct mm_struct *mm,
unsigned long address);

+ /*
+ * change_pte is called in cases where the pte mapping of a page is
+ * changed, for example when ksm remaps a pte to point to a new shared page.
+ */
+ void (*change_pte)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte);
+
/*
* Before this is invoked any secondary MMU is still ok to
* read/write to the page previously pointed to by the Linux
@@ -154,6 +163,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long address);
+extern void __mmu_notifier_change_pte(struct mm_struct *mm,
+ unsigned long address, pte_t pte);
extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -175,6 +186,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}

+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+ unsigned long address, pte_t pte)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_change_pte(mm, address, pte);
+}
+
static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address)
{
@@ -236,6 +254,16 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
__young; \
})

+#define set_pte_at_notify(__mm, __address, __ptep, __pte) \
+({ \
+ struct mm_struct *___mm = __mm; \
+ unsigned long ___address = __address; \
+ pte_t ___pte = __pte; \
+ \
+ set_pte_at(__mm, __address, __ptep, ___pte); \
+ mmu_notifier_change_pte(___mm, ___address, ___pte); \
+})
+
#else /* CONFIG_MMU_NOTIFIER */

static inline void mmu_notifier_release(struct mm_struct *mm)
@@ -248,6 +276,11 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
return 0;
}

+static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+ unsigned long address, pte_t pte)
+{
+}
+
static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address)
{
diff --git a/mm/memory.c b/mm/memory.c
index b2c542c..374d695 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1524,7 +1524,7 @@ int replace_page(struct vm_area_struct *vma, struct page *oldpage,

flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
- set_pte_at(mm, addr, ptep, mk_pte(newpage, prot));
+ set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));

page_remove_rmap(oldpage, vma);
if (PageAnon(oldpage)) {
@@ -1981,13 +1981,19 @@ gotten:
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, address, page_table);
+ ptep_clear_flush(vma, address, page_table);
SetPageSwapBacked(new_page);
lru_cache_add_active_or_unevictable(new_page, vma);
page_add_new_anon_rmap(new_page, vma, address);

//TODO: is this safe? do_anonymous_page() does it this way.
- set_pte_at(mm, address, page_table, entry);
+ /*
+ * we call the notify macro here because, when using a
+ * secondary mmu page table such as the kvm shadow page tables,
+ * we want the new page to be mapped directly into the secondary
+ * page table.
+ */
+ set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, entry);
if (old_page) {
/*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5f4ef02..c3e8779 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}

+void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->change_pte)
+ mn->ops->change_pte(mn, mm, address, pte);
+ /*
+ * some drivers don't have change_pte and therefore we must
+ * call invalidate_page in that case
+ */
+ else if (mn->ops->invalidate_page)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ rcu_read_unlock();
+}
+
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cf0ab8e..00c12c4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -482,6 +482,19 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,

}

+static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+
+ spin_lock(&kvm->mmu_lock);
+ kvm->mmu_notifier_seq++;
+ kvm_set_spte_hva(kvm, address, pte);
+ spin_unlock(&kvm->mmu_lock);
+}
+
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
@@ -554,6 +567,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
+ .change_pte = kvm_mmu_notifier_change_pte,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
--
1.6.0.3

Izik Eidus
2008-11-11 13:21:40 UTC
From: Izik Eidus <***@qumranet.com>

KSM is a driver that allows merging identical pages between one or more
applications, in a way invisible to the applications that use it.
Pages that are merged are marked read-only and are COWed when any
application tries to change them.

KSM works by walking over the memory pages of the applications it scans
in order to find identical pages.
It uses a hash table to find the identical pages efficiently.

When KSM finds two identical pages, it marks them read-only and merges
them into a single page. After that, Linux treats these pages as normal
copy-on-write pages and will copy them when a write access happens.

KSM scans just the memory areas that were registered to be scanned by it.
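
Conceptually the lookup works like the following sketch (illustrative
only; the real table layout, locking and collision handling are in
mm/ksm.c below):

/* sketch: hash the full page contents into a bucket of the page hash
 * table; 17 is an arbitrary seed */
static u32 page_hash_bucket(const void *page_addr)
{
	return jhash(page_addr, PAGE_SIZE, 17) % npages_hash;
}

Candidates found in a bucket still have to be compared byte-for-byte
(a memcmp() over PAGE_SIZE) before merging, since equal hash values
alone do not prove the pages are identical.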

Signed-off-by: Izik Eidus <***@qumranet.com>
---
drivers/Kconfig | 5 +
include/linux/ksm.h | 53 ++
include/linux/miscdevice.h | 1 +
mm/Kconfig | 3 +
mm/Makefile | 1 +
mm/ksm.c | 1202 ++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 1265 insertions(+), 0 deletions(-)
create mode 100644 include/linux/ksm.h
create mode 100644 mm/ksm.c

diff --git a/drivers/Kconfig b/drivers/Kconfig
index d38f43f..c1c701f 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -105,4 +105,9 @@ source "drivers/uio/Kconfig"
source "drivers/xen/Kconfig"

source "drivers/staging/Kconfig"
+
+config KSM
+ bool "KSM driver support"
+ help
+ ksm is a driver for merging identical pages between applications
endmenu
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
new file mode 100644
index 0000000..f873502
--- /dev/null
+++ b/include/linux/ksm.h
@@ -0,0 +1,53 @@
+#ifndef __LINUX_KSM_H
+#define __LINUX_KSM_H
+
+/*
+ * Userspace interface for /dev/ksm - kvm shared memory
+ */
+
+#include <asm/types.h>
+#include <linux/ioctl.h>
+
+#define KSM_API_VERSION 1
+
+/* for KSM_REGISTER_MEMORY_REGION */
+struct ksm_memory_region {
+ __u32 npages; /* number of pages to share */
+ __u32 pad;
+ __u64 addr; /* the beginning of the virtual address range */
+};
+
+struct ksm_user_scan {
+ __u32 pages_to_scan;
+ __u32 max_pages_to_merge;
+};
+
+struct ksm_kthread_info {
+ __u32 sleep; /* number of microseconds to sleep */
+ __u32 pages_to_scan; /* number of pages to scan */
+ __u32 max_pages_to_merge;
+ __u32 running;
+};
+
+#define KSMIO 0xAB
+
+/* ioctls for /dev/ksm */
+#define KSM_GET_API_VERSION _IO(KSMIO, 0x00)
+#define KSM_CREATE_SHARED_MEMORY_AREA _IO(KSMIO, 0x01) /* return SMA fd */
+#define KSM_CREATE_SCAN _IO(KSMIO, 0x02) /* return SCAN fd */
+#define KSM_START_STOP_KTHREAD _IOW(KSMIO, 0x03,\
+ struct ksm_kthread_info)
+#define KSM_GET_INFO_KTHREAD _IOW(KSMIO, 0x04,\
+ struct ksm_kthread_info)
+
+
+/* ioctls for SMA fds */
+#define KSM_REGISTER_MEMORY_REGION _IOW(KSMIO, 0x20,\
+ struct ksm_memory_region)
+#define KSM_REMOVE_MEMORY_REGION _IO(KSMIO, 0x21)
+
+/* ioctls for SCAN fds */
+#define KSM_SCAN _IOW(KSMIO, 0x40,\
+ struct ksm_user_scan)
+
+#endif
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 26433ec..adc2435 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -30,6 +30,7 @@
#define TUN_MINOR 200
#define HPET_MINOR 228
#define KVM_MINOR 232
+#define KSM_MINOR 233

struct device;

diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f..e7f0061 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -222,3 +222,6 @@ config UNEVICTABLE_LRU

config MMU_NOTIFIER
bool
+
+config KSM
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index c06b45a..9722afe 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/ksm.c b/mm/ksm.c
new file mode 100644
index 0000000..977eb37
--- /dev/null
+++ b/mm/ksm.c
@@ -0,0 +1,1202 @@
+/*
+ * Memory merging driver for Linux
+ *
+ * This module enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork()
+ *
+ * Copyright (C) 2008 Red Hat, Inc.
+ * Authors:
+ * Izik Eidus
+ * Andrea Arcangeli
+ * Chris Wright
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/anon_inodes.h>
+#include <linux/ksm.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/random.h>
+#include <crypto/sha.h>
+
+#include <asm/tlbflush.h>
+
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+static int page_hash_size;
+module_param(page_hash_size, int, 0);
+MODULE_PARM_DESC(page_hash_size, "Hash table size for the pages checksum");
+
+static int rmap_hash_size;
+module_param(rmap_hash_size, int, 0);
+MODULE_PARM_DESC(rmap_hash_size, "Hash table size for the reverse mapping");
+
+static int sha1_hash_size;
+module_param(sha1_hash_size, int, 0);
+MODULE_PARM_DESC(sha1_hash_size, "Hash table size for the sha1 caching");
+
+struct ksm_mem_slot {
+ struct list_head link;
+ struct list_head sma_link;
+ struct mm_struct *mm;
+ unsigned long addr; /* the beginning of the virtual address range */
+ int npages; /* number of pages to share */
+};
+
+/*
+ * sma - shared memory area; each process has its own sma that contains the
+ * information about the slots that it owns
+ */
+struct ksm_sma {
+ struct list_head sma_slots;
+};
+
+struct ksm_scan {
+ struct ksm_mem_slot *slot_index; /* the slot we are scanning now */
+ int page_index; /* the page inside sma that is now being scanned */
+};
+
+struct page_hash_item {
+ struct hlist_node link;
+ struct mm_struct *mm;
+ unsigned long addr;
+};
+
+struct rmap_item {
+ struct hlist_node link;
+ struct page_hash_item *page_hash_item;
+ unsigned long oldindex;
+};
+
+struct sha1_item {
+ unsigned char sha1val[SHA1_DIGEST_SIZE];
+ unsigned long pfn;
+};
+
+static struct list_head slots;
+static struct rw_semaphore slots_lock;
+
+static DEFINE_MUTEX(sha1_lock);
+
+static int npages_hash;
+static struct hlist_head *page_hash_items;
+static int nrmaps_hash;
+static struct hlist_head *rmap_hash;
+static int nsha1s_hash;
+static struct sha1_item *sha1_hash;
+
+static struct kmem_cache *page_hash_item_cache;
+static struct kmem_cache *rmap_item_cache;
+
+static int kthread_sleep;
+static int kthread_pages_to_scan;
+static int kthread_max_npages;
+static struct ksm_scan kthread_ksm_scan;
+static int kthread_run;
+static struct task_struct *kthread;
+static wait_queue_head_t kthread_wait;
+static struct rw_semaphore kthread_lock;
+static struct crypto_hash *tfm;
+static unsigned char hmac_ke