Discussion:
[RFC 00/15] x86_64: Optimize percpu accesses
(too old to reply)
Mike Travis
2008-07-09 16:51:29 UTC
Permalink
This patchset provides the following:

* Cleanup: Fix early references to cpumask_of_cpu(0)

Provides an early cpumask_of_cpu(0) usable before the cpumask_of_cpu_map
is allocated and initialized.

* Generic: Percpu infrastructure to rebase the per cpu area to zero

This provides for the capability of accessing the percpu variables
using a local register instead of having to go through a table
on node 0 to find the cpu-specific offsets. It also would allow
atomic operations on percpu variables to reduce required locking.
Uses a new config var HAVE_ZERO_BASED_PER_CPU to indicate to the
generic code that the arch has this new basing.

(Note: split into two patches, one to rebase percpu variables at 0,
and the second to actually use %gs as the base for percpu variables.)

* x86_64: Fold pda into per cpu area

Declare the pda as a per cpu variable. This will move the pda
area to an address accessible by the x86_64 per cpu macros.
Subtraction of __per_cpu_start will make the offset based from
the beginning of the per cpu area. Since %gs is pointing to the
pda, it will then also point to the per cpu variables and can be
accessed thusly:

%gs:[&per_cpu_xxxx - __per_cpu_start]

* x86_64: Rebase per cpu variables to zero

Take advantage of the zero-based per cpu area provided above.
Then we can directly use the x86_32 percpu operations. x86_32
offsets %fs by __per_cpu_start. x86_64 has %gs pointing directly
to the pda and the per cpu area thereby allowing access to the
pda with the x86_64 pda operations and access to the per cpu
variables using x86_32 percpu operations.


Based on linux-2.6.tip/master with following patches applied:

[PATCH 1/1] x86: Add check for node passed to node_to_cpumask V3
[PATCH 1/1] x86: Change _node_to_cpumask_ptr to return const ptr
[PATCH 1/1] sched: Reduce stack size in isolated_cpu_setup()
[PATCH 1/1] kthread: Reduce stack pressure in create_kthread and kthreadd


Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---

--
Mike Travis
2008-07-09 16:51:30 UTC
Permalink
* Initialize the cpumask_of_cpu_map to contain a cpumask for cpu 0
in the initdata section. This allows references before the real
cpumask_of_cpu_map is setup avoiding possible null pointer deref
panics.

* Ruggedize some other calls to prevent mishaps from early calls,
particularly in non-critical functions:

* Cleanup DEBUG_PER_CPU_MAPS usages and some comments.

Based on linux-2.6.tip/master

Signed-off-by: Mike Travis <***@sgi.com>
---
arch/x86/kernel/setup_percpu.c | 73 ++++++++++++++++++++++++++++-------------
1 file changed, 51 insertions(+), 22 deletions(-)

--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -15,6 +15,12 @@
#include <asm/apicdef.h>
#include <asm/highmem.h>

+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int num_processors;
unsigned disabled_cpus __cpuinitdata;
@@ -27,31 +33,39 @@ EXPORT_SYMBOL(boot_cpu_physical_apicid);
physid_mask_t phys_cpu_present_map;
#endif

-/* map cpu index to physical APIC ID */
+/*
+ * Map cpu index to physical APIC ID
+ */
DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);

#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define X86_64_NUMA 1
+#define X86_64_NUMA 1 /* (used later) */

-/* map cpu index to node index */
+/*
+ * Map cpu index to node index
+ */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

-/* which logical CPUs are on which nodes */
+/*
+ * Which logical CPUs are on which nodes
+ */
cpumask_t *node_to_cpumask_map;
EXPORT_SYMBOL(node_to_cpumask_map);

-/* setup node_to_cpumask_map */
+/*
+ * Setup node_to_cpumask_map
+ */
static void __init setup_node_to_cpumask_map(void);

#else
static inline void setup_node_to_cpumask_map(void) { }
#endif

-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_SMP)
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
* Copy data used in early init routines from the initial arrays to the
* per cpu data areas. These arrays then become expendable and the
@@ -81,16 +95,25 @@ static void __init setup_per_cpu_maps(vo
}

#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
+/*
+ * Configure an initial cpumask_of_cpu(0) for early users
+ */
+static cpumask_t initial_cpumask_of_cpu_map __initdata = (cpumask_t) { {
+ [BITS_TO_LONGS(NR_CPUS)-1] = 1
+} };
+cpumask_t *cpumask_of_cpu_map __read_mostly =
+ (cpumask_t *)&initial_cpumask_of_cpu_map;
EXPORT_SYMBOL(cpumask_of_cpu_map);

-/* requires nr_cpu_ids to be initialized */
+/* Requires nr_cpu_ids to be initialized. */
static void __init setup_cpumask_of_cpu(void)
{
int i;

/* alloc_bootmem zeroes memory */
cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+ DBG("cpumask_of_cpu_map %p\n", cpumask_of_cpu_map);
+
for (i = 0; i < nr_cpu_ids; i++)
cpu_set(i, cpumask_of_cpu_map[i]);
}
@@ -197,9 +220,10 @@ void __init setup_per_cpu_areas(void)
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);

+ DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
}

- printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+ printk(KERN_INFO "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
NR_CPUS, nr_cpu_ids, nr_node_ids);

/* Setup percpu data maps */
@@ -221,6 +245,7 @@ void __init setup_per_cpu_areas(void)
* Requires node_possible_map to be valid.
*
* Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
static void __init setup_node_to_cpumask_map(void)
{
@@ -236,9 +261,7 @@ static void __init setup_node_to_cpumask

/* allocate the map */
map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
-
- Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
- map, nr_node_ids);
+ DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);

/* node_to_cpumask() will now work */
node_to_cpumask_map = map;
@@ -248,17 +271,23 @@ void __cpuinit numa_set_node(int cpu, in
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

- if (cpu_pda(cpu) && node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
-
- if (cpu_to_node_map)
+ /* early setting, no percpu area yet */
+ if (cpu_to_node_map) {
cpu_to_node_map[cpu] = node;
+ return;
+ }

- else if (per_cpu_offset(cpu))
- per_cpu(x86_cpu_to_node_map, cpu) = node;
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+ if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+ printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+ dump_stack();
+ return;
+ }
+#endif
+ per_cpu(x86_cpu_to_node_map, cpu) = node;

- else
- Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+ if (node != NUMA_NO_NODE)
+ cpu_pda(cpu)->nodenumber = node;
}

void __cpuinit numa_clear_node(int cpu)
@@ -275,7 +304,7 @@ void __cpuinit numa_add_cpu(int cpu)

void __cpuinit numa_remove_cpu(int cpu)
{
- cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+ cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

#else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -285,7 +314,7 @@ void __cpuinit numa_remove_cpu(int cpu)
*/
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = cpu_to_node(cpu);
+ int node = early_cpu_to_node(cpu);
cpumask_t *mask;
char buf[64];


--
Mike Travis
2008-07-09 16:51:31 UTC
Permalink
WARNING: there is still a FIXME in this patch (see arch/x86/kernel/acpi/sleep.c)

* Declare the pda as a per cpu variable.

* Make the x86_64 per cpu area start at zero.

* Relocate the initial pda and per_cpu(gdt_page) in head_64.S for the
boot cpu (0). For secondary cpus, do_boot_cpu() sets up the correct
initial pda and gdt_page pointer.

* Initialize per_cpu_offset to point to static pda in the per_cpu area
(@ __per_cpu_load).

* After allocation of the per cpu area for the boot cpu (0), reload the
gdt page pointer.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
arch/x86/Kconfig | 3 +
arch/x86/kernel/acpi/sleep.c | 9 +++
arch/x86/kernel/cpu/common_64.c | 4 -
arch/x86/kernel/head64.c | 24 +--------
arch/x86/kernel/head_64.S | 45 ++++++++++++++++--
arch/x86/kernel/setup_percpu.c | 94 +++++++++++++++++----------------------
arch/x86/kernel/smpboot.c | 52 ---------------------
arch/x86/kernel/vmlinux_64.lds.S | 1
include/asm-x86/desc.h | 5 ++
include/asm-x86/pda.h | 3 -
include/asm-x86/percpu.h | 13 -----
include/asm-x86/trampoline.h | 1
12 files changed, 112 insertions(+), 142 deletions(-)

--- linux-2.6.tip.orig/arch/x86/Kconfig
+++ linux-2.6.tip/arch/x86/Kconfig
@@ -129,6 +129,9 @@ config HAVE_SETUP_PER_CPU_AREA
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP

+config HAVE_ZERO_BASED_PER_CPU
+ def_bool X86_64_SMP
+
config ARCH_HIBERNATION_POSSIBLE
def_bool y
depends on !SMP || !X86_VOYAGER
--- linux-2.6.tip.orig/arch/x86/kernel/acpi/sleep.c
+++ linux-2.6.tip/arch/x86/kernel/acpi/sleep.c
@@ -89,6 +89,15 @@ int acpi_save_state_mem(void)
#ifdef CONFIG_SMP
stack_start.sp = temp_stack + 4096;
#endif
+ /*
+ * FIXME: with zero-based percpu variables, the pda and gdt_page
+ * addresses must be offset by the base of this cpu's percpu area.
+ * Where/how should we do this?
+ *
+ * for secondary cpu startup in smpboot.c:do_boot_cpu() this is done:
+ * early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ * initial_pda = (unsigned long)get_cpu_pda(cpu);
+ */
initial_code = (unsigned long)wakeup_long64;
saved_magic = 0x123456789abcdef0;
#endif /* CONFIG_64BIT */
--- linux-2.6.tip.orig/arch/x86/kernel/cpu/common_64.c
+++ linux-2.6.tip/arch/x86/kernel/cpu/common_64.c
@@ -423,8 +423,8 @@ __setup("clearcpuid=", setup_disablecpui

cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;

-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_FIRST(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);

struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };

--- linux-2.6.tip.orig/arch/x86/kernel/head64.c
+++ linux-2.6.tip/arch/x86/kernel/head64.c
@@ -25,20 +25,6 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
static void __init zap_identity_mappings(void)
{
pgd_t *pgd = pgd_offset_k(0UL);
@@ -91,6 +77,10 @@ void __init x86_64_start_kernel(char * r
/* Cleanup the over mapped high alias */
cleanup_highmap();

+ /* Initialize boot cpu_pda data */
+ /* (See head_64.S for earlier pda/gdt initialization) */
+ pda_init(0);
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
@@ -102,12 +92,6 @@ void __init x86_64_start_kernel(char * r

early_printk("Kernel alive\n");

- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
-
- early_printk("Kernel really alive\n");
-
copy_bootdata(__va(real_mode_data));

reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
--- linux-2.6.tip.orig/arch/x86/kernel/head_64.S
+++ linux-2.6.tip/arch/x86/kernel/head_64.S
@@ -12,6 +12,7 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
+#include <asm/asm-offsets.h>
#include <asm/desc.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
@@ -203,7 +204,27 @@ ENTRY(secondary_startup_64)
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt early_gdt_descr(%rip)
+
+#ifdef CONFIG_SMP
+ /*
+ * For zero-based percpu variables, the base (__per_cpu_load) must
+ * be added to the offset of per_cpu__gdt_page. This is only needed
+ * for the boot cpu but we can't do this prior to secondary_startup_64.
+ * So we use a NULL gdt adrs to indicate that we are starting up the
+ * boot cpu and not the secondary cpus. do_boot_cpu() will fixup
+ * the gdt adrs for those cpus.
+ */
+#define PER_CPU_GDT_PAGE 0
+ movq early_gdt_descr_base(%rip), %rax
+ testq %rax, %rax
+ jnz 1f
+ movq $__per_cpu_load, %rax
+ addq $per_cpu__gdt_page, %rax
+ movq %rax, early_gdt_descr_base(%rip)
+#else
+#define PER_CPU_GDT_PAGE per_cpu__gdt_page
+#endif
+1: lgdt early_gdt_descr(%rip)

/* set up data segments. actually 0 would do too */
movl $__KERNEL_DS,%eax
@@ -220,14 +241,21 @@ ENTRY(secondary_startup_64)
movl %eax,%gs

/*
- * Setup up a dummy PDA. this is just for some early bootup code
- * that does in_interrupt()
+ * Setup up the real PDA.
+ *
+ * For SMP, the boot cpu (0) uses the static pda which is the first
+ * element in the percpu area (@__per_cpu_load). This pda is moved
+ * to the real percpu area once that is allocated. Secondary cpus
+ * will use the initial_pda value setup in do_boot_cpu().
*/
movl $MSR_GS_BASE,%ecx
- movq $empty_zero_page,%rax
+ movq initial_pda(%rip), %rax
movq %rax,%rdx
shrq $32,%rdx
wrmsr
+#ifdef CONFIG_SMP
+ movq %rax, %gs:pda_data_offset
+#endif

/* esi is pointer to real mode structure with interesting info.
pass it to C */
@@ -250,6 +278,12 @@ ENTRY(secondary_startup_64)
.align 8
ENTRY(initial_code)
.quad x86_64_start_kernel
+ ENTRY(initial_pda)
+#ifdef CONFIG_SMP
+ .quad __per_cpu_load # Overwritten for secondary CPUs
+#else
+ .quad per_cpu__pda
+#endif
__FINITDATA

ENTRY(stack_start)
@@ -394,7 +428,8 @@ NEXT_PAGE(level2_spare_pgt)
.globl early_gdt_descr
early_gdt_descr:
.word GDT_ENTRIES*8-1
- .quad per_cpu__gdt_page
+early_gdt_descr_base:
+ .quad PER_CPU_GDT_PAGE # Overwritten for secondary CPUs

ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -14,6 +14,7 @@
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
+#include <asm/desc.h>

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
@@ -119,63 +120,26 @@ static void __init setup_cpumask_of_cpu(
static inline void setup_cpumask_of_cpu(void) { }
#endif

-#ifdef CONFIG_X86_32
/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+ * Pointers to per cpu areas for each cpu
*/
+#ifndef CONFIG_HAVE_ZERO_BASED_PER_CPU
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
+#else

/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
+ * Initialize percpu offset for boot cpu (0) to static percpu area
+ * for referencing very early in kernel startup.
*/
-static void __init setup_cpu_pda_map(void)
-{
- char *pda;
- struct x8664_pda **new_cpu_pda;
- unsigned long size;
- int cpu;
-
- size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
- /* allocate cpu_pda array and pointer table */
- {
- unsigned long tsize = nr_cpu_ids * sizeof(void *);
- unsigned long asize = size * (nr_cpu_ids - 1);
-
- tsize = roundup(tsize, cache_line_size());
- new_cpu_pda = alloc_bootmem(tsize + asize);
- pda = (char *)new_cpu_pda + tsize;
- }
-
- /* initialize pointer table to static pda's */
- for_each_possible_cpu(cpu) {
- if (cpu == 0) {
- /* leave boot cpu pda in place */
- new_cpu_pda[0] = cpu_pda(0);
- continue;
- }
- new_cpu_pda[cpu] = (struct x8664_pda *)pda;
- new_cpu_pda[cpu]->in_bootmem = 1;
- pda += size;
- }
-
- /* point to new pointer table */
- _cpu_pda = new_cpu_pda;
-}
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+ [0] = (unsigned long)__per_cpu_load
+};
+EXPORT_SYMBOL(__per_cpu_offset);
#endif

/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * Allocate and initialize the per cpu areas which include the PDAs.
*/
void __init setup_per_cpu_areas(void)
{
@@ -193,9 +157,6 @@ void __init setup_per_cpu_areas(void)
nr_cpu_ids = num_processors;
#endif

- /* Setup cpu_pda map */
- setup_cpu_pda_map();
-
/* Copy section for each CPU (we discard the original) */
size = PERCPU_ENOUGH_ROOM;
printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
@@ -215,10 +176,41 @@ void __init setup_per_cpu_areas(void)
else
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
#endif
+ /* Initialize each cpu's per_cpu area and save pointer */
+ memcpy(ptr, __per_cpu_load, __per_cpu_size);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);

DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Note the boot cpu (0) has been using the static per_cpu load
+ * area for it's pda. We need to zero out the pdas for the
+ * other cpus that are coming online.
+ *
+ * Additionally, for the boot cpu the gdt page must be reloaded
+ * as we moved it from the static per cpu area to the newly
+ * allocated area.
+ */
+ {
+ /* We rely on the fact that pda is the first element */
+ struct x8664_pda *pda = (struct x8664_pda *)ptr;
+
+ if (cpu) {
+ memset(pda, 0, sizeof(*pda));
+ pda->data_offset = (unsigned long)ptr;
+ } else {
+ struct desc_ptr gdt_descr = early_gdt_descr;
+
+ pda->data_offset = (unsigned long)ptr;
+ gdt_descr.address =
+ (unsigned long)get_cpu_gdt_table(0);
+ native_load_gdt(&gdt_descr);
+ pda_init(0);
+ }
+
+ }
+#endif
}

printk(KERN_INFO "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
--- linux-2.6.tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6.tip/arch/x86/kernel/smpboot.c
@@ -762,45 +762,6 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

-#ifdef CONFIG_X86_64
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-static int __cpuinit get_local_pda(int cpu)
-{
- struct x8664_pda *oldpda, *newpda;
- unsigned long size = sizeof(struct x8664_pda);
- int node = cpu_to_node(cpu);
-
- if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
- return 0;
-
- oldpda = cpu_pda(cpu);
- newpda = kmalloc_node(size, GFP_ATOMIC, node);
- if (!newpda) {
- printk(KERN_ERR "Could not allocate node local PDA "
- "for CPU %d on node %d\n", cpu, node);
-
- if (oldpda)
- return 0; /* have a usable pda */
- else
- return -1;
- }
-
- if (oldpda) {
- memcpy(newpda, oldpda, size);
- if (!after_bootmem)
- free_bootmem((unsigned long)oldpda, size);
- }
-
- newpda->in_bootmem = 0;
- cpu_pda(cpu) = newpda;
- return 0;
-}
-#endif /* CONFIG_X86_64 */
-
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -818,16 +779,6 @@ static int __cpuinit do_boot_cpu(int api
};
INIT_WORK(&c_idle.work, do_fork_idle);

-#ifdef CONFIG_X86_64
- /* Allocate node local memory for AP pdas */
- if (cpu > 0) {
- boot_error = get_local_pda(cpu);
- if (boot_error)
- goto restore_state;
- /* if can't get pda memory, can't start cpu */
- }
-#endif
-
alternatives_smp_switch(1);

c_idle.idle = get_idle_for_cpu(cpu);
@@ -865,6 +816,7 @@ do_rest:
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+ initial_pda = (unsigned long)get_cpu_pda(cpu);
#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
@@ -940,8 +892,6 @@ do_rest:
}
}

-restore_state:
-
if (boot_error) {
/* Try to put things back the way they were before ... */
numa_remove_cpu(cpu); /* was set by numa_add_cpu */
--- linux-2.6.tip.orig/arch/x86/kernel/vmlinux_64.lds.S
+++ linux-2.6.tip/arch/x86/kernel/vmlinux_64.lds.S
@@ -16,6 +16,7 @@ jiffies_64 = jiffies;
_proxy_pda = 1;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
+ percpu PT_LOAD FLAGS(7); /* RWE */
data PT_LOAD FLAGS(7); /* RWE */
user PT_LOAD FLAGS(7); /* RWE */
data.init PT_LOAD FLAGS(7); /* RWE */
--- linux-2.6.tip.orig/include/asm-x86/desc.h
+++ linux-2.6.tip/include/asm-x86/desc.h
@@ -41,6 +41,11 @@ static inline struct desc_struct *get_cp

#ifdef CONFIG_X86_64

+static inline struct x8664_pda *get_cpu_pda(unsigned int cpu)
+{
+ return &per_cpu(pda, cpu);
+}
+
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
unsigned dpl, unsigned ist, unsigned seg)
{
--- linux-2.6.tip.orig/include/asm-x86/pda.h
+++ linux-2.6.tip/include/asm-x86/pda.h
@@ -37,10 +37,9 @@ struct x8664_pda {
unsigned irq_spurious_count;
} ____cacheline_aligned_in_smp;

-extern struct x8664_pda **_cpu_pda;
extern void pda_init(int);

-#define cpu_pda(i) (_cpu_pda[i])
+#define cpu_pda(cpu) (&per_cpu(pda, cpu))

/*
* There is no fast way to get the base address of the PDA, all the accesses
--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -3,20 +3,11 @@

#ifdef CONFIG_X86_64
#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
- in the PDA. Longer term the PDA and every per cpu variable
- should be just put into a single section and referenced directly
- from %gs */
-
-#ifdef CONFIG_SMP
#include <asm/pda.h>

-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+/* Same as asm-generic/percpu.h */
+#ifdef CONFIG_SMP
#define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
#endif
#include <asm-generic/percpu.h>

--- linux-2.6.tip.orig/include/asm-x86/trampoline.h
+++ linux-2.6.tip/include/asm-x86/trampoline.h
@@ -12,6 +12,7 @@ extern unsigned char *trampoline_base;

extern unsigned long init_rsp;
extern unsigned long initial_code;
+extern unsigned long initial_pda;

#define TRAMPOLINE_BASE 0x6000
extern unsigned long setup_trampoline(void);

--
Eric W. Biederman
2008-07-09 22:02:45 UTC
Permalink
Post by Mike Travis
WARNING: there is still a FIXME in this patch (see arch/x86/kernel/acpi/sleep.c)
* Declare the pda as a per cpu variable.
* Make the x86_64 per cpu area start at zero.
* Relocate the initial pda and per_cpu(gdt_page) in head_64.S for the
boot cpu (0). For secondary cpus, do_boot_cpu() sets up the correct
initial pda and gdt_page pointer.
* Initialize per_cpu_offset to point to static pda in the per_cpu area
* After allocation of the per cpu area for the boot cpu (0), reload the
gdt page pointer.
Based on linux-2.6.tip/master
Given that we have not yet understood the weird failure case, this patch
needs to be split in two.
- make the current per cpu variable section zero based.
- Move the pda into the per cpu variable section.

There are too many variables at present in the reported failure cases to
guess what is really going on.

We can not optimize the per cpu variable accesses until the pda moves
but we can easily test for linker and tool chain bugs with zero
based pda segment itself.

Eric
Mike Travis
2008-07-09 16:51:32 UTC
Permalink
* Since %gs is pointing to the pda, it will then also point to the per cpu
variables and can be accessed thusly:

%gs:[&per_cpu_xxxx - __per_cpu_start]

... and since __per_cpu_start == 0 then:

%gs:&per_cpu_var(xxx)

becomes the optimized effective address.


Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/percpu.h | 36 +++++++++++++-----------------------
1 file changed, 13 insertions(+), 23 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -5,15 +5,19 @@
#include <linux/compiler.h>
#include <asm/pda.h>

-/* Same as asm-generic/percpu.h */
+/* Same as asm-generic/percpu.h, except we use %gs as a segment offset. */
#ifdef CONFIG_SMP
#define __my_cpu_offset read_pda(data_offset)
+#define __percpu_seg "%%gs:"
+#else
+#define __percpu_seg ""
#endif
+
#include <asm-generic/percpu.h>

DECLARE_PER_CPU(struct x8664_pda, pda);

-#else /* CONFIG_X86_64 */
+#else /* !CONFIG_X86_64 */

#ifdef __ASSEMBLY__

@@ -42,36 +46,23 @@ DECLARE_PER_CPU(struct x8664_pda, pda);

#else /* ...!ASSEMBLY */

-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- * var - variable name
- * cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- * PER_CPU(cpu_gdt_descr, %ebx)
- */
#ifdef CONFIG_SMP
-
#define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
#define __percpu_seg "%%fs:"
-
-#else /* !SMP */
-
+#else
#define __percpu_seg ""
-
-#endif /* SMP */
+#endif

#include <asm-generic/percpu.h>

/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);

+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
/* For arch-specific code, we can use direct single-insn ops (they
* don't give an lvalue though). */
extern void __bad_percpu_size(void);
@@ -206,7 +197,6 @@ do { \
percpu_cmpxchg_op(per_cpu_var(var), old, new)

#endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */

#ifdef CONFIG_SMP


--
Mike Travis
2008-07-09 16:51:36 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/hardirq_64.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/hardirq_64.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/hardirq_64.h 2008-07-01 10:49:14.000299503 -0700
@@ -11,12 +11,12 @@

#define __ARCH_IRQ_STAT 1

-#define local_softirq_pending() read_pda(__softirq_pending)
+#define local_softirq_pending() x86_read_percpu(pda.__softirq_pending)

#define __ARCH_SET_SOFTIRQ_PENDING 1

-#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
-#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
+#define set_softirq_pending(x) x86_write_percpu(pda.__softirq_pending, (x))
+#define or_softirq_pending(x) x86_or_percpu(pda.__softirq_pending, (x))

extern void ack_bad_irq(unsigned int irq);


--
Mike Travis
2008-07-09 16:51:33 UTC
Permalink
* Replace cpu_pda(i) references with percpu(pda, i).

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
arch/x86/kernel/cpu/common_64.c | 2 +-
arch/x86/kernel/irq_64.c | 36 ++++++++++++++++++++----------------
arch/x86/kernel/nmi.c | 2 +-
arch/x86/kernel/setup_percpu.c | 2 +-
arch/x86/kernel/smpboot.c | 2 +-
arch/x86/kernel/traps_64.c | 11 +++++++----
6 files changed, 31 insertions(+), 24 deletions(-)

--- linux-2.6.tip.orig/arch/x86/kernel/cpu/common_64.c
+++ linux-2.6.tip/arch/x86/kernel/cpu/common_64.c
@@ -477,7 +477,7 @@ __setup("noexec32=", nonx32_setup);

void pda_init(int cpu)
{
- struct x8664_pda *pda = cpu_pda(cpu);
+ struct x8664_pda *pda = &per_cpu(pda, cpu);

/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
--- linux-2.6.tip.orig/arch/x86/kernel/irq_64.c
+++ linux-2.6.tip/arch/x86/kernel/irq_64.c
@@ -115,39 +115,43 @@ skip:
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
+ seq_printf(p, "%10u ", per_cpu(pda.__nmi_count, j));
seq_printf(p, " Non-maskable interrupts\n");
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
+ seq_printf(p, "%10u ", per_cpu(pda.apic_timer_irqs, j));
seq_printf(p, " Local timer interrupts\n");
#ifdef CONFIG_SMP
seq_printf(p, "RES: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_resched_count, j));
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
+ seq_printf(p, "%10u ", per_cpu(pda.irq_call_count, j));
seq_printf(p, " function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
+ seq_printf(p, "%10u ", per_cpu(pda.irq_tlb_count, j));
seq_printf(p, " TLB shootdowns\n");
#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_thermal_count, j));
seq_printf(p, " Thermal event interrupts\n");
seq_printf(p, "THR: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_threshold_count, j));
seq_printf(p, " Threshold APIC interrupts\n");
#endif
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
+ seq_printf(p, "%10u ",
+ per_cpu(pda.irq_spurious_count, j));
seq_printf(p, " Spurious interrupts\n");
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
}
@@ -159,19 +163,19 @@ skip:
*/
u64 arch_irq_stat_cpu(unsigned int cpu)
{
- u64 sum = cpu_pda(cpu)->__nmi_count;
+ u64 sum = per_cpu(pda.__nmi_count, cpu);

- sum += cpu_pda(cpu)->apic_timer_irqs;
+ sum += per_cpu(pda.apic_timer_irqs, cpu);
#ifdef CONFIG_SMP
- sum += cpu_pda(cpu)->irq_resched_count;
- sum += cpu_pda(cpu)->irq_call_count;
- sum += cpu_pda(cpu)->irq_tlb_count;
+ sum += per_cpu(pda.irq_resched_count, cpu);
+ sum += per_cpu(pda.irq_call_count, cpu);
+ sum += per_cpu(pda.irq_tlb_count, cpu);
#endif
#ifdef CONFIG_X86_MCE
- sum += cpu_pda(cpu)->irq_thermal_count;
- sum += cpu_pda(cpu)->irq_threshold_count;
+ sum += per_cpu(pda.irq_thermal_count, cpu);
+ sum += per_cpu(pda.irq_threshold_count, cpu);
#endif
- sum += cpu_pda(cpu)->irq_spurious_count;
+ sum += per_cpu(pda.irq_spurious_count, cpu);
return sum;
}

--- linux-2.6.tip.orig/arch/x86/kernel/nmi.c
+++ linux-2.6.tip/arch/x86/kernel/nmi.c
@@ -61,7 +61,7 @@ static int endflag __initdata;
static inline unsigned int get_nmi_count(int cpu)
{
#ifdef CONFIG_X86_64
- return cpu_pda(cpu)->__nmi_count;
+ return per_cpu(pda.__nmi_count, cpu);
#else
return nmi_count(cpu);
#endif
--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -279,7 +279,7 @@ void __cpuinit numa_set_node(int cpu, in
per_cpu(x86_cpu_to_node_map, cpu) = node;

if (node != NUMA_NO_NODE)
- cpu_pda(cpu)->nodenumber = node;
+ per_cpu(pda.nodenumber, cpu) = node;
}

void __cpuinit numa_clear_node(int cpu)
--- linux-2.6.tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6.tip/arch/x86/kernel/smpboot.c
@@ -814,7 +814,7 @@ do_rest:
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
#else
- cpu_pda(cpu)->pcurrent = c_idle.idle;
+ per_cpu(pda.pcurrent, cpu) = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_pda = (unsigned long)get_cpu_pda(cpu);
#endif
--- linux-2.6.tip.orig/arch/x86/kernel/traps_64.c
+++ linux-2.6.tip/arch/x86/kernel/traps_64.c
@@ -265,7 +265,8 @@ void dump_trace(struct task_struct *tsk,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end =
+ (unsigned long *)per_cpu(pda.irqstackptr, cpu);
unsigned used = 0;
struct thread_info *tinfo;

@@ -399,8 +400,10 @@ _show_stack(struct task_struct *tsk, str
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end =
+ (unsigned long *)per_cpu(pda.irqstackptr, cpu);
+	unsigned long *irqstack =
+		(unsigned long *)(per_cpu(pda.irqstackptr, cpu) - IRQSTACKSIZE);

// debugging aid: "show_stack(NULL, NULL);" prints the
// back trace for this cpu.
@@ -464,7 +467,7 @@ void show_registers(struct pt_regs *regs
int i;
unsigned long sp;
const int cpu = smp_processor_id();
- struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+ struct task_struct *cur = __get_cpu_var(pda.pcurrent);
u8 *ip;
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;

--
Mike Travis
2008-07-09 16:51:38 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/percpu.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/percpu.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/percpu.h 2008-07-01 10:49:14.432325788 -0700
@@ -7,7 +7,7 @@

/* Same as asm-generic/percpu.h, except we use %gs as a segment offset. */
#ifdef CONFIG_SMP
-#define __my_cpu_offset read_pda(data_offset)
+#define __my_cpu_offset (x86_read_percpu(pda.data_offset))
#define __percpu_seg "%%gs:"
#else
#define __percpu_seg ""

--
Mike Travis
2008-07-09 16:51:40 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/stackprotector.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/stackprotector.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/stackprotector.h 2008-07-01 10:49:14.920355480 -0700
@@ -32,7 +32,7 @@ static __always_inline void boot_init_st
canary += tsc + (tsc << 32UL);

current->stack_canary = canary;
- write_pda(stack_canary, canary);
+ x86_write_percpu(pda.stack_canary, canary);
}

#endif

--
Mike Travis
2008-07-09 16:51:44 UTC
Permalink
* As there are no more references to cpu_pda() then remove it.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/pda.h | 2 --
1 file changed, 2 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/pda.h
+++ linux-2.6.tip/include/asm-x86/pda.h
@@ -39,8 +39,6 @@ struct x8664_pda {

extern void pda_init(int);

-#define cpu_pda(cpu) (&per_cpu(pda, cpu))
-
/*
* proxy_pda doesn't actually exist, but tell gcc it is accessed for
* all PDA accesses so it gets read/write dependencies right.

--
Mike Travis
2008-07-09 16:51:43 UTC
Permalink
* As there are no more references to xxx_pda() ops then remove them.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/pda.h | 77 ++++----------------------------------------------
1 file changed, 7 insertions(+), 70 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/pda.h
+++ linux-2.6.tip/include/asm-x86/pda.h
@@ -21,7 +21,7 @@ struct x8664_pda {
offset 40!!! */
char *irqstackptr;
short nodenumber; /* number of current node (32k max) */
- short in_bootmem; /* pda lives in bootmem */
+ short unused1; /* unused */
unsigned int __softirq_pending;
unsigned int __nmi_count; /* number of NMI on this CPUs */
short mmu_state;
@@ -42,12 +42,6 @@ extern void pda_init(int);
#define cpu_pda(cpu) (&per_cpu(pda, cpu))

/*
- * There is no fast way to get the base address of the PDA, all the accesses
- * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
- */
-extern void __bad_pda_field(void) __attribute__((noreturn));
-
-/*
* proxy_pda doesn't actually exist, but tell gcc it is accessed for
* all PDA accesses so it gets read/write dependencies right.
*/
@@ -55,69 +49,11 @@ extern struct x8664_pda _proxy_pda;

#define pda_offset(field) offsetof(struct x8664_pda, field)

-#define pda_to_op(op, field, val) \
-do { \
- typedef typeof(_proxy_pda.field) T__; \
- if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- case 4: \
- asm(op "l %1,%%gs:%c2" : \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i" (pda_offset(field))); \
- break; \
- case 8: \
- asm(op "q %1,%%gs:%c2": \
- "+m" (_proxy_pda.field) : \
- "ri" ((T__)val), \
- "i"(pda_offset(field))); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
-} while (0)
-
-#define pda_from_op(op, field) \
-({ \
- typeof(_proxy_pda.field) ret__; \
- switch (sizeof(_proxy_pda.field)) { \
- case 2: \
- asm(op "w %%gs:%c1,%0" : \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 4: \
- asm(op "l %%gs:%c1,%0": \
- "=r" (ret__): \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- case 8: \
- asm(op "q %%gs:%c1,%0": \
- "=r" (ret__) : \
- "i" (pda_offset(field)), \
- "m" (_proxy_pda.field)); \
- break; \
- default: \
- __bad_pda_field(); \
- } \
- ret__; \
-})
-
-#define read_pda(field) pda_from_op("mov", field)
-#define write_pda(field, val) pda_to_op("mov", field, val)
-#define add_pda(field, val) pda_to_op("add", field, val)
-#define sub_pda(field, val) pda_to_op("sub", field, val)
-#define or_pda(field, val) pda_to_op("or", field, val)
-
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
+/*
+ * This is not atomic against other CPUs -- CPU preemption needs to be off
+ * NOTE: This relies on the fact that the cpu_pda is the *first* field in
+ * the per cpu area. Move it and you'll need to change this.
+ */
#define test_and_clear_bit_pda(bit, field) \
({ \
int old__; \
@@ -127,6 +63,7 @@ do { \
old__; \
})

+
#endif

#define PDA_STACKOFFSET (5*8)

--
Mike Travis
2008-07-09 16:51:42 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/topology.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/topology.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/topology.h 2008-07-01 10:49:15.420385902 -0700
@@ -77,7 +77,7 @@ extern cpumask_t *node_to_cpumask_map;
DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);

/* Returns the number of the current Node. */
-#define numa_node_id() read_pda(nodenumber)
+#define numa_node_id() x86_read_percpu(pda.nodenumber)

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
extern int cpu_to_node(int cpu);

--
Mike Travis
2008-07-09 16:51:39 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/smp.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/smp.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/smp.h 2008-07-01 10:49:14.676340634 -0700
@@ -134,7 +134,7 @@ DECLARE_PER_CPU(int, cpu_number);
extern int safe_smp_processor_id(void);

#elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id() read_pda(cpunumber)
+#define raw_smp_processor_id() x86_read_percpu(pda.cpunumber)

#define stack_smp_processor_id() \
({ \

--
Mike Travis
2008-07-09 16:51:41 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/thread_info.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/thread_info.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/thread_info.h 2008-07-01 10:49:15.172370813 -0700
@@ -200,7 +200,8 @@ static inline struct thread_info *curren
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
- ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
+ ti = (void *)(x86_read_percpu(pda.kernelstack) +
+ PDA_STACKOFFSET - THREAD_SIZE);
return ti;
}


--
Mike Travis
2008-07-09 16:51:37 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/mmu_context_64.h | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/mmu_context_64.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/mmu_context_64.h 2008-07-01 10:49:14.220312889 -0700
@@ -20,8 +20,8 @@ void destroy_context(struct mm_struct *m
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
- if (read_pda(mmu_state) == TLBSTATE_OK)
- write_pda(mmu_state, TLBSTATE_LAZY);
+ if (x86_read_percpu(pda.mmu_state) == TLBSTATE_OK)
+ x86_write_percpu(pda.mmu_state, TLBSTATE_LAZY);
#endif
}

@@ -33,8 +33,8 @@ static inline void switch_mm(struct mm_s
/* stop flush ipis for the previous mm */
cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- write_pda(mmu_state, TLBSTATE_OK);
- write_pda(active_mm, next);
+ x86_write_percpu(pda.mmu_state, TLBSTATE_OK);
+ x86_write_percpu(pda.active_mm, next);
#endif
cpu_set(cpu, next->cpu_vm_mask);
load_cr3(next->pgd);
@@ -44,8 +44,8 @@ static inline void switch_mm(struct mm_s
}
#ifdef CONFIG_SMP
else {
- write_pda(mmu_state, TLBSTATE_OK);
- if (read_pda(active_mm) != next)
+ x86_write_percpu(pda.mmu_state, TLBSTATE_OK);
+ if (x86_read_percpu(pda.active_mm) != next)
BUG();
if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled

--
Mike Travis
2008-07-09 16:51:34 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

* Remove unused field (in_bootmem) from the pda.

* One pda op (test_and_clear_bit_pda) cannot be easily removed,
but since the pda is the first element in the per cpu area,
it can be left in place as is.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
arch/x86/kernel/apic_64.c | 4 ++--
arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2 +-
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2 +-
arch/x86/kernel/nmi.c | 3 ++-
arch/x86/kernel/process_64.c | 12 ++++++------
arch/x86/kernel/smp.c | 4 ++--
arch/x86/kernel/time_64.c | 2 +-
arch/x86/kernel/tlb_64.c | 12 ++++++------
arch/x86/kernel/traps_64.c | 2 +-
arch/x86/kernel/x8664_ksyms_64.c | 2 --
arch/x86/xen/smp.c | 2 +-
11 files changed, 23 insertions(+), 24 deletions(-)

--- linux-2.6.tip.orig/arch/x86/kernel/apic_64.c
+++ linux-2.6.tip/arch/x86/kernel/apic_64.c
@@ -457,7 +457,7 @@ static void local_apic_timer_interrupt(v
/*
* the NMI deadlock-detector uses this.
*/
- add_pda(apic_timer_irqs, 1);
+ x86_inc_percpu(pda.apic_timer_irqs);

evt->event_handler(evt);
}
@@ -965,7 +965,7 @@ asmlinkage void smp_spurious_interrupt(v
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();

- add_pda(irq_spurious_count, 1);
+ x86_inc_percpu(pda.irq_spurious_count);
irq_exit();
}

--- linux-2.6.tip.orig/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ linux-2.6.tip/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -237,7 +237,7 @@ asmlinkage void mce_threshold_interrupt(
}
}
out:
- add_pda(irq_threshold_count, 1);
+ x86_inc_percpu(pda.irq_threshold_count);
irq_exit();
}

--- linux-2.6.tip.orig/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ linux-2.6.tip/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(vo
if (therm_throt_process(msr_val & 1))
mce_log_therm_throt_event(smp_processor_id(), msr_val);

- add_pda(irq_thermal_count, 1);
+ x86_inc_percpu(pda.irq_thermal_count);
irq_exit();
}

--- linux-2.6.tip.orig/arch/x86/kernel/nmi.c
+++ linux-2.6.tip/arch/x86/kernel/nmi.c
@@ -82,7 +82,8 @@ static inline int mce_in_progress(void)
static inline unsigned int get_timer_irqs(int cpu)
{
#ifdef CONFIG_X86_64
- return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
+ return x86_read_percpu(pda.apic_timer_irqs) +
+ x86_read_percpu(pda.irq0_irqs);
#else
return per_cpu(irq_stat, cpu).apic_timer_irqs +
per_cpu(irq_stat, cpu).irq0_irqs;
--- linux-2.6.tip.orig/arch/x86/kernel/process_64.c
+++ linux-2.6.tip/arch/x86/kernel/process_64.c
@@ -66,7 +66,7 @@ void idle_notifier_register(struct notif

void enter_idle(void)
{
- write_pda(isidle, 1);
+ x86_write_percpu(pda.isidle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

@@ -410,7 +410,7 @@ start_thread(struct pt_regs *regs, unsig
load_gs_index(0);
regs->ip = new_ip;
regs->sp = new_sp;
- write_pda(oldrsp, new_sp);
+ x86_write_percpu(pda.oldrsp, new_sp);
regs->cs = __USER_CS;
regs->ss = __USER_DS;
regs->flags = 0x200;
@@ -646,11 +646,11 @@ __switch_to(struct task_struct *prev_p,
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = read_pda(oldrsp);
- write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ prev->usersp = x86_read_percpu(pda.oldrsp);
+ x86_write_percpu(pda.oldrsp, next->usersp);
+ x86_write_percpu(pda.pcurrent, next_p);

- write_pda(kernelstack,
+ x86_write_percpu(pda.kernelstack,
(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
/*
--- linux-2.6.tip.orig/arch/x86/kernel/smp.c
+++ linux-2.6.tip/arch/x86/kernel/smp.c
@@ -295,7 +295,7 @@ void smp_reschedule_interrupt(struct pt_
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_resched_count++;
#else
- add_pda(irq_resched_count, 1);
+ x86_inc_percpu(pda.irq_resched_count);
#endif
}

@@ -320,7 +320,7 @@ void smp_call_function_interrupt(struct
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
#else
- add_pda(irq_call_count, 1);
+ x86_inc_percpu(pda.irq_call_count);
#endif
irq_exit();

--- linux-2.6.tip.orig/arch/x86/kernel/time_64.c
+++ linux-2.6.tip/arch/x86/kernel/time_64.c
@@ -46,7 +46,7 @@ EXPORT_SYMBOL(profile_pc);

static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
{
- add_pda(irq0_irqs, 1);
+ x86_inc_percpu(pda.irq0_irqs);

global_clock_event->event_handler(global_clock_event);

--- linux-2.6.tip.orig/arch/x86/kernel/tlb_64.c
+++ linux-2.6.tip/arch/x86/kernel/tlb_64.c
@@ -62,9 +62,9 @@ static DEFINE_PER_CPU(union smp_flush_st
*/
void leave_mm(int cpu)
{
- if (read_pda(mmu_state) == TLBSTATE_OK)
+ if (x86_read_percpu(pda.mmu_state) == TLBSTATE_OK)
BUG();
- cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
+ cpu_clear(cpu, x86_read_percpu(pda.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -142,8 +142,8 @@ asmlinkage void smp_invalidate_interrupt
* BUG();
*/

- if (f->flush_mm == read_pda(active_mm)) {
- if (read_pda(mmu_state) == TLBSTATE_OK) {
+ if (f->flush_mm == x86_read_percpu(pda.active_mm)) {
+ if (x86_read_percpu(pda.mmu_state) == TLBSTATE_OK) {
if (f->flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt
out:
ack_APIC_irq();
cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
+ x86_inc_percpu(pda.irq_tlb_count);
}

void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -269,7 +269,7 @@ static void do_flush_tlb_all(void *info)
unsigned long cpu = smp_processor_id();

__flush_tlb_all();
- if (read_pda(mmu_state) == TLBSTATE_LAZY)
+ if (x86_read_percpu(pda.mmu_state) == TLBSTATE_LAZY)
leave_mm(cpu);
}

--- linux-2.6.tip.orig/arch/x86/kernel/traps_64.c
+++ linux-2.6.tip/arch/x86/kernel/traps_64.c
@@ -878,7 +878,7 @@ asmlinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_enter();
- add_pda(__nmi_count, 1);
+ x86_inc_percpu(pda.__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
nmi_exit();
--- linux-2.6.tip.orig/arch/x86/kernel/x8664_ksyms_64.c
+++ linux-2.6.tip/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);
-
-EXPORT_SYMBOL(_proxy_pda);
--- linux-2.6.tip.orig/arch/x86/xen/smp.c
+++ linux-2.6.tip/arch/x86/xen/smp.c
@@ -68,7 +68,7 @@ static irqreturn_t xen_reschedule_interr
#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_resched_count++;
#else
- add_pda(irq_resched_count, 1);
+ x86_inc_percpu(pda.irq_resched_count);
#endif

return IRQ_HANDLED;

--
Mike Travis
2008-07-09 16:51:35 UTC
Permalink
* It is now possible to use percpu operations for pda access
since the pda is in the percpu area. Drop the pda operations.

Thus:

read_pda --> x86_read_percpu
write_pda --> x86_write_percpu
add_pda (+1) --> x86_inc_percpu
or_pda --> x86_or_percpu

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <***@linux-foundation.org>
Signed-off-by: Mike Travis <***@sgi.com>
---
include/asm-x86/current.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- linux-2.6.tip.orig/include/asm-x86/current.h 2008-07-01 10:41:33.000000000 -0700
+++ linux-2.6.tip/include/asm-x86/current.h 2008-07-01 10:49:13.764285143 -0700
@@ -17,12 +17,13 @@ static __always_inline struct task_struc

#ifndef __ASSEMBLY__
#include <asm/pda.h>
+#include <asm/percpu.h>

struct task_struct;

static __always_inline struct task_struct *get_current(void)
{
- return read_pda(pcurrent);
+ return x86_read_percpu(pda.pcurrent);
}

#else /* __ASSEMBLY__ */

--
H. Peter Anvin
2008-07-09 17:19:08 UTC
Permalink
Hi Mike,

Did the suspected linker bug issue ever get resolved?

-hpa
Mike Travis
2008-07-09 17:40:02 UTC
Permalink
Post by H. Peter Anvin
Hi Mike,
Did the suspected linker bug issue ever get resolved?
-hpa
Hi Peter,

I was not able to figure out how the two versions of the same
kernel compiled by gcc-4.2.0 and gcc-4.2.4 differed. Currently,
I'm sticking with gcc-4.2.4 as it boots much farther.

There still is a problem where if I bump THREAD_ORDER, the
problem goes away and everything so far that I've tested boots
up fine.

We tried to install a later gcc (4.3.1) that might have the
"GCC_HAS_SP" flag but our sys admin reported:

The 4.3.1 version gives me errors on the make. I had to
pre-install gmp and mpfr, but, I still get errors on the make.

I think that was the latest he found on the GNU/GCC site.

Thanks,
Mike
H. Peter Anvin
2008-07-09 17:42:40 UTC
Permalink
Post by Mike Travis
Post by H. Peter Anvin
Hi Mike,
Did the suspected linker bug issue ever get resolved?
-hpa
Hi Peter,
I was not able to figure out how the two versions of the same
kernel compiled by gcc-4.2.0 and gcc-4.2.4 differed. Currently,
I'm sticking with gcc-4.2.4 as it boots much farther.
There still is a problem where if I bump THREAD_ORDER, the
problem goes away and everything so far that I've tested boots
up fine.
We tried to install a later gcc (4.3.1) that might have the
The 4.3.1 version gives me errors on the make. I had to
pre-install gmp and mpfr, but, I still get errors on the make.
I think that was the latest he found on the GNU/GCC site.
We have seen miscompilations with gcc 4.3.0 at least; not sure about 4.3.1.

-hpa
Mike Travis
2008-07-09 18:05:10 UTC
Permalink
Post by H. Peter Anvin
Post by Mike Travis
Post by H. Peter Anvin
Hi Mike,
Did the suspected linker bug issue ever get resolved?
-hpa
Hi Peter,
I was not able to figure out how the two versions of the same
kernel compiled by gcc-4.2.0 and gcc-4.2.4 differed. Currently,
I'm sticking with gcc-4.2.4 as it boots much farther.
There still is a problem where if I bump THREAD_ORDER, the
problem goes away and everything so far that I've tested boots
up fine.
We tried to install a later gcc (4.3.1) that might have the
The 4.3.1 version gives me errors on the make. I had to
pre-install gmp and mpfr, but, I still get errors on the make.
I think that was the latest he found on the GNU/GCC site.
We have seen miscompilations with gcc 4.3.0 at least; not sure about 4.3.1.
-hpa
Hmm, I wonder how the CONFIG_CC_STACKPROTECTOR was tested? Could it
be a config option for building GCC itself?

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 17:44:41 UTC
Permalink
Post by H. Peter Anvin
Did the suspected linker bug issue ever get resolved?
I don't believe so. I think Mike is getting very early crashes
depending on some combination of gcc, linker and kernel config. Or
something.

This fragility makes me very nervous. It seems hard enough to get this
stuff working with current tools; making it work over the whole range of
supported tools looks like its going to be hard.

J
Mike Travis
2008-07-09 18:09:35 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by H. Peter Anvin
Did the suspected linker bug issue ever get resolved?
I don't believe so. I think Mike is getting very early crashes
depending on some combination of gcc, linker and kernel config. Or
something.
Yes and unfortunately since SGI does not do its own compilers any
more (they were MIPS compilers), there's no one here that really
understands the internals of the compile tools.
Post by Jeremy Fitzhardinge
This fragility makes me very nervous. It seems hard enough to get this
stuff working with current tools; making it work over the whole range of
supported tools looks like its going to be hard.
(me too ;-)

Once I get a solid version working with (at least) gcc-4.2.4, then
regression testing with older tools will be easier, or at least a
table of results can be produced.

Thanks,
Mike
H. Peter Anvin
2008-07-09 18:30:24 UTC
Permalink
Post by Mike Travis
Post by Jeremy Fitzhardinge
Post by H. Peter Anvin
Did the suspected linker bug issue ever get resolved?
I don't believe so. I think Mike is getting very early crashes
depending on some combination of gcc, linker and kernel config. Or
something.
Yes and unfortunately since SGI does not do its own compilers any
more (they were MIPS compilers), there's no one here that really
understands the internals of the compile tools.
A bummer, too, since that compiler lives on as the Pathscale compiler...

-hpa
Ingo Molnar
2008-07-09 19:34:04 UTC
Permalink
Post by Mike Travis
Post by Jeremy Fitzhardinge
This fragility makes me very nervous. It seems hard enough to get
this stuff working with current tools; making it work over the whole
range of supported tools looks like its going to be hard.
(me too ;-)
Once I get a solid version working with (at least) gcc-4.2.4, then
regression testing with older tools will be easier, or at least a
table of results can be produced.
the problem is, we cannot just put it even into tip/master if there's no
short-term hope of fixing a problem it triggers. gcc-4.2.3 is solid for
me otherwise, for series of thousands of randomly built kernels.

can we just leave out the zero-based percpu stuff safely and could i
test the rest of your series - or are there dependencies? I think
zero-based percpu, while nice in theory, is probably just a very small
positive effect so it's not a life or death issue. (or is there any
deeper, semantic reason why we'd want it?)

Ingo
H. Peter Anvin
2008-07-09 19:44:51 UTC
Permalink
Post by Ingo Molnar
the problem is, we cannot just put it even into tip/master if there's no
short-term hope of fixing a problem it triggers. gcc-4.2.3 is solid for
me otherwise, for series of thousands of randomly built kernels.
4.2.3 is fine; he was using 4.2.0 before, and as far as I know, 4.2.0
and 4.2.1 are known broken for the kernel.

-hpa
Adrian Bunk
2008-07-09 20:26:33 UTC
Permalink
Post by H. Peter Anvin
Post by Ingo Molnar
the problem is, we cannot just put it even into tip/master if there's
no short-term hope of fixing a problem it triggers. gcc-4.2.3 is solid
for me otherwise, for series of thousands of randomly built kernels.
4.2.3 is fine; he was using 4.2.0 before, and as far as I know, 4.2.0
and 4.2.1 are known broken for the kernel.
Not sure where your knowledge comes from, but the ones I try to get
blacklisted due to known gcc bugs are 4.1.0 and 4.1.1.

On a larger picture, we officially support gcc >= 3.2, and if any kernel
change triggers a bug with e.g. gcc 3.2.3 that's technically a
regression in the kernel...
Post by H. Peter Anvin
-hpa
cu
Adrian
--
"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed
Mike Travis
2008-07-09 21:03:45 UTC
Permalink
Post by Ingo Molnar
Post by Mike Travis
Post by Jeremy Fitzhardinge
This fragility makes me very nervous. It seems hard enough to get
this stuff working with current tools; making it work over the whole
range of supported tools looks like its going to be hard.
(me too ;-)
Once I get a solid version working with (at least) gcc-4.2.4, then
regression testing with older tools will be easier, or at least a
table of results can be produced.
the problem is, we cannot just put it even into tip/master if there's no
short-term hope of fixing a problem it triggers. gcc-4.2.3 is solid for
me otherwise, for series of thousands of randomly built kernels.
Great, I'll request we load gcc-4.2.3 on our devel server.
Post by Ingo Molnar
can we just leave out the zero-based percpu stuff safely and could i
test the rest of your series - or are there dependencies? I think
zero-based percpu, while nice in theory, is probably just a very small
positive effect so it's not a life or death issue. (or is there any
deeper, semantic reason why we'd want it?)
I sort of assumed that zero-based would not make it into 2.6.26-rcX,
and no, reaching 4096 cpus does not require it. The other patches
I've been submitting are more general and will fix possible panics
(like stack overflows, etc.)

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 21:23:58 UTC
Permalink
Post by Ingo Molnar
Post by Mike Travis
Post by Jeremy Fitzhardinge
This fragility makes me very nervous. It seems hard enough to get
this stuff working with current tools; making it work over the whole
range of supported tools looks like its going to be hard.
(me too ;-)
Once I get a solid version working with (at least) gcc-4.2.4, then
regression testing with older tools will be easier, or at least a
table of results can be produced.
the problem is, we cannot just put it even into tip/master if there's no
short-term hope of fixing a problem it triggers. gcc-4.2.3 is solid for
me otherwise, for series of thousands of randomly built kernels.
can we just leave out the zero-based percpu stuff safely and could i
test the rest of your series - or are there dependencies? I think
zero-based percpu, while nice in theory, is probably just a very small
positive effect so it's not a life or death issue. (or is there any
deeper, semantic reason why we'd want it?)
I'm looking forward to using it, because I can make the Xen vcpu
structure a percpu variable shared with the hypervisor. This means
something like a disable interrupt becomes a simple "movb
$1,%gs:per_cpu__xen_vcpu_event_mask". If access to percpu variables is
indirect (ie, two instructions) I need to disable preemption which makes
the whole thing much more complex, and too big to inline. There are
other cases where preemption-safe access to percpu variables is useful
as well.

My view, which is admittedly very one-sided, is that all this brokenness
is forced on us by gcc's stack-protector brokenness. My preferred
approach would be to fix -fstack-protector by eliminating the
requirement for small offsets from %gs. With that in place we could
support it without needing a pda. In the meantime, we could either
support stack-protector or direct access to percpu variables. Either
way, we don't need to worry about making zero-based percpu work.

J
Jeremy Fitzhardinge
2008-07-09 17:27:14 UTC
Permalink
Post by Mike Travis
* Cleanup: Fix early references to cpumask_of_cpu(0)
Provides an early cpumask_of_cpu(0) usable before the cpumask_of_cpu_map
is allocated and initialized.
* Generic: Percpu infrastructure to rebase the per cpu area to zero
This provides for the capability of accessing the percpu variables
using a local register instead of having to go through a table
on node 0 to find the cpu-specific offsets. It also would allow
atomic operations on percpu variables to reduce required locking.
Uses a new config var HAVE_ZERO_BASED_PER_CPU to indicate to the
generic code that the arch has this new basing.
(Note: split into two patches, one to rebase percpu variables at 0,
and the second to actually use %gs as the base for percpu variables.)
* x86_64: Fold pda into per cpu area
Declare the pda as a per cpu variable. This will move the pda
area to an address accessible by the x86_64 per cpu macros.
Subtraction of __per_cpu_start will make the offset based from
the beginning of the per cpu area. Since %gs is pointing to the
pda, it will then also point to the per cpu variables and can be
%gs:[&per_cpu_xxxx - __per_cpu_start]
* x86_64: Rebase per cpu variables to zero
Take advantage of the zero-based per cpu area provided above.
Then we can directly use the x86_32 percpu operations. x86_32
offsets %fs by __per_cpu_start. x86_64 has %gs pointing directly
to the pda and the per cpu area thereby allowing access to the
pda with the x86_64 pda operations and access to the per cpu
variables using x86_32 percpu operations.
The bulk of this series is pda_X to x86_X_percpu conversion. This seems
like pointless churn to me; there's nothing inherently wrong with the
pda_X interfaces, and doing this transformation doesn't get us any
closer to unifying 32 and 64 bit.

I think we should start devolving things out of the pda in the other
direction: make a series where each patch takes a member of struct
x8664_pda, converts it to a per-cpu variable (where possible, the same
one that 32-bit uses), and updates all the references accordingly. When
the pda is as empty as it can be, we can look at removing the
pda-specific interfaces.

J
Christoph Lameter
2008-07-09 17:39:37 UTC
Permalink
Post by Jeremy Fitzhardinge
The bulk of this series is pda_X to x86_X_percpu conversion. This seems
like pointless churn to me; there's nothing inherently wrong with the
pda_X interfaces, and doing this transformation doesn't get us any
closer to unifying 32 and 64 bit.
What is the point of the pda_X interface? It does not exist on 32 bit.
The pda wastes the GS segment register on a small memory area. This patchset
makes the GS segment usable to reach all of the per cpu area by placing
the pda into the per cpu area. Thus the pda_X interface becomes obsolete
and the 32 bit per cpu stuff becomes usable under 64 bit unifying both
architectures.
Post by Jeremy Fitzhardinge
I think we should start devolving things out of the pda in the other
direction: make a series where each patch takes a member of struct
x8664_pda, converts it to a per-cpu variable (where possible, the same
one that 32-bit uses), and updates all the references accordingly. When
the pda is as empty as it can be, we can look at removing the
pda-specific interfaces.
This patchset places the whole x8664_pda structure into the per cpu area and makes the pda macros operate on the x8664_pda structure in the per cpu area. Not sure why you want to go through the churn of doing it for each object separately.
Jeremy Fitzhardinge
2008-07-09 17:51:46 UTC
Permalink
Post by Christoph Lameter
What is the point of the pda_X interface? It does not exist on 32 bit.
The pda wastes the GS segment register on a small memory area. This patchset
makes the GS segment usable to reach all of the per cpu area by placing
the pda into the per cpu area. Thus the pda_X interface becomes obsolete
and the 32 bit per cpu stuff becomes usable under 64 bit unifying both
architectures.
I think we agree on the desired outcome. I just disagree with the path
to getting there.
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
I think we should start devolving things out of the pda in the other
direction: make a series where each patch takes a member of struct
x8664_pda, converts it to a per-cpu variable (where possible, the same
one that 32-bit uses), and updates all the references accordingly. When
the pda is as empty as it can be, we can look at removing the
pda-specific interfaces.
This patchset places the whole x8664_pda structure into the per cpu area and makes the pda macros operate on the x8664_pda structure in the per cpu area. Not sure why you want to go through the churn of doing it for each object separately.
No, it's not churn doing it object at a time. If you convert
pda.pcurrent into a percpu current_task variable, then at one stroke
you've 1) shrunk the pda, 2) unified with i386. If you go through the
process of converting all the read_pda(pcurrent) references into
x86_read_percpu(pda.pcurrent) then that's a pure churn patch. It
doesn't get rid of the pda variable, it doesn't unify with i386. All it
does is remove a reference to a macro which was fairly inoffensive in
the first place.

Once the pda has shrunk as much as it can (which remove everything
except stack_canary, I think), then remove all the X_pda macros, since
there won't be any users anyway.

J
Mike Travis
2008-07-09 18:14:34 UTC
Permalink
Jeremy Fitzhardinge wrote:
...
Post by Jeremy Fitzhardinge
Once the pda has shrunk as much as it can (which remove everything
except stack_canary, I think), then remove all the X_pda macros, since
there won't be any users anyway.
J
You bring up a good point here. Since the stack_canary has to be 20
(or is that 0x20?) bytes from %gs, then it sounds like we'll still
need a pda struct of some sort. And zero padding before that seems
counter-productive, so perhaps taking a poll of the most used pda
(or percpu) variables and keeping them in the same cache line would
be more useful?

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 18:22:58 UTC
Permalink
Post by Mike Travis
...
Post by Jeremy Fitzhardinge
Once the pda has shrunk as much as it can (which remove everything
except stack_canary, I think), then remove all the X_pda macros, since
there won't be any users anyway.
J
You bring up a good point here. Since the stack_canary has to be 20
(or is that 0x20?) bytes from %gs, then it sounds like we'll still
need a pda struct of some sort. And zero padding before that seems
counter-productive, so perhaps taking a poll of the most used pda
(or percpu) variables and keeping them in the same cache line would
be more useful?
The offset is 40 (decimal). I don't see any particular problem with
putting zero padding in there; we can get rid of it if
CONFIG_STACK_PROTECTOR is off.

The trouble with reusing the space is that it's going to be things like
"current" which are the best candidates for going there - but if you do
that you lose i386 unification (unless you play some tricks where you
arrange to make the percpu variables land there while still appearing to
be normal percpu vars).

J
Mike Travis
2008-07-09 18:31:56 UTC
Permalink
Jeremy Fitzhardinge wrote:
...
Post by Jeremy Fitzhardinge
The trouble with reusing the space is that it's going to be things like
"current" which are the best candidates for going there - but if you do
that you lose i386 unification (unless you play some tricks where you
arrange to make the percpu variables land there while still appearing to
be normal percpu vars).
J
One more approach... ;-) Once the pda and percpu vars are in the same
area, then could the pda be used for both 32 and 64...?
Jeremy Fitzhardinge
2008-07-09 19:08:04 UTC
Permalink
Post by Mike Travis
One more approach... ;-) Once the pda and percpu vars are in the same
area, then could the pda be used for both 32 and 64...?
The i386 code works quite reliably thanks ;)

J
Mike Travis
2008-07-09 18:02:58 UTC
Permalink
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
The bulk of this series is pda_X to x86_X_percpu conversion. This seems
like pointless churn to me; there's nothing inherently wrong with the
pda_X interfaces, and doing this transformation doesn't get us any
closer to unifying 32 and 64 bit.
What is the point of the pda_X interface? It does not exist on 32 bit.
The pda wastes the GS segment register on a small memory area. This patchset
makes the GS segment usable to reach all of the per cpu area by placing
the pda into the per cpu area. Thus the pda_X interface becomes obsolete
and the 32 bit per cpu stuff becomes usable under 64 bit unifying both
architectures.
Post by Jeremy Fitzhardinge
I think we should start devolving things out of the pda in the other
direction: make a series where each patch takes a member of struct
x8664_pda, converts it to a per-cpu variable (where possible, the same
one that 32-bit uses), and updates all the references accordingly. When
the pda is as empty as it can be, we can look at removing the
pda-specific interfaces.
This patchset places the whole x8664_pda structure into the per cpu area and makes the pda macros operate on the x8664_pda structure in the per cpu area. Not sure why you want to go through the churn of doing it for each object separately.
I think Jeremy's point is that by removing the pda struct entirely, the
references to the fields can be the same for both x86_32 and x86_64.

Thanks,
Mike
Christoph Lameter
2008-07-09 18:13:24 UTC
Permalink
Post by Mike Travis
I think Jeremy's point is that by removing the pda struct entirely, the
references to the fields can be the same for both x86_32 and x86_64.
That is going to be difficult. The GS register is tied up for the pda area
as long as you have it. And you cannot get rid of the pda because of the library
compatibility issues. We would break binary compatibility if we would get rid of the pda.

If one attempts to remove one field after another then the converted accesses will not be able to use GS relative accesses anymore. This can lead to all sorts of complications.

It will be possible to shrink the pda (as long as we maintain the fields that glibc needs) after this patchset because the pda and the per cpu area can both be reached with the GS register. So (apart from undiscovered surprises) the generated code is the same.
Jeremy Fitzhardinge
2008-07-09 18:26:05 UTC
Permalink
Post by Christoph Lameter
That is going to be difficult. The GS register is tied up for the pda area
as long as you have it. And you cannot get rid of the pda because of the library
compatibility issues. We would break binary compatibility if we would get rid of the pda.
If one attempts to remove one field after another then the converted accesses will not be able to use GS relative accesses anymore. This can lead to all sorts of complications.
Eh? Yes they will. That's the whole point of making the pda a percpu
variable itself. You can use %gs:<small> to get to the pda, and
%gs:<larger> to get to percpu variables. Converting pda->percpu will
just have the effect of increasing the %gs offset into the percpu space.

This project isn't interesting to me unless per-cpu variables are
directly accessible off %gs.

J
Christoph Lameter
2008-07-09 18:34:16 UTC
Permalink
Post by Jeremy Fitzhardinge
Eh? Yes they will. That's the whole point of making the pda a percpu
variable itself. You can use %gs:<small> to get to the pda, and
%gs:<larger> to get to percpu variables. Converting pda->percpu will
just have the effect of increasing the %gs offset into the percpu space.
Right that is what this patchset does.
Post by Jeremy Fitzhardinge
This project isn't interesting to me unless per-cpu variables are
directly accessible off %gs.
Maybe I misunderstood but it seems that you proposed to convert individual members of the pda structure (which uses GS) to per cpu variables (which without this patchset cannot use GS).
H. Peter Anvin
2008-07-09 18:37:26 UTC
Permalink
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
Eh? Yes they will. That's the whole point of making the pda a percpu
variable itself. You can use %gs:<small> to get to the pda, and
%gs:<larger> to get to percpu variables. Converting pda->percpu will
just have the effect of increasing the %gs offset into the percpu space.
Right that is what this patchset does.
Post by Jeremy Fitzhardinge
This project isn't interesting to me unless per-cpu variables are
directly accessible off %gs.
Maybe I misunderstood but it seems that you proposed to convert individual members of the pda structure (which uses GS) to per cpu variables (which without this patchset cannot use GS).
I don't understand the "individual members" requirement here.

-hpa
Jeremy Fitzhardinge
2008-07-09 18:48:07 UTC
Permalink
Post by Christoph Lameter
Maybe I misunderstood but it seems that you proposed to convert individual members of the pda structure (which uses GS) to per cpu variables (which without this patchset cannot use GS).
I have no objections to parts 1-3 of the patchset. It's just 4-15,
which does the mechanical conversion.

J
Christoph Lameter
2008-07-09 18:53:57 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Christoph Lameter
Maybe I misunderstood but it seems that you proposed to convert
individual members of the pda structure (which uses GS) to per cpu
variables (which without this patchset cannot use GS).
I have no objections to parts 1-3 of the patchset. It's just 4-15,
which does the mechanical conversion.
Well yes I agree these could be better if the fields would be moved out of the pda
structure itself but then it won't be mechanical anymore and require more
review. But these are an important step because they allow us to get rid of the
pda operations that do not exist for 32 bit.
Jeremy Fitzhardinge
2008-07-09 19:07:24 UTC
Permalink
Post by Christoph Lameter
Well yes I agree these could be better if the fields would be moved out of the pda
structure itself but then it wont be mechanical anymore and require more
review.
Yes, but they'll have more value. And if you do it as one variable per
patch, then it should be easy to bisect should any problems arise.
Post by Christoph Lameter
But these are an important step because they allow us to get rid of the
pda operations that do not exist for 32 bit.
No, they don't help at all, because they convert X_pda(Y) (which doesn't
exist) into x86_X_percpu(pda.Y) (which also doesn't exist). They don't
remove any #ifdef CONFIG_X86_64's. If you're going to tromp all over
the source, you may as well do the most useful thing in the first step.

J
Christoph Lameter
2008-07-09 19:12:22 UTC
Permalink
Post by Jeremy Fitzhardinge
No, they don't help at all, because they convert X_pda(Y) (which doesn't
exist) into x86_X_percpu(pda.Y) (which also doesn't exist). They don't
remove any #ifdef CONFIG_X86_64's. If you're going to tromp all over
the source, you may as well do the most useful thing in the first step.
Well they help in the sense that the patches get rid of the special X_pda(Y) operations.
x86_X_percpu will then exist under 32 bit and 64 bit.

What is remaining is the task to rename

pda.Y -> Z

in order to make variable references the same under both arches. Presumably the Z is the corresponding 32 bit variable. There are likely a number of cases where the transformation
is trivial if we just identify the corresponding 32 bit equivalent.
Jeremy Fitzhardinge
2008-07-09 19:32:15 UTC
Permalink
Post by Christoph Lameter
Well they help in the sense that the patches get rid of the special X_pda(Y) operations.
x86_X_percpu will then exist under 32 bit and 64 bit.
What is remaining is the task to rename
pda.Y -> Z
in order to make variable references the same under both arches. Presumably the Z is the corresponding 32 bit variable. There are likely a number of cases where the transformation
is trivial if we just identify the corresponding 32 bit equivalent.
Yes, I understand that, but it's still pointless churn. The
intermediate step is no improvement over what was there before, and
isn't any closer to the desired final result.

Once you've made the pda a percpu variable, and redefined all the X_pda
macros in terms of x86_X_percpu, then there's no need to touch all the
usage sites until you're *actually* going to unify something. Touching
them all just because you find "X_pda" unsightly doesn't help anyone.
Ideally every site you touch will remove a #ifdef CONFIG_X86_64, or make
two as-yet unified pieces of code closer to unification.

J
Ingo Molnar
2008-07-09 19:41:36 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Christoph Lameter
What is remaining is the task to rename
pda.Y -> Z
in order to make variable references the same under both arches.
Presumably the Z is the corresponding 32 bit variable. There are
likely a number of cases where the transformation is trivial if we
just identify the corresponding 32 bit equivalent.
Yes, I understand that, but it's still pointless churn. The
intermediate step is no improvement over what was there before, and
isn't any closer to the desired final result.
Once you've made the pda a percpu variable, and redefined all the
X_pda macros in terms of x86_X_percpu, then there's no need to touch
all the usage sites until you're *actually* going to unify something.
Touching them all just because you find "X_pda" unsightly doesn't help
anyone. Ideally every site you touch will remove a #ifdef
CONFIG_X86_64, or make two as-yet unified pieces of code closer to
unification.
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an intermediate
renaming of pda members) being the way to go?

Ingo
H. Peter Anvin
2008-07-09 19:45:21 UTC
Permalink
Post by Ingo Molnar
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an intermediate
renaming of pda members) being the way to go?
Works for me.

-hpa
Christoph Lameter
2008-07-09 19:52:44 UTC
Permalink
Post by Ingo Molnar
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an intermediate
renaming of pda members) being the way to go?
I think we all agree on 1-2-3.

The rest is TBD. Hope Jeremy can add his wisdom there to get the pda.X replaced by the proper percpu names for 32 bit.

With Jeremy's approach we would be doing two steps at once (getting rid of pda ops plus unifying the variable names between 32 and 64 bit). Maybe more difficult to verify correctness. The removal of the pda ops is a pretty straightforward conversion.
Ingo Molnar
2008-07-09 20:00:57 UTC
Permalink
Post by Christoph Lameter
Post by Ingo Molnar
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an
intermediate renaming of pda members) being the way to go?
I think we all agree on 1-2-3.
The rest is TBD. Hope Jeremy can add his wisdom there to get the pda.X
replaced by the proper percpu names for 32 bit.
With Jeremy's approach we would be doing two steps at once (getting
rid of pda ops plus unifying the variable names between 32 and 64
bit). Maybe more difficult to verify correctness. The removal of the
pda ops is a pretty straighforward conversion.
Yes, but there's nothing magic about pda variables versus percpu
variables. We should be able to do the pda -> unified step just as much
as we can do a percpu -> unified step. We can think of pda as a funky,
pre-percpu-era relic.

The only thing that percpu really offers over pda is its familiarity.
read_pda() has the per-cpu-ness embedded in it, which is nasty with
regard to tracking preemption properties, etc.

So converting to percpu would bring us CONFIG_PREEMPT_DEBUG=y checking
to those ex-pda variables. Today if a read_pda() (or anything but
pcurrent) is done in a non-preempt region that's likely a bug - but
nothing warns about it.

So in that light 4-15 might make some sense in standardizing all these
accesses and making sure it all fits into an existing, familar API
world, with no register level assumptions and assembly (and ABI) ties,
which is instrumented as well, with explicit smp_processor_id()
dependencies, etc.

Ingo
Jeremy Fitzhardinge
2008-07-09 20:09:37 UTC
Permalink
Post by Ingo Molnar
Post by Christoph Lameter
Post by Ingo Molnar
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an
intermediate renaming of pda members) being the way to go?
I think we all agree on 1-2-3.
The rest is TBD. Hope Jeremy can add his wisdom there to get the pda.X
replaced by the proper percpu names for 32 bit.
With Jeremy's approach we would be doing two steps at once (getting
rid of pda ops plus unifying the variable names between 32 and 64
bit). Maybe more difficult to verify correctness. The removal of the
pda ops is a pretty straighforward conversion.
Yes, but there's nothing magic about pda variables versus percpu
variables. We should be able to do the pda -> unified step just as much
as we can do a percpu -> unified step. We can think of pda as a funky,
pre-percpu-era relic.
The only thing that percpu really offers over pda is its familarity.
read_pda() has the per-cpu-ness embedded in it, which is nasty with
regard to tracking preemption properties, etc.
So converting to percpu would bring us CONFIG_PREEMPT_DEBUG=y checking
to those ex-pda variables. Today if a read_pda() (or anything but
pcurrent) is done in a non-preempt region that's likely a bug - but
nothing warns about it.
So in that light 4-15 might make some sense in standardizing all these
accesses and making sure it all fits into an existing, familiar API
world, with no register level assumptions and assembly (and ABI) ties,
which is instrumented as well, with explicit smp_processor_id()
dependencies, etc.
Yeah, but doing

#define read_pda(x) x86_read_percpu(x)

gives you all that anyway. Though because x86_X_percpu and X_pda are
guaranteed to be atomic with respect to preemption, it's actually
reasonable to use them with preemption enabled.

J
Mike Travis
2008-07-09 21:05:32 UTC
Permalink
Post by Ingo Molnar
Post by Jeremy Fitzhardinge
Post by Christoph Lameter
What is remaining is the task to rename
pda.Y -> Z
in order to make variable references the same under both arches.
Presumably the Z is the corresponding 32 bit variable. There are
likely a number of cases where the transformation is trivial if we
just identify the corresponding 32 bit equivalent.
Yes, I understand that, but it's still pointless churn. The
intermediate step is no improvement over what was there before, and
isn't any closer to the desired final result.
Once you've made the pda a percpu variable, and redefined all the
X_pda macros in terms of x86_X_percpu, then there's no need to touch
all the usage sites until you're *actually* going to unify something.
Touching them all just because you find "X_pda" unsightly doesn't help
anyone. Ideally every site you touch will remove a #ifdef
CONFIG_X86_64, or make two as-yet unified pieces of code closer to
unification.
that makes sense. Does everyone agree on #1-#2-#3 and then gradual
elimination of most pda members (without going through an intermediate
renaming of pda members) being the way to go?
Ingo
This is fine with me... not much more work required to go "all the way"... ;-)
Christoph Lameter
2008-07-09 19:44:19 UTC
Permalink
Post by Jeremy Fitzhardinge
Yes, I understand that, but it's still pointless churn. The
intermediate step is no improvement over what was there before, and
isn't any closer to the desired final result.
No, it's not pointless churn. We actually eliminate all the pda operations and use the per_cpu operations both on 32 and 64 bit. That is unification.

We would be glad if you could contribute the patches to get rid of the pda.xxx references. I do not think that either Mike or I have the 32 bit expertise needed to do that step. We went as far as we could. The patches are touching all the points of interest so locating the lines to fix should be easy.
Jeremy Fitzhardinge
2008-07-09 19:48:47 UTC
Permalink
Post by Christoph Lameter
We would be glad if you could contribute the patches to get rid of the pda.xxx references. I do not think that either Mike or I have the 32 bit expertise needed to do that step. We went as far as we could. The patches are touching all the points of interest so locating the lines to fix should be easy.
Yes, I'd be happy to contribute.

J
Mike Travis
2008-07-09 18:27:07 UTC
Permalink
Post by Christoph Lameter
Post by Mike Travis
I think Jeremy's point is that by removing the pda struct entirely, the
references to the fields can be the same for both x86_32 and x86_64.
That is going to be difficult. The GS register is tied up for the pda area
as long as you have it. And you cannot get rid of the pda because of the library
compatibility issues. We would break binary compatibility if we would get rid of the pda.
If one attempts to remove one field after another then the converted accesses will not be able to use GS relative accesses anymore. This can lead to all sorts of complications.
It will be possible to shrink the pda (as long as we maintain the fields that glibc needs) after this patchset because the pda and the per cpu area can both be reached with the GS register. So (apart from undiscovered surprises) the generated code is the same.
Is there a comprehensive list of these library accesses to variables
offset from %gs, or is it only the "stack_canary"?

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 18:46:57 UTC
Permalink
Post by Mike Travis
Post by Christoph Lameter
Post by Mike Travis
I think Jeremy's point is that by removing the pda struct entirely, the
references to the fields can be the same for both x86_32 and x86_64.
That is going to be difficult. The GS register is tied up for the pda area
as long as you have it. And you cannot get rid of the pda because of the library
compatibility issues. We would break binary compatibility if we would get rid of the pda.
If one attempts to remove one field after another then the converted accesses will not be able to use GS relative accesses anymore. This can lead to all sorts of complications.
It will be possible to shrink the pda (as long as we maintain the fields that glibc needs) after this patchset because the pda and the per cpu area can both be reached with the GS register. So (apart from undiscovered surprises) the generated code is the same.
Is there a comprehensive list of these library accesses to variables
offset from %gs, or is it only the "stack_canary"?
It's just the stack canary. It isn't library accesses; it's the code
gcc generates:

foo: subq $152, %rsp
movq %gs:40, %rax
movq %rax, 136(%rsp)
...
movq 136(%rsp), %rdx
xorq %gs:40, %rdx
je .L3
call __stack_chk_fail
.L3:
addq $152, %rsp
.p2align 4,,4
ret


There are two irritating things here:

One is that the kernel supports -fstack-protector for x86-64, which
forces us into all these contortions in the first place. We don't
support stack-protector for 32-bit (gcc does), and things are much easier.

The other somewhat orthogonal irritation is the fixed "40". If they'd
generated %gs:__gcc_stack_canary, then we could alias that to a per-cpu
variable like anything else and the whole problem would go away - and we
could support stack-protector on 32-bit with no problems (and normal
usermode could define __gcc_stack_canary to be a weak symbol with value
"40" (20 on 32-bit) for backwards compatibility).

I'm close to proposing that we run a post-processor over the generated
assembly to perform the %gs:40 -> %gs:__gcc_stack_canary transformation
and deal with it that way.

J
Eric W. Biederman
2008-07-09 20:22:06 UTC
Permalink
It's just the stack canary. It isn't library accesses; it's the code gcc
foo: subq $152, %rsp
movq %gs:40, %rax
movq %rax, 136(%rsp)
...
movq 136(%rsp), %rdx
xorq %gs:40, %rdx
je .L3
call __stack_chk_fail
addq $152, %rsp
.p2align 4,,4
ret
One is that the kernel supports -fstack-protector for x86-64, which forces us
into all these contortions in the first place. We don't support stack-protector
for 32-bit (gcc does), and things are much easier.
How does gcc know to use %gs instead of the usual %fs for accessing
the stack protector variable? My older gcc-4.1.x on ubuntu always uses %fs.
The other somewhat orthogonal irritation is the fixed "40". If they'd generated
%gs:__gcc_stack_canary, then we could alias that to a per-cpu variable like
anything else and the whole problem would go away - and we could support
stack-protector on 32-bit with no problems (and normal usermode could define
__gcc_stack_canary to be a weak symbol with value "40" (20 on 32-bit) for
backwards compatibility).
I'm close to proposing that we run a post-processor over the generated assembly
to perform the %gs:40 -> %gs:__gcc_stack_canary transformation and deal with it
that way.
Or we could do something completely evil. And use the other segment
register for the stack canary.

I think the unification is valid and useful, and that trying to keep
that stupid stack canary working is currently more trouble than it is
worth.

Eric
Jeremy Fitzhardinge
2008-07-09 20:35:58 UTC
Permalink
Post by Eric W. Biederman
How does gcc know to use %gs instead of the usual %fs for accessing
the stack protector variable? My older gcc-4.1.x on ubuntu always uses %fs.
-mcmodel=kernel
Post by Eric W. Biederman
Or we could do something completely evil. And use the other segment
register for the stack canary.
That would still require gcc changes, so it doesn't help much.

J
Eric W. Biederman
2008-07-09 20:53:12 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Eric W. Biederman
Or we could do something completely evil. And use the other segment
register for the stack canary.
That would still require gcc changes, so it doesn't help much.
We could use %fs for the per cpu variables. Then we could set %gs to whatever
we wanted to sync up with gcc

Eric
Ingo Molnar
2008-07-09 21:03:56 UTC
Permalink
Post by Eric W. Biederman
Post by Jeremy Fitzhardinge
Post by Eric W. Biederman
Or we could do something completely evil. And use the other segment
register for the stack canary.
That would still require gcc changes, so it doesn't help much.
We could use %fs for the per cpu variables. Then we could set %gs to
whatever we wanted to sync up with gcc
one problem is, there's no SWAPFS instruction.

Ingo
H. Peter Anvin
2008-07-09 21:16:50 UTC
Permalink
Post by Eric W. Biederman
Post by Jeremy Fitzhardinge
Post by Eric W. Biederman
Or we could do something completely evil. And use the other segment
register for the stack canary.
That would still require gcc changes, so it doesn't help much.
We could use %fs for the per cpu variables. Then we could set %gs to whatever
we wanted to sync up with gcc
No swapfs instruction, and extra performance penalty because %fs is used
in userspace.

-hpa
Arjan van de Ven
2008-07-09 21:10:44 UTC
Permalink
On Wed, 09 Jul 2008 13:22:06 -0700
Post by Eric W. Biederman
Post by Jeremy Fitzhardinge
It's just the stack canary. It isn't library accesses; it's the
foo: subq $152, %rsp
movq %gs:40, %rax
movq %rax, 136(%rsp)
...
movq 136(%rsp), %rdx
xorq %gs:40, %rdx
je .L3
call __stack_chk_fail
addq $152, %rsp
.p2align 4,,4
ret
One is that the kernel supports -fstack-protector for x86-64, which
forces us into all these contortions in the first place. We don't
support stack-protector for 32-bit (gcc does), and things are much easier.
How does gcc know to use %gs instead of the usual %fs for accessing
the stack protector variable? My older gcc-4.1.x on ubuntu always uses %fs.
ubuntu broke gcc (they don't want to have compiler flags per package so
patches stuff in gcc instead).
Post by Eric W. Biederman
I think the unification is valid and useful, and that trying to keep
that stupid stack canary working is currently more trouble than it is
worth.
I think that "unification over everything" is stupid, especially if it
removes useful features.
--
If you want to reach me at my work email, use ***@linux.intel.com
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
H. Peter Anvin
2008-07-09 18:31:29 UTC
Permalink
Post by Christoph Lameter
Post by Mike Travis
I think Jeremy's point is that by removing the pda struct entirely, the
references to the fields can be the same for both x86_32 and x86_64.
That is going to be difficult. The GS register is tied up for the pda area
as long as you have it. And you cannot get rid of the pda because of the library
compatibility issues. We would break binary compatibility if we would get rid of the pda.
We're talking about the kernel here... who gives a hoot about binary
compatibility?

-hpa
Mike Travis
2008-07-09 18:00:07 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Mike Travis
* Cleanup: Fix early references to cpumask_of_cpu(0)
Provides an early cpumask_of_cpu(0) usable before the
cpumask_of_cpu_map
is allocated and initialized.
* Generic: Percpu infrastructure to rebase the per cpu area to zero
This provides for the capability of accessing the percpu variables
using a local register instead of having to go through a table
on node 0 to find the cpu-specific offsets. It also would allow
atomic operations on percpu variables to reduce required locking.
Uses a new config var HAVE_ZERO_BASED_PER_CPU to indicate to the
generic code that the arch has this new basing.
(Note: split into two patches, one to rebase percpu variables at 0,
and the second to actually use %gs as the base for percpu variables.)
* x86_64: Fold pda into per cpu area
Declare the pda as a per cpu variable. This will move the pda
area to an address accessible by the x86_64 per cpu macros.
Subtraction of __per_cpu_start will make the offset based from
the beginning of the per cpu area. Since %gs is pointing to the
pda, it will then also point to the per cpu variables and can be
%gs:[&per_cpu_xxxx - __per_cpu_start]
* x86_64: Rebase per cpu variables to zero
Take advantage of the zero-based per cpu area provided above.
Then we can directly use the x86_32 percpu operations. x86_32
offsets %fs by __per_cpu_start. x86_64 has %gs pointing directly
to the pda and the per cpu area thereby allowing access to the
pda with the x86_64 pda operations and access to the per cpu
variables using x86_32 percpu operations.
The bulk of this series is pda_X to x86_X_percpu conversion. This seems
like pointless churn to me; there's nothing inherently wrong with the
pda_X interfaces, and doing this transformation doesn't get us any
closer to unifying 32 and 64 bit.
I think we should start devolving things out of the pda in the other
direction: make a series where each patch takes a member of struct
x8664_pda, converts it to a per-cpu variable (where possible, the same
one that 32-bit uses), and updates all the references accordingly. When
the pda is as empty as it can be, we can look at removing the
pda-specific interfaces.
J
I did compartmentalize the changes so they were in separate patches,
and in particular, by separating the changes to the include files, I
was able to zero in on some problems much easier.

But I have no objections to leaving the cpu_pda ops in place and then,
as you're suggesting, extract and modify the fields as appropriate.

Another approach would be to leave the changes from XXX_pda() to
x86_percpu_XXX in place, and do the patches with simply changing
pda.VAR to VAR .)

In any case I would like to get this version working first, before
attempting that rewrite, as that won't change the generated code.

Btw, while I've got your attention... ;-), there's some code in
arch/x86/xen/smp.c:xen_smp_prepare_boot_cpu() that should be looked
at closer for zero-based per_cpu__gdt_page:

make_lowmem_page_readwrite(&per_cpu__gdt_page);

(I wasn't sure how to deal with this but I suspect the __percpu_offset[]
or __per_cpu_load should be added to it.)

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 19:05:26 UTC
Permalink
Post by Mike Travis
I did compartmentalize the changes so they were in separate patches,
and in particular, by separating the changes to the include files, I
was able to zero in on some problems much easier.
But I have no objections to leaving the cpu_pda ops in place and then,
as you're suggesting, extract and modify the fields as appropriate.
Another approach would be to leave the changes from XXX_pda() to
x86_percpu_XXX in place, and do the patches with simply changing
pda.VAR to VAR .)
Yes, but that's still two patches where one would do. If I'm going to
go through the effort of reconciling your percpu patches with my code,
I'd like to be able to remove some #ifdef CONFIG_X86_64s in the process.
Post by Mike Travis
In any case I would like to get this version working first, before
attempting that rewrite, as that won't change the generated code.
Well, as far as I can tell the real meat of the series is in 1-3 and the
rest is fluff. If just applying 1-3 works, then everything else should too.
Post by Mike Travis
Btw, while I've got your attention... ;-), there's some code in
arch/x86/xen/smp.c:xen_smp_prepare_boot_cpu() that should be looked
make_lowmem_page_readwrite(&per_cpu__gdt_page);
(I wasn't sure how to deal with this but I suspect the __percpu_offset[]
or __per_cpu_load should be added to it.)
Already fixed in the mass of patches I posted yesterday. I turned it
into &per_cpu_var(gdt_page)).

J
Ingo Molnar
2008-07-09 19:28:22 UTC
Permalink
Post by Mike Travis
* x86_64: Rebase per cpu variables to zero
Take advantage of the zero-based per cpu area provided above. Then
we can directly use the x86_32 percpu operations. x86_32 offsets
%fs by __per_cpu_start. x86_64 has %gs pointing directly to the
pda and the per cpu area thereby allowing access to the pda with
the x86_64 pda operations and access to the per cpu variables
using x86_32 percpu operations.
hm, have the binutils (or gcc) problems with this been resolved?

If common binutils versions miscompile the kernel with this feature then
i guess we cannot just unconditionally enable it. (My hope is that it's
not necessarily a binutils bug but some broken assumption of the kernel
somewhere.)

Ingo
Mike Travis
2008-07-09 20:55:15 UTC
Permalink
Post by Ingo Molnar
Post by Mike Travis
* x86_64: Rebase per cpu variables to zero
Take advantage of the zero-based per cpu area provided above. Then
we can directly use the x86_32 percpu operations. x86_32 offsets
%fs by __per_cpu_start. x86_64 has %gs pointing directly to the
pda and the per cpu area thereby allowing access to the pda with
the x86_64 pda operations and access to the per cpu variables
using x86_32 percpu operations.
hm, have the binutils (or gcc) problems with this been resolved?
If common binutils versions miscompile the kernel with this feature then
i guess we cannot just unconditionally enable it. (My hope is that it's
not necessarily a binutils bug but some broken assumption of the kernel
somewhere.)
Ingo
Currently I'm using gcc-4.2.4. Which are you using?

I labeled it "RFC" as it does not quite work without THREAD_ORDER bumped to 2.
And I believe the stack overflow is happening because of some interrupt
routine as it does not happen on our simulator.

After that is taken care of, I'll start regression testing earlier compilers.
I think someone mentioned that gcc-2.something was the minimum required...?

Thanks,
Mike
Ingo Molnar
2008-07-09 21:12:56 UTC
Permalink
Post by Mike Travis
After that is taken care of, I'll start regression testing earlier
compilers. I think someone mentioned that gcc-2.something was the
minimum required...?
i think the current official minimum is around gcc-3.2 [2.x is out of
question because we have a few feature dependencies on gcc-3.x] - but i
stopped using it because it miscompiles the kernel so often. 4.0 was
really bad due to large stack footprint. The 4.3.x series miscompiles
the kernel too in certain situations - there was a high-rising
kerneloops.org crash recently in ext3.

So in general, 'too new' is bad because it has new regressions, 'too
old' is bad because it has unfixed old regressions. Somewhere in the
middle, 4.2.x-ish, seems to be pretty robust in practice.

Ingo
Eric W. Biederman
2008-07-09 20:00:19 UTC
Permalink
I just took a quick look at how stack_protector works on x86_64. Unless there is
some deep kernel magic that changes the segment register to %gs from the ABI specified
%fs CC_STACKPROTECTOR is totally broken on x86_64. We access our pda through %gs.

Further -fstack-protector-all only seems to detect against buffer overflows and
thus corruption of the stack. Not stack overflows. So it doesn't appear especially
useful.

So why don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying to figure out
how to use a zero based percpu area.

That should allow us to make the current pda a per cpu variable, and use %gs with
a large offset to access the per cpu area. And since it is only the per cpu accesses
and the pda accesses that will change we should not need to fight toolchain issues
and other weirdness. The linked binary can remain the same.

Eric
Jeremy Fitzhardinge
2008-07-09 20:05:33 UTC
Permalink
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64. Unless there is
some deep kernel magic that changes the segment register to %gs from the ABI specified
%fs CC_STACKPROTECTOR is totally broken on x86_64. We access our pda through %gs.
-mcmodel=kernel switches it to using %gs.
Post by Eric W. Biederman
Further -fstack-protector-all only seems to detect against buffer overflows and
thus corruption of the stack. Not stack overflows. So it doesn't appear especially
useful.
It's a bit useful. But at the cost of preventing a pile of more useful
unification work, not to mention making all access to per-cpu variables
more expensive.
Post by Eric W. Biederman
So we don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying to figure out
how to use a zero based percpu area.
Yes, please.
Post by Eric W. Biederman
That should allow us to make the current pda a per cpu variable, and use %gs with
a large offset to access the per cpu area. And since it is only the per cpu accesses
and the pda accesses that will change we should not need to fight toolchain issues
and other weirdness. The linked binary can remain the same.
Yes, and it would be functionally identical to the 32-bit code.

J
Ingo Molnar
2008-07-09 20:15:01 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Eric W. Biederman
Further -fstack-protector-all only seems to detect against buffer
overflows and thus corruption of the stack. Not stack overflows. So
it doesn't appear especially useful.
It's a bit useful. But at the cost of preventing a pile of more
useful unification work, not to mention making all access to per-cpu
variables more expensive.
well, stackprotector is near zero maintenance trouble. It mostly binds
in places that are fundamentally non-unifiable anyway. (nobody is going
to unify the assembly code in switch_to())

i had zero-based percpu problems (early crashes) with a 4.2.3 gcc that
had --fstack-protect compiled out, so there's no connection there.

In its fixed form in tip/core/stackprotector it can catch the splice
exploit which makes it quite a bit useful. It would be rather silly to
not offer that feature.

Ingo
Ingo Molnar
2008-07-09 20:07:57 UTC
Permalink
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64.
Unless there is some deep kernel magic that changes the segment
register to %gs from the ABI specified %fs CC_STACKPROTECTOR is
totally broken on x86_64. We access our pda through %gs.
Further -fstack-protector-all only seems to detect against buffer
overflows and thus corruption of the stack. Not stack overflows. So
it doesn't appear especially useful.
CC_STACKPROTECTOR, as fixed in -tip, can catch the splice exploit, and
there's no known breakage in it.

Deep stack recursion itself is not really interesting. (as that cannot
arbitrarily be triggered by attackers in most cases) Random overflows of
buffers on the stackframe is a lot more common, and that's what
stackprotector protects against.

( Note that deep stack recursion can be caught via another debug
mechanism, ftrace's mcount approach. )
Post by Eric W. Biederman
So we don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying
to figure out how to use a zero based percpu area.
Note that the zero-based percpu problems are completely unrelated to
stackprotector. I was able to hit them with a stackprotector-disabled
gcc-4.2.3 environment.

Ingo
Jeremy Fitzhardinge
2008-07-09 20:11:03 UTC
Permalink
Post by Ingo Molnar
Note that the zero-based percpu problems are completely unrelated to
stackprotector. I was able to hit them with a stackprotector-disabled
gcc-4.2.3 environment.
The only reason we need to keep a zero-based pda is to support
stack-protector. If we drop it, we can drop the pda - and its
special zero-based properties - entirely.

J
Christoph Lameter
2008-07-09 20:18:01 UTC
Permalink
Post by Jeremy Fitzhardinge
Post by Ingo Molnar
Note that the zero-based percpu problems are completely unrelated to
stackprotector. I was able to hit them with a stackprotector-disabled
gcc-4.2.3 environment.
The only reason we need to keep a zero-based pda is to support
stack-protector. If we drop drop it, we can drop the pda - and its
special zero-based properties - entirely.
Another reason to use a zero based per cpu area is to limit the offset range. Limiting the offset range allows in turn to limit the size of the generated instructions because it is part of the instruction. It also is easier to handle since __per_cpu_start does not figure
in the calculation of the offsets.
Jeremy Fitzhardinge
2008-07-09 20:33:33 UTC
Permalink
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
Post by Ingo Molnar
Note that the zero-based percpu problems are completely unrelated to
stackprotector. I was able to hit them with a stackprotector-disabled
gcc-4.2.3 environment.
The only reason we need to keep a zero-based pda is to support
stack-protector. If we drop drop it, we can drop the pda - and its
special zero-based properties - entirely.
Another reason to use a zero based per cpu area is to limit the offset range. Limiting the offset range allows in turn to limit the size of the generated instructions because it is part of the instruction.
No, it makes no difference. %gs:X always has a 32-bit offset in the
instruction, regardless of how big X is:

mov %eax, %gs:0
mov %eax, %gs:0x1234567
->
0: 65 89 04 25 00 00 00 00 mov %eax,%gs:0x0
8: 65 89 04 25 67 45 23 01 mov %eax,%gs:0x1234567
Post by Christoph Lameter
It also is easier to handle since __per_cpu_start does not figure
in the calculation of the offsets.
No, you do it the same as i386. You set the segment base to be
percpu_area-__per_cpu_start, and then just refer to %gs:per_cpu__foo
directly. You can use rip-relative addressing to make it a smaller
addressing mode too:

0: 65 89 05 00 00 00 00 mov %eax,%gs:0(%rip) # 0x7


J
Jeremy Fitzhardinge
2008-07-09 20:48:09 UTC
Permalink
Thinking about this some more, I don't know if it would make sense to
put the x86-64 stack canary at the *end* of the percpu area, and
otherwise use negative offsets. That would make sure they were
readily reachable from %rip-based references from within the kernel
text area.
If we can move the canary then a whole pile of options open up. But the
problem is that we can't.

J
Eric W. Biederman
2008-07-09 21:06:27 UTC
Permalink
Thinking about this some more, I don't know if it would make sense to put the
x86-64 stack canary at the *end* of the percpu area, and otherwise use
negative offsets. That would make sure they were readily reachable from
%rip-based references from within the kernel text area.
If we can move the canary then a whole pile of options open up. But the problem
is that we can't.
But we can pick an arbitrary point where %gs points at.

Hmm. This whole thing is even sillier than I thought.
Why can't we access per cpu vars as:
%gs:(per_cpu__var - __per_cpu_start) ?

If we can subtract constants and allow the linker to perform that resolution
at link. A zero based per cpu segment becomes a moot issue.

We may need to change the definition of PERCPU in vmlinux.lds.h to
#define PERCPU(align) \
. = ALIGN(align); \
- __per_cpu_start = .; \
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
+ __per_cpu_start = .; \
*(.data.percpu) \
*(.data.percpu.shared_aligned) \
+ __per_cpu_end = .; \
+ }
- } \
- __per_cpu_end = .;


So that the linker knows __per_cpu_start and __per_cpu_end are in the same section
but otherwise it sounds entirely reasonable. Just slightly trickier math at link
time.

Eric
Jeremy Fitzhardinge
2008-07-09 21:20:44 UTC
Permalink
Post by Eric W. Biederman
But we can pick an arbitrary point where %gs points at.
Hmm. This whole thing is even sillier then I thought.
%gs:(per_cpu__var - __per_cpu_start) ?
Because there's no linker reloc for doing subtraction (or addition) of
two symbols.
Post by Eric W. Biederman
If we can subtract constants and allow the linker to perform that resolution
at link. A zero based per cpu segment becomes a moot issue.
They're not constants; they're symbols.

J
H. Peter Anvin
2008-07-09 21:16:03 UTC
Permalink
Post by Eric W. Biederman
But we can pick an arbitrary point where %gs points at.
Hmm. This whole thing is even sillier then I thought.
%gs:(per_cpu__var - __per_cpu_start) ?
If we can subtract constants and allow the linker to perform that resolution
at link. A zero based per cpu segment becomes a moot issue.
And then we're back here again!

Supposedly the linker buggers up, although we don't have conclusive
evidence...

-hpa
H. Peter Anvin
2008-07-09 20:42:16 UTC
Permalink
Post by Jeremy Fitzhardinge
No, you do it the same as i386. You set the segment base to be
percpu_area-__per_cpu_start, and then just refer to %gs:per_cpu__foo
directly. You can use rip-relative addressing to make it a smaller
0: 65 89 05 00 00 00 00 mov %eax,%gs:0(%rip) # 0x7
Thinking about this some more, I don't know if it would make sense to
put the x86-64 stack canary at the *end* of the percpu area, and
otherwise use negative offsets. That would make sure they were readily
reachable from %rip-based references from within the kernel text area.

-hpa
Christoph Lameter
2008-07-09 21:25:33 UTC
Permalink
Post by Jeremy Fitzhardinge
No, it makes no difference. %gs:X always has a 32-bit offset in the
mov %eax, %gs:0
mov %eax, %gs:0x1234567
->
0: 65 89 04 25 00 00 00 00 mov %eax,%gs:0x0
8: 65 89 04 25 67 45 23 01 mov %eax,%gs:0x1234567
The processor itself supports smaller offsets.

Note also that the 32 bit offset size limits the offset that can be added to the segment register. You either need to place the per cpu area either in the last 2G of the address space or in the first 2G. The zero based approach removes that limitation.
Post by Jeremy Fitzhardinge
Post by Christoph Lameter
It also is easier to handle since __per_cpu_start does not figure
in the calculation of the offsets.
No, you do it the same as i386. You set the segment base to be
percpu_area-__per_cpu_start, and then just refer to %gs:per_cpu__foo
directly. You can use rip-relative addressing to make it a smaller
0: 65 89 05 00 00 00 00 mov %eax,%gs:0(%rip) # 0x7
RIP relative also implies a 32 bit offset meaning that the code cannot be more than 2G away from the per cpu area.
Jeremy Fitzhardinge
2008-07-09 21:41:14 UTC
Permalink
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
No, it makes no difference. %gs:X always has a 32-bit offset in the
mov %eax, %gs:0
mov %eax, %gs:0x1234567
->
0: 65 89 04 25 00 00 00 00 mov %eax,%gs:0x0
8: 65 89 04 25 67 45 23 01 mov %eax,%gs:0x1234567
The processor itself supports smaller offsets.
Not in 64-bit mode. In 32-bit mode you can use the addr16 prefix, but
that would only save a byte per use (and I doubt it's a fast-path in the
processor).
Post by Christoph Lameter
Note also that the 32 bit offset size limits the offset that can be added to the segment register. You either need to place the per cpu area either in the last 2G of the address space or in the first 2G. The zero based approach removes that limitation.
No. The %gs base is a full 64-bit value you can put anywhere in the
address space. So long as your percpu data is within 2G of that point
you can get to it directly.
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
0: 65 89 05 00 00 00 00 mov %eax,%gs:0(%rip) # 0x7
RIP relative also implies a 32 bit offset meaning that the code cannot be more than 2G away from the per cpu area.
It means the percpu symbols must be within 2G of your code. We can't
compile the kernel any other way (there's no -mcmodel=large-kernel).

J
H. Peter Anvin
2008-07-09 21:36:29 UTC
Permalink
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
No, it makes no difference. %gs:X always has a 32-bit offset in the
mov %eax, %gs:0
mov %eax, %gs:0x1234567
->
0: 65 89 04 25 00 00 00 00 mov %eax,%gs:0x0
8: 65 89 04 25 67 45 23 01 mov %eax,%gs:0x1234567
The processor itself supports smaller offsets.
No, it doesn't, unless you have a base register. There is no naked
disp8 form, and disp16 is only available in 16- or 32-bit mode (and in
32-bit form it requires a 67h prefix.)
Post by Christoph Lameter
Note also that the 32 bit offset size limits the offset that can be added to the segment register. You either need to place the per cpu area either in the last 2G of the address space or in the first 2G. The zero based approach removes that limitation.

The offset is either ±2 GB from the segment register, or ±2 GB from the
segment register plus %rip. The latter is more efficient.

The processor *does* permit a 64-bit absolute form, which can be used
with a segment register, but that one is hideously restricted (only move
to/from %rax) and bloated (10 bytes!)
Post by Christoph Lameter
Post by Jeremy Fitzhardinge
0: 65 89 05 00 00 00 00 mov %eax,%gs:0(%rip) # 0x7
Post by Christoph Lameter
RIP relative also implies a 32 bit offset meaning that the code cannot be more than 2G away from the per cpu area.

Not from the per cpu area, but from the linked address of the per cpu
area (the segment register base can point anywhere.)

In our case means between -2 GB and a smallish positive value (I believe
it is guaranteed to be 2 MB or more.)

Being able to use %rip-relative forms would save a byte per reference,
which is valuable.

-hpa
Eric W. Biederman
2008-07-09 22:22:44 UTC
Permalink
Post by Christoph Lameter
Note also that the 32 bit offset size limits the offset that can be added to the
segment register. You either need to place the per cpu area either in the last
2G of the address space or in the first 2G. The zero based approach removes that
limitation.
Good point. Which means that fundamentally we need to come up with a special
linker segment or some other way to guarantee that the offsets we use for per
cpu variables is within 2G of the segment register.

Which means that my idea of using the technique we use on x86_32 will not work.

Eric

H. Peter Anvin
2008-07-09 20:35:45 UTC
Permalink
Another reason to use a zero based per cpu area is to limit the offset
range. Limiting the offset range allows in turn to limit the size of
the generated instructions because it is part of the instruction. It
also is easier to handle since __per_cpu_start does not figure
in the calculation of the offsets.

No, that makes no difference. There is no short-offset form that
doesn't involve a register (ignoring the 16-bit 67h form on 32 bits.)

For 64 bits, you want to keep the offsets within %rip±2 GB, or you will
have relocation overflows for %rip-based forms; for absolute forms you
have to be in the range 0-4 GB. The %rip-based forms are shorter, and
I'm pretty sure they're the ones we currently generate. Since we base
the kernel at 0xffffffff80000000 (-2 GB) this means a zero-based offset
is actively wrong, and only works by accident (since the first
CONFIG_PHYSICAL_START of that space is unused.)

-hpa
Arjan van de Ven
2008-07-09 20:39:58 UTC
Permalink
On Wed, 09 Jul 2008 13:11:03 -0700
Post by Jeremy Fitzhardinge
Post by Ingo Molnar
Note that the zero-based percpu problems are completely unrelated
to stackprotector. I was able to hit them with a
stackprotector-disabled gcc-4.2.3 environment.
The only reason we need to keep a zero-based pda is to support
stack-protector. If we drop drop it, we can drop the pda - and its
special zero-based properties - entirely.
what's wrong with zero based btw?

do they stop us from using gcc's __thread keyword for per cpu variables
or something? (*that* would be a nice feature)

or does it stop us from putting the per cpu variables starting from
offset 4096 onwards?
--
If you want to reach me at my work email, use ***@linux.intel.com
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
Jeremy Fitzhardinge
2008-07-09 20:46:49 UTC
Permalink
Post by Arjan van de Ven
what's wrong with zero based btw?
Nothing in princple. In practice it's triggering an amazing variety of
toolchain bugs.
Post by Arjan van de Ven
do they stop us from using gcc's __thread keyword for per cpu variables
or something? (*that* would be a nice feature)
The powerpc guys tried it, and it doesn't work. per-cpu is not
semantically equivalent to per-thread. If you have a function in which
you refer to a percpu variable and then have a preemptable section in
the middle followed by another reference to the same percpu variable,
it's hard to stop gcc from caching a reference to the old tls variable,
even though we may have switched cpus in the meantime.

Also, we explicitly use the other segment register in kernel mode, to
avoid segment register switches where possible. Even with
-mcmodel=kernel, gcc generates %fs references to tls variables.

J
Jeremy Fitzhardinge
2008-07-09 20:50:40 UTC
Permalink
1. it means pda references are invalid if their offsets are ever more
than CONFIG_PHYSICAL_BASE (which I do not think is likely, but still...)
Why?

As an aside, could we solve the problems by making CONFIG_PHYSICAL_BASE
0 - putting the percpu variables as the first thing in the kernel - and
relocating on load? That would avoid having to make a special PT_LOAD
segment at 0. Hm, would that result in the pda and the boot params
getting mushed together?

J
H. Peter Anvin
2008-07-09 21:12:14 UTC
Permalink
Post by Jeremy Fitzhardinge
1. it means pda references are invalid if their offsets are ever more
than CONFIG_PHYSICAL_BASE (which I do not think is likely, but still...)
Why?
As an aside, could we solve the problems by making CONFIG_PHYSICAL_BASE
0 - putting the percpu variables as the first thing in the kernel - and
relocating on load? That would avoid having to make a special PT_LOAD
segment at 0. Hm, would that result in the pda and the boot params
getting mushed together?
CONFIG_PHYSICAL_START rather. And no, it can't be zero! Realistically
we should make it 16 MB by default (currently 2 MB), to keep the DMA
zone clear.

Either way, I really suspect that the right thing to do is to use
negative offsets, with the possible exception of a handful of things (40
bytes or less, perhaps like current) which can get small positive
offsets and end up in the "super hot" cacheline.

The sucky part is that I don't believe GNU ld has native support for a
"hanging down" section (one which has a fixed endpoint rather than a
starting point), so it requires extra magic around the link (or finding
some way to do it with linker script functions.) Let me see if I can
cook up something in linker script that would actually work.

-hpa
Jeremy Fitzhardinge
2008-07-09 21:26:08 UTC
Permalink
Post by H. Peter Anvin
Either way, I really suspect that the right thing to do is to use
negative offsets, with the possible exception of a handful of things
(40 bytes or less, perhaps like current) which can get small positive
offsets and end up in the "super hot" cacheline.
The sucky part is that I don't believe GNU ld has native support for a
"hanging down" section (one which has a fixed endpoint rather than a
starting point), so it requires extra magic around the link (or
finding some way to do it with linker script functions.)
If you're going to do another linker pass, you could have a script to
extract all the percpu symbols and generate a set of derived zero-based
ones and then link against that.

Or generate a vmlinux with relocations and "relocate" all the percpu
symbols down to 0.

J
H. Peter Anvin
2008-07-09 21:37:13 UTC
Permalink
Post by Jeremy Fitzhardinge
If you're going to do another linker pass, you could have a script to
extract all the percpu symbols and generate a set of derived zero-based
ones and then link against that.
Or generate a vmlinux with relocations and "relocate" all the percpu
symbols down to 0.
Yeah, I'd hate to have to go to either of those lengths though.

-hpa
Eric W. Biederman
2008-07-09 22:10:40 UTC
Permalink
1. it means pda references are invalid if their offsets are ever more than
CONFIG_PHYSICAL_BASE (which I do not think is likely, but still...)
Why?
As an aside, could we solve the problems by making CONFIG_PHYSICAL_BASE 0 -
putting the percpu variables as the first thing in the kernel - and relocating
on load? That would avoid having to make a special PT_LOAD segment at 0. Hm,
would that result in the pda and the boot params getting mushed together?
CONFIG_PHYSICAL_START rather. And no, it can't be zero! Realistically we
should make it 16 MB by default (currently 2 MB), to keep the DMA zone clear.
Also on x86_64 CONFIG_PHYSICAL_START is irrelevant as the kernel text segment
is linked at a fixed address -2G and the option only determines the virtual
to physical address mapping.

That said the idea may not be too far off.

Potentially we could put the percpu area at our fixed -2G address and then
we have a constant (instead of an address) we could subtract from this address.

Eric
H. Peter Anvin
2008-07-09 22:23:45 UTC
Permalink
Post by Eric W. Biederman
CONFIG_PHYSICAL_START rather. And no, it can't be zero! Realistically we
should make it 16 MB by default (currently 2 MB), to keep the DMA zone clear.
Also on x86_64 CONFIG_PHYSICAL_START is irrelevant as the kernel text segment
is liked at a fixed address -2G and the option only determines the virtual
to physical address mapping.
No, it's not irrelevant; we currently base the kernel at virtual address
-2 GB (KERNEL_IMAGE_START) + CONFIG_PHYSICAL_START, in order to have the
proper alignment for large pages.

Now, it probably wouldn't hurt moving KERNEL_IMAGE_START up a bit to
have low positive values safer to use.
Post by Eric W. Biederman
That said the idea may not be too far off.
Potentially we could put the percpu area at our fixed -2G address and then
we have a constant (instead of an address) we could subtract from this address.
We can't put it at -2 GB since the offset +40 for the stack sentinel is
hard-coded into gcc. This leaves growing upward from +48 (or another
small positive number), or growing down from zero (or +40) as realistic
options.

Unfortunately, GNU ld handles grow-down not at all.

-hpa
H. Peter Anvin
2008-07-09 20:44:15 UTC
Permalink
Post by Arjan van de Ven
On Wed, 09 Jul 2008 13:11:03 -0700
Post by Jeremy Fitzhardinge
Post by Ingo Molnar
Note that the zero-based percpu problems are completely unrelated
to stackprotector. I was able to hit them with a
stackprotector-disabled gcc-4.2.3 environment.
The only reason we need to keep a zero-based pda is to support
stack-protector. If we drop drop it, we can drop the pda - and its
special zero-based properties - entirely.
what's wrong with zero based btw?
Two problems:

1. it means pda references are invalid if their offsets are ever more
than CONFIG_PHYSICAL_BASE (which I do not think is likely, but still...)

2. some vague hints of a linker bug.

-hpa
Arjan van de Ven
2008-07-09 20:14:05 UTC
Permalink
On Wed, 09 Jul 2008 13:00:19 -0700
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64.
Unless there is some deep kernel magic that changes the segment
register to %gs from the ABI specified %fs CC_STACKPROTECTOR is
totally broken on x86_64. We access our pda through %gs.
and so does gcc in kernel mode.
Post by Eric W. Biederman
Further -fstack-protector-all only seems to detect against buffer
overflows and thus corruption of the stack. Not stack overflows. So
it doesn't appear especially useful.
stopping buffer overflows and other return address corruption is not
useful? Excuse me?
Post by Eric W. Biederman
So we don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying
to figure out how to use a zero based percpu area.
So why don't we NOT do that and fix instead what you're trying to do?
Post by Eric W. Biederman
That should allow us to make the current pda a per cpu variable, and
use %gs with a large offset to access the per cpu area.
and what does that gain us?
--
If you want to reach me at my work email, use ***@linux.intel.com
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
Eric W. Biederman
2008-07-09 20:33:21 UTC
Permalink
Post by Arjan van de Ven
On Wed, 09 Jul 2008 13:00:19 -0700
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64.
Unless there is some deep kernel magic that changes the segment
register to %gs from the ABI specified %fs CC_STACKPROTECTOR is
totally broken on x86_64. We access our pda through %gs.
and so does gcc in kernel mode.
Some gcc's in kernel mode. The one I tested with doesn't.
Post by Arjan van de Ven
Post by Eric W. Biederman
Further -fstack-protector-all only seems to detect against buffer
overflows and thus corruption of the stack. Not stack overflows. So
it doesn't appear especially useful.
stopping buffer overflows and other return address corruption is not
useful? Excuse me?
Stopping buffer overflows and return address corruption is useful. Simply
catching and panic'ing the machine when they occur is less useful. We aren't
perfect but we have a pretty good track record of handling this with
old fashioned methods.
Post by Arjan van de Ven
Post by Eric W. Biederman
So we don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying
to figure out how to use a zero based percpu area.
So why don't we NOT do that and fix instead what you're trying to do?
So our choices are.
fix -fstack-protector to not use a hard coded offset.
fix gcc/ld to not miscompile the kernel at random times that prevents us from
booting when we add a segment with an address at 0.

-fstack-protector does not use the TLS ABI and instead uses nasty hard coded magic
and that is why it is a problem. Otherwise we could easily support it.
Post by Arjan van de Ven
Post by Eric W. Biederman
That should allow us to make the current pda a per cpu variable, and
use %gs with a large offset to access the per cpu area.
and what does that gain us?
A faster more maintainable kernel.

Eric
Ingo Molnar
2008-07-09 21:01:20 UTC
Permalink
Post by Eric W. Biederman
Post by Arjan van de Ven
On Wed, 09 Jul 2008 13:00:19 -0700
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64.
Unless there is some deep kernel magic that changes the segment
register to %gs from the ABI specified %fs CC_STACKPROTECTOR is
totally broken on x86_64. We access our pda through %gs.
and so does gcc in kernel mode.
Some gcc's in kernel mode. The one I tested with doesn't.
yes - stackprotector enabled distros build with kernel mode enabled gcc.
Post by Eric W. Biederman
Post by Arjan van de Ven
Post by Eric W. Biederman
Further -fstack-protector-all only seems to detect against buffer
overflows and thus corruption of the stack. Not stack overflows.
So it doesn't appear especially useful.
stopping buffer overflows and other return address corruption is not
useful? Excuse me?
Stopping buffer overflows and return address corruption is useful.
Simply catching and panic'ing the machine when they occur is less
useful. [...]
why? I personally prefer an informative panic in an overflow-suspect
piece of code instead of a guest root on my machine.

I think you miss one of the fundamental security aspects here. The panic
is not there just to inform the administrator - although it certainly
has such a role.

It is mainly there to _deter_ attackers from experimenting with certain
exploits.

For the more sophisticated attackers (not the script kiddies - the ones
who can do serious economic harm) their exploits and their attack
vectors are their main assets. They want their exploits to work on the
next target as well, and they want to be as stealth as possible.

For a script kiddie crashing a box is not a big issue - they work with
public exploits.

This means that the serious attempts will only use an attack if its
effects are 100% deterministic - they won't risk something like a 50%/50%
chance of a crash (or even a 10% chance of a crash). Some of the most
sophisticated kernel exploits i've seen had like 80% of their code
complexity in making sure that they don't crash the target box. They were
more resilient code than a lot of kernel code we have.
Post by Eric W. Biederman
[...] We aren't perfect but we have a pretty good track record of
handling this with old fashioned methods.
That's your opinion. A valid counter point is that more layers of
defense, in a fundamentally fragile area (buffers on the stack, return
addresses), cannot hurt. If you've got a firewall that is only 10% busy
even under peak load it's a valid option to spend some CPU cycles on
preventive measures.

A firewall _itself_ is huge overhead already - so there's absolutely no
valid technical reason to forbid a firewall from having something like
stackprotector built into its kernel. (and into most of its userspace)

We could have caught the vmsplice exploit as well with stackprotector -
but our security QA was not strong enough to keep it from slowly
regressing. (without anyone noticing) That's fixed now in
tip/core/stackprotector.

Ingo
Mike Travis
2008-07-09 21:39:41 UTC
Permalink
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64. Unless there is
some deep kernel magic that changes the segment register to %gs from the ABI specified
%fs CC_STACKPROTECTOR is totally broken on x86_64. We access our pda through %gs.
Further -fstack-protector-all only seems to detect against buffer overflows and
thus corruption of the stack. Not stack overflows. So it doesn't appear especially
useful.
So why don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying to figure out
how to use a zero based percpu area.
That should allow us to make the current pda a per cpu variable, and use %gs with
a large offset to access the per cpu area. And since it is only the per cpu accesses
and the pda accesses that will change we should not need to fight toolchain issues
and other weirdness. The linked binary can remain the same.
Eric
Hi Eric,

There is one pda op that I was not able to remove. Most likely it can be recoded
but it was a bit over my expertise. Most likely the "pda_offset(field)" can be
replaced with "per_cpu_var(field)" [per_cpu__##field], but for "_proxy_pda.field"
I wasn't sure about.

include/asm-x86/pda.h:

/*
* This is not atomic against other CPUs -- CPU preemption needs to be off
* NOTE: This relies on the fact that the cpu_pda is the *first* field in
* the per cpu area. Move it and you'll need to change this.
*/
#define test_and_clear_bit_pda(bit, field) \
({ \
int old__; \
asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
: "=r" (old__), "+m" (_proxy_pda.field) \
: "dIr" (bit), "i" (pda_offset(field)) : "memory");\
old__; \
})

And there is only one reference to it.

arch/x86/kernel/process_64.c:

static void __exit_idle(void)
{
if (test_and_clear_bit_pda(0, isidle) == 0)
return;
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

Thanks,
Mike
Jeremy Fitzhardinge
2008-07-09 21:47:51 UTC
Permalink
Post by Mike Travis
Post by Eric W. Biederman
I just took a quick look at how stack_protector works on x86_64. Unless there is
some deep kernel magic that changes the segment register to %gs from the ABI specified
%fs CC_STACKPROTECTOR is totally broken on x86_64. We access our pda through %gs.
Further -fstack-protector-all only seems to detect against buffer overflows and
thus corruption of the stack. Not stack overflows. So it doesn't appear especially
useful.
So why don't we kill the broken CONFIG_CC_STACKPROTECTOR. Stop trying to figure out
how to use a zero based percpu area.
That should allow us to make the current pda a per cpu variable, and use %gs with
a large offset to access the per cpu area. And since it is only the per cpu accesses
and the pda accesses that will change we should not need to fight toolchain issues
and other weirdness. The linked binary can remain the same.
Eric
Hi Eric,
There is one pda op that I was not able to remove. Most likely it can be recoded
but it was a bit over my expertise. Most likely the "pda_offset(field)" can be
replaced with "per_cpu_var(field)" [per_cpu__##field], but for "_proxy_pda.field"
I wasn't sure about.
/*
* This is not atomic against other CPUs -- CPU preemption needs to be off
* NOTE: This relies on the fact that the cpu_pda is the *first* field in
* the per cpu area. Move it and you'll need to change this.
*/
#define test_and_clear_bit_pda(bit, field) \
({ \
int old__; \
asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
: "=r" (old__), "+m" (_proxy_pda.field) \
: "dIr" (bit), "i" (pda_offset(field)) : "memory");\
asm volatile("btr %2,%%gs:%1\n\tsbbl %0,%0" \
: "=r" (old__), "+m" (per_cpu_var(var)) \
: "dIr" (bit) : "memory");\


but it barely seems worthwhile if we really can't use test_and_clear_bit.

J
Eric W. Biederman
2008-07-09 21:55:29 UTC
Permalink
Post by Mike Travis
Hi Eric,
There is one pda op that I was not able to remove. Most likely it can be recoded
but it was a bit over my expertise. Most likely the "pda_offset(field)" can be
replaced with "per_cpu_var(field)" [per_cpu__##field], but for
"_proxy_pda.field"
I wasn't sure about.
If you notice we never use %1. My reading would be we just have the +m
there to tell the compiler we may be changing the field. So just
a reference to the per_cpu_var directly should be sufficient. Although
"memory" may actually be enough.

Eric
Continue reading on narkive:
Loading...