From: Tejun Heo Date: Fri, 14 Aug 2009 05:41:02 +0000 (+0900) Subject: Merge branch 'percpu-for-linus' into percpu-for-next X-Git-Tag: firefly_0821_release~12956^2^2~20 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=384be2b18a5f9475eab9ca2bdfa95cc1a04ef59c;p=firefly-linux-kernel-4.4.55.git Merge branch 'percpu-for-linus' into percpu-for-next Conflicts: arch/sparc/kernel/smp_64.c arch/x86/kernel/cpu/perf_counter.c arch/x86/kernel/setup_percpu.c drivers/cpufreq/cpufreq_ondemand.c mm/percpu.c Conflicts in core and arch percpu codes are mostly from commit ed78e1e078dd44249f88b1dd8c76dafb39567161 which substituted many num_possible_cpus() with nr_cpu_ids. As for-next branch has moved all the first chunk allocators into mm/percpu.c, the changes are moved from arch code to mm/percpu.c. Signed-off-by: Tejun Heo --- 384be2b18a5f9475eab9ca2bdfa95cc1a04ef59c diff --cc arch/sparc/kernel/smp_64.c index 6970333b48b8,3691907a43b4..9856d866b77b --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@@ -1478,26 -1491,25 +1478,26 @@@ void __init setup_per_cpu_areas(void size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; unsigned long delta, cpu; - size_t pcpu_unit_size; + size_t size_sum, pcpu_unit_size; size_t ptrs_size; + void **ptrs; - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0])); - ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); ++ ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); + ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); + ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, + PCPU_CHUNK_SIZE); - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PCPU_CHUNK_SIZE - pcpur_size); + free_bootmem(__pa(ptrs[cpu] + size_sum), + PCPU_CHUNK_SIZE - size_sum); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ diff --cc arch/x86/kernel/cpu/perf_counter.c index 13bd6d6cf0bd,900332b800f8..3d4ebbd2e129 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@@ -1559,8 -1798,9 +1798,9 @@@ void callchain_store(struct perf_callch entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); + static DEFINE_PER_CPU(int, in_nmi_frame); static void diff --cc arch/x86/kernel/setup_percpu.c index 7501bb14bd51,07d81916f212..a26ff61e2fb0 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@@ -176,35 -185,130 +176,35 @@@ static ssize_t __init setup_pcpu_lpage( return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); + /* allocate and build unit_map */ - unit_map_size = num_possible_cpus() * sizeof(int); ++ unit_map_size = nr_cpu_ids * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = nr_cpu_ids * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; } + nr_units = ret; - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < nr_cpu_ids - 1; i++) - for (j = i + 1; j < nr_cpu_ids; j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} + /* do the parameters look okay? */ + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = nr_units * unit_size; -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. 
- */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = nr_cpu_ids - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + ret = -EINVAL; + goto out_free; } } diff --cc drivers/cpufreq/cpufreq_conservative.c index a7ef465c83b9,bdea7e2f94ba..bc33ddc9c97c --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@@ -64,8 -64,14 +64,14 @@@ struct cpu_dbs_info_s unsigned int requested_freq; int cpu; unsigned int enable:1; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. + */ + struct mutex timer_mutex; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ diff --cc drivers/cpufreq/cpufreq_ondemand.c index 36f292a7bd01,d6ba14276bb1..d7a528c80de8 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@@ -70,10 -70,15 +70,15 @@@ struct cpu_dbs_info_s unsigned int freq_lo_jiffies; unsigned int freq_hi_jiffies; int cpu; - unsigned int enable:1, - sample_type:1; + unsigned int sample_type:1; + /* + * percpu mutex that serializes governor limit change with + * do_dbs_timer invocation. We do not want do_dbs_timer to run + * when user is changing the governor or limits. + */ + struct mutex timer_mutex; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@@ -193,6 -190,13 +191,13 @@@ static unsigned int powersave_bias_targ return freq_hi; } + static void ondemand_powersave_bias_init_cpu(int cpu) + { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, cpu); ++ struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + dbs_info->freq_table = cpufreq_frequency_get_table(cpu); + dbs_info->freq_lo = 0; + } + static void ondemand_powersave_bias_init(void) { int i; @@@ -569,9 -550,10 +551,10 @@@ static int cpufreq_governor_dbs(struct return rc; } + dbs_enable++; for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --cc include/asm-generic/vmlinux.lds.h index ab8ea9b7741e,6ad76bf5fb40..a43223af98b6 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@@ -30,15 -30,16 +30,13 @@@ * EXCEPTION_TABLE(...) 
* NOTES * - * __bss_start = .; - * BSS_SECTION(0, 0) - * __bss_stop = .; + * BSS_SECTION(0, 0, 0) * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + * + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init diff --cc mm/percpu.c index b3d0bcff8c7c,5fe37842e0ea..3f9f182f9b44 --- a/mm/percpu.c +++ b/mm/percpu.c @@@ -1003,8 -747,9 +1003,8 @@@ static struct pcpu_chunk *alloc_pcpu_ch chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; @@@ -1290,59 -1052,24 +1290,59 @@@ size_t __init pcpu_setup_first_chunk(si BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + BUG_ON(!base_addr); + BUG_ON(unit_size < size_sum); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + + /* determine number of units and verify and initialize pcpu_unit_map */ + if (unit_map) { + int first_unit = INT_MAX, last_unit = INT_MIN; + + for_each_possible_cpu(cpu) { + int unit = unit_map[cpu]; + + BUG_ON(unit < 0); + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + /* the mapping should be one-to-one */ + BUG_ON(unit_map[tcpu] == unit); + } + + if (unit < first_unit) { + pcpu_first_unit_cpu = cpu; + first_unit = unit; + } + if (unit > last_unit) { + pcpu_last_unit_cpu = cpu; + last_unit = unit; + } + } + pcpu_nr_units = last_unit + 1; + pcpu_unit_map = unit_map; + } else { + int *identity_map; + + /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(num_possible_cpus() * ++ identity_map = alloc_bootmem(nr_cpu_ids * + sizeof(identity_map[0])); + for_each_possible_cpu(cpu) + identity_map[cpu] = cpu; + + pcpu_first_unit_cpu = 0; + pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = num_possible_cpus(); ++ pcpu_nr_units = nr_cpu_ids; + pcpu_unit_map = identity_map; + } + + /* determine basic parameters */ + pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); + pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@@ -1461,555 -1237,44 +1461,558 @@@ ssize_t __init pcpu_embed_first_chunk(s unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? 
dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; - - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - - chunk_size = pcpue_unit_size * nr_cpu_ids; - - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * num_possible_cpus(); ++ chunk_size = unit_size * nr_cpu_ids; + + base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); return -ENOMEM; } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + void *ptr = base + cpu * unit_size; - free_bootmem(__pa(ptr + size_sum), unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); ++ free_bootmem(__pa(ptr + size_sum), ++ unit_size - size_sum); + memcpy(ptr, __per_cpu_load, static_size); + } else - free_bootmem(__pa(ptr), pcpue_unit_size); ++ free_bootmem(__pa(ptr), unit_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + size_sum >> PAGE_SHIFT, base, static_size); + + return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, base, NULL); +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. 
+ */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct vm; + int unit_pages; + size_t pages_size; + struct page **pages; + unsigned int cpu; + int i, j; + ssize_t ret; + + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, + PCPU_MIN_UNIT_SIZE)); + + /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * - sizeof(pages[0])); ++ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages = alloc_bootmem(pages_size); + + /* allocate pages */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < unit_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + pages[j++] = virt_to_page(ptr); + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT; ++ vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT; + vm_area_register_early(&vm, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + unsigned long unit_addr = (unsigned long)vm.addr + + (cpu * unit_pages << PAGE_SHIFT); + + for (i = 0; i < unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + unit_pages); + if (ret < 0) + panic("failed to map percpu area, err=%zd\n", ret); + + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", + unit_pages, static_size); + + ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, + unit_pages << PAGE_SHIFT, vm.addr, NULL); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pages), pages_size); + return ret; +} + +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. 
+ */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. 
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + +struct pcpul_ent { + void *ptr; + void *map_addr; +}; + +static size_t pcpul_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; +static struct pcpul_ent *pcpul_map; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes + * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. 
+ * These double mapped areas can be detected using + * pcpu_lpage_remapped(). + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; + size_t map_size; + unsigned int cpu; + ssize_t ret; + int i, j, unit; + + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); + + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; + + /* allocate pointer array and alloc large pages */ + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); + pcpul_map = alloc_bootmem(map_size); + + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; + void *ptr; + + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + pcpul_map[i].ptr = ptr; + } + + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } + } + + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } + + for_each_possible_cpu(cpu) + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", vm.addr, static_size); + + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. 
+ */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; + + return ret; + +enomem: + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else + return pcpul_map[pos].map_addr + offset; + } + + return NULL; +} +#endif + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; } +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */