arch/powerpc/mm/numa.c

   1 /*
   2  * pSeries NUMA support
   3  *
   4  * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the License, or (at your option) any later version.
  10  */
  11 #include <linux/threads.h>
  12 #include <linux/bootmem.h>
  13 #include <linux/init.h>
  14 #include <linux/mm.h>
  15 #include <linux/mmzone.h>
  16 #include <linux/export.h>
  17 #include <linux/nodemask.h>
  18 #include <linux/cpu.h>
  19 #include <linux/notifier.h>
  20 #include <linux/memblock.h>
  21 #include <linux/of.h>
  22 #include <linux/pfn.h>
  23 #include <linux/cpuset.h>
  24 #include <linux/node.h>
  25 #include <asm/sparsemem.h>
  26 #include <asm/prom.h>
  27 #include <asm/smp.h>
  28 #include <asm/firmware.h>
  29 #include <asm/paca.h>
  30 #include <asm/hvcall.h>
  31 #include <asm/setup.h>
  32
  33 static int numa_enabled = 1;
  34
  35 static char *cmdline __initdata;
  36
  37 static int numa_debug;
  38 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
  39
/* Node id recorded for each cpu; written by map_cpu_to_node() */
int numa_cpu_lookup_table[NR_CPUS];
/* Per-node cpumask; entries are allocated by setup_node_to_cpumask_map() */
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
/* Per-node pglist_data pointers (presumably what NODE_DATA() resolves to -- confirm) */
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);
  47
/*
 * Index into an ibm,associativity array that yields the node id.  Assigned
 * from find_min_common_depth() by parse_numa_properties(); -1 means no
 * usable NUMA information.
 */
static int min_common_depth;
/* of_n_addr_cells()/of_n_size_cells() of a memory node, see get_n_mem_cells() */
static int n_mem_addr_cells, n_mem_size_cells;
/* Set to 1 by find_min_common_depth() when form 1 affinity is in use */
static int form1_affinity;

/* Number of reference-point levels stored per node in distance_lookup_table */
#define MAX_DISTANCE_REF_POINTS 4
/* Entry count of distance_ref_points, capped at MAX_DISTANCE_REF_POINTS */
static int distance_ref_points_depth;
/* Data of the "ibm,associativity-reference-points" property */
static const unsigned int *distance_ref_points;
/*
 * For each node, the associativity value found at each reference point.
 * Filled by initialize_distance_lookup_table(), compared by __node_distance().
 */
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  56
  57 /*
  58  * Allocate node_to_cpumask_map based on number of available nodes
  59  * Requires node_possible_map to be valid.
  60  *
  61  * Note: cpumask_of_node() is not valid until after this is done.
  62  */
  63 static void __init setup_node_to_cpumask_map(void)
  64 {
  65         unsigned int node, num = 0;
  66
  67         /* setup nr_node_ids if not done yet */
  68         if (nr_node_ids == MAX_NUMNODES) {
  69                 for_each_node_mask(node, node_possible_map)
  70                         num = node;
  71                 nr_node_ids = num + 1;
  72         }
  73
  74         /* allocate the map */
  75         for (node = 0; node < nr_node_ids; node++)
  76                 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  77
  78         /* cpumask_of_node() will now work */
  79         dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
  80 }
  81
  82 static int __init fake_numa_create_new_node(unsigned long end_pfn,
  83                                                 unsigned int *nid)
  84 {
  85         unsigned long long mem;
  86         char *p = cmdline;
  87         static unsigned int fake_nid;
  88         static unsigned long long curr_boundary;
  89
  90         /*
  91          * Modify node id, iff we started creating NUMA nodes
  92          * We want to continue from where we left of the last time
  93          */
  94         if (fake_nid)
  95                 *nid = fake_nid;
  96         /*
  97          * In case there are no more arguments to parse, the
  98          * node_id should be the same as the last fake node id
  99          * (we've handled this above).
 100          */
 101         if (!p)
 102                 return 0;
 103
 104         mem = memparse(p, &p);
 105         if (!mem)
 106                 return 0;
 107
 108         if (mem < curr_boundary)
 109                 return 0;
 110
 111         curr_boundary = mem;
 112
 113         if ((end_pfn << PAGE_SHIFT) > mem) {
 114                 /*
 115                  * Skip commas and spaces
 116                  */
 117                 while (*p == ',' || *p == ' ' || *p == '\t')
 118                         p++;
 119
 120                 cmdline = p;
 121                 fake_nid++;
 122                 *nid = fake_nid;
 123                 dbg("created new fake_node with id %d\n", fake_nid);
 124                 return 1;
 125         }
 126         return 0;
 127 }
 128
 129 /*
 130  * get_node_active_region - Return active region containing pfn
 131  * Active range returned is empty if none found.
 132  * @pfn: The page to return the region for
 133  * @node_ar: Returned set to the active region containing @pfn
 134  */
 135 static void __init get_node_active_region(unsigned long pfn,
 136                                           struct node_active_region *node_ar)
 137 {
 138         unsigned long start_pfn, end_pfn;
 139         int i, nid;
 140
 141         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
 142                 if (pfn >= start_pfn && pfn < end_pfn) {
 143                         node_ar->nid = nid;
 144                         node_ar->start_pfn = start_pfn;
 145                         node_ar->end_pfn = end_pfn;
 146                         break;
 147                 }
 148         }
 149 }
 150
 151 static void map_cpu_to_node(int cpu, int node)
 152 {
 153         numa_cpu_lookup_table[cpu] = node;
 154
 155         dbg("adding cpu %d to node %d\n", cpu, node);
 156
 157         if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
 158                 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
 159 }
 160
 161 #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
 162 static void unmap_cpu_from_node(unsigned long cpu)
 163 {
 164         int node = numa_cpu_lookup_table[cpu];
 165
 166         dbg("removing cpu %lu from node %d\n", cpu, node);
 167
 168         if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
 169                 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
 170         } else {
 171                 printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
 172                        cpu, node);
 173         }
 174 }
 175 #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
 176
 177 /* must hold reference to node during call */
 178 static const int *of_get_associativity(struct device_node *dev)
 179 {
 180         return of_get_property(dev, "ibm,associativity", NULL);
 181 }
 182
 183 /*
 184  * Returns the property linux,drconf-usable-memory if
 185  * it exists (the property exists only in kexec/kdump kernels,
 186  * added by kexec-tools)
 187  */
 188 static const u32 *of_get_usable_memory(struct device_node *memory)
 189 {
 190         const u32 *prop;
 191         u32 len;
 192         prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
 193         if (!prop || len < sizeof(unsigned int))
 194                 return 0;
 195         return prop;
 196 }
 197
 198 int __node_distance(int a, int b)
 199 {
 200         int i;
 201         int distance = LOCAL_DISTANCE;
 202
 203         if (!form1_affinity)
 204                 return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
 205
 206         for (i = 0; i < distance_ref_points_depth; i++) {
 207                 if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
 208                         break;
 209
 210                 /* Double the distance for each NUMA level */
 211                 distance *= 2;
 212         }
 213
 214         return distance;
 215 }
 216
 217 static void initialize_distance_lookup_table(int nid,
 218                 const unsigned int *associativity)
 219 {
 220         int i;
 221
 222         if (!form1_affinity)
 223                 return;
 224
 225         for (i = 0; i < distance_ref_points_depth; i++) {
 226                 distance_lookup_table[nid][i] =
 227                         associativity[distance_ref_points[i]];
 228         }
 229 }
 230
 231 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 232  * info is found.
 233  */
 234 static int associativity_to_nid(const unsigned int *associativity)
 235 {
 236         int nid = -1;
 237
 238         if (min_common_depth == -1)
 239                 goto out;
 240
 241         if (associativity[0] >= min_common_depth)
 242                 nid = associativity[min_common_depth];
 243
 244         /* POWER4 LPAR uses 0xffff as invalid node */
 245         if (nid == 0xffff || nid >= MAX_NUMNODES)
 246                 nid = -1;
 247
 248         if (nid > 0 && associativity[0] >= distance_ref_points_depth)
 249                 initialize_distance_lookup_table(nid, associativity);
 250
 251 out:
 252         return nid;
 253 }
 254
 255 /* Returns the nid associated with the given device tree node,
 256  * or -1 if not found.
 257  */
 258 static int of_node_to_nid_single(struct device_node *device)
 259 {
 260         int nid = -1;
 261         const unsigned int *tmp;
 262
 263         tmp = of_get_associativity(device);
 264         if (tmp)
 265                 nid = associativity_to_nid(tmp);
 266         return nid;
 267 }
 268
 269 /* Walk the device tree upwards, looking for an associativity id */
 270 int of_node_to_nid(struct device_node *device)
 271 {
 272         struct device_node *tmp;
 273         int nid = -1;
 274
 275         of_node_get(device);
 276         while (device) {
 277                 nid = of_node_to_nid_single(device);
 278                 if (nid != -1)
 279                         break;
 280
 281                 tmp = device;
 282                 device = of_get_parent(tmp);
 283                 of_node_put(tmp);
 284         }
 285         of_node_put(device);
 286
 287         return nid;
 288 }
 289 EXPORT_SYMBOL_GPL(of_node_to_nid);
 290
 291 static int __init find_min_common_depth(void)
 292 {
 293         int depth;
 294         struct device_node *chosen;
 295         struct device_node *root;
 296         const char *vec5;
 297
 298         if (firmware_has_feature(FW_FEATURE_OPAL))
 299                 root = of_find_node_by_path("/ibm,opal");
 300         else
 301                 root = of_find_node_by_path("/rtas");
 302         if (!root)
 303                 root = of_find_node_by_path("/");
 304
 305         /*
 306          * This property is a set of 32-bit integers, each representing
 307          * an index into the ibm,associativity nodes.
 308          *
 309          * With form 0 affinity the first integer is for an SMP configuration
 310          * (should be all 0's) and the second is for a normal NUMA
 311          * configuration. We have only one level of NUMA.
 312          *
 313          * With form 1 affinity the first integer is the most significant
 314          * NUMA boundary and the following are progressively less significant
 315          * boundaries. There can be more than one level of NUMA.
 316          */
 317         distance_ref_points = of_get_property(root,
 318                                         "ibm,associativity-reference-points",
 319                                         &distance_ref_points_depth);
 320
 321         if (!distance_ref_points) {
 322                 dbg("NUMA: ibm,associativity-reference-points not found.\n");
 323                 goto err;
 324         }
 325
 326         distance_ref_points_depth /= sizeof(int);
 327
 328 #define VEC5_AFFINITY_BYTE      5
 329 #define VEC5_AFFINITY           0x80
 330
 331         if (firmware_has_feature(FW_FEATURE_OPAL))
 332                 form1_affinity = 1;
 333         else {
 334                 chosen = of_find_node_by_path("/chosen");
 335                 if (chosen) {
 336                         vec5 = of_get_property(chosen,
 337                                                "ibm,architecture-vec-5", NULL);
 338                         if (vec5 && (vec5[VEC5_AFFINITY_BYTE] &
 339                                                         VEC5_AFFINITY)) {
 340                                 dbg("Using form 1 affinity\n");
 341                                 form1_affinity = 1;
 342                         }
 343
 344                         of_node_put(chosen);
 345                 }
 346         }
 347
 348         if (form1_affinity) {
 349                 depth = distance_ref_points[0];
 350         } else {
 351                 if (distance_ref_points_depth < 2) {
 352                         printk(KERN_WARNING "NUMA: "
 353                                 "short ibm,associativity-reference-points\n");
 354                         goto err;
 355                 }
 356
 357                 depth = distance_ref_points[1];
 358         }
 359
 360         /*
 361          * Warn and cap if the hardware supports more than
 362          * MAX_DISTANCE_REF_POINTS domains.
 363          */
 364         if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
 365                 printk(KERN_WARNING "NUMA: distance array capped at "
 366                         "%d entries\n", MAX_DISTANCE_REF_POINTS);
 367                 distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
 368         }
 369
 370         of_node_put(root);
 371         return depth;
 372
 373 err:
 374         of_node_put(root);
 375         return -1;
 376 }
 377
 378 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
 379 {
 380         struct device_node *memory = NULL;
 381
 382         memory = of_find_node_by_type(memory, "memory");
 383         if (!memory)
 384                 panic("numa.c: No memory nodes found!");
 385
 386         *n_addr_cells = of_n_addr_cells(memory);
 387         *n_size_cells = of_n_size_cells(memory);
 388         of_node_put(memory);
 389 }
 390
 391 static unsigned long read_n_cells(int n, const unsigned int **buf)
 392 {
 393         unsigned long result = 0;
 394
 395         while (n--) {
 396                 result = (result << 32) | **buf;
 397                 (*buf)++;
 398         }
 399         return result;
 400 }
 401
 402 /*
 403  * Read the next memblock list entry from the ibm,dynamic-memory property
 404  * and return the information in the provided of_drconf_cell structure.
 405  */
 406 static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
 407 {
 408         const u32 *cp;
 409
 410         drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
 411
 412         cp = *cellp;
 413         drmem->drc_index = cp[0];
 414         drmem->reserved = cp[1];
 415         drmem->aa_index = cp[2];
 416         drmem->flags = cp[3];
 417
 418         *cellp = cp + 4;
 419 }
 420
 421 /*
 422  * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 423  *
 424  * The layout of the ibm,dynamic-memory property is a number N of memblock
 425  * list entries followed by N memblock list entries.  Each memblock list entry
 426  * contains information as laid out in the of_drconf_cell struct above.
 427  */
 428 static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
 429 {
 430         const u32 *prop;
 431         u32 len, entries;
 432
 433         prop = of_get_property(memory, "ibm,dynamic-memory", &len);
 434         if (!prop || len < sizeof(unsigned int))
 435                 return 0;
 436
 437         entries = *prop++;
 438
 439         /* Now that we know the number of entries, revalidate the size
 440          * of the property read in to ensure we have everything
 441          */
 442         if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
 443                 return 0;
 444
 445         *dm = prop;
 446         return entries;
 447 }
 448
 449 /*
 450  * Retrieve and validate the ibm,lmb-size property for drconf memory
 451  * from the device tree.
 452  */
 453 static u64 of_get_lmb_size(struct device_node *memory)
 454 {
 455         const u32 *prop;
 456         u32 len;
 457
 458         prop = of_get_property(memory, "ibm,lmb-size", &len);
 459         if (!prop || len < sizeof(unsigned int))
 460                 return 0;
 461
 462         return read_n_cells(n_mem_size_cells, &prop);
 463 }
 464
/*
 * Parsed header of the ibm,associativity-lookup-arrays property, filled in
 * by of_get_assoc_arrays().
 */
struct assoc_arrays {
        u32     n_arrays;       /* number of associativity arrays */
        u32     array_sz;       /* number of cells in each array */
        const u32 *arrays;      /* first cell after the two header cells */
};
 470
 471 /*
 472  * Retrieve and validate the list of associativity arrays for drconf
 473  * memory from the ibm,associativity-lookup-arrays property of the
 474  * device tree..
 475  *
 476  * The layout of the ibm,associativity-lookup-arrays property is a number N
 477  * indicating the number of associativity arrays, followed by a number M
 478  * indicating the size of each associativity array, followed by a list
 479  * of N associativity arrays.
 480  */
 481 static int of_get_assoc_arrays(struct device_node *memory,
 482                                struct assoc_arrays *aa)
 483 {
 484         const u32 *prop;
 485         u32 len;
 486
 487         prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
 488         if (!prop || len < 2 * sizeof(unsigned int))
 489                 return -1;
 490
 491         aa->n_arrays = *prop++;
 492         aa->array_sz = *prop++;
 493
 494         /* Now that we know the number of arrays and size of each array,
 495          * revalidate the size of the property read in.
 496          */
 497         if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
 498                 return -1;
 499
 500         aa->arrays = prop;
 501         return 0;
 502 }
 503
 504 /*
 505  * This is like of_node_to_nid_single() for memory represented in the
 506  * ibm,dynamic-reconfiguration-memory node.
 507  */
 508 static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
 509                                    struct assoc_arrays *aa)
 510 {
 511         int default_nid = 0;
 512         int nid = default_nid;
 513         int index;
 514
 515         if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
 516             !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
 517             drmem->aa_index < aa->n_arrays) {
 518                 index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
 519                 nid = aa->arrays[index];
 520
 521                 if (nid == 0xffff || nid >= MAX_NUMNODES)
 522                         nid = default_nid;
 523         }
 524
 525         return nid;
 526 }
 527
 528 /*
 529  * Figure out to which domain a cpu belongs and stick it there.
 530  * Return the id of the domain used.
 531  */
 532 static int __cpuinit numa_setup_cpu(unsigned long lcpu)
 533 {
 534         int nid = 0;
 535         struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
 536
 537         if (!cpu) {
 538                 WARN_ON(1);
 539                 goto out;
 540         }
 541
 542         nid = of_node_to_nid_single(cpu);
 543
 544         if (nid < 0 || !node_online(nid))
 545                 nid = first_online_node;
 546 out:
 547         map_cpu_to_node(lcpu, nid);
 548
 549         of_node_put(cpu);
 550
 551         return nid;
 552 }
 553
 554 static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
 555                              unsigned long action,
 556                              void *hcpu)
 557 {
 558         unsigned long lcpu = (unsigned long)hcpu;
 559         int ret = NOTIFY_DONE;
 560
 561         switch (action) {
 562         case CPU_UP_PREPARE:
 563         case CPU_UP_PREPARE_FROZEN:
 564                 numa_setup_cpu(lcpu);
 565                 ret = NOTIFY_OK;
 566                 break;
 567 #ifdef CONFIG_HOTPLUG_CPU
 568         case CPU_DEAD:
 569         case CPU_DEAD_FROZEN:
 570         case CPU_UP_CANCELED:
 571         case CPU_UP_CANCELED_FROZEN:
 572                 unmap_cpu_from_node(lcpu);
 573                 break;
 574                 ret = NOTIFY_OK;
 575 #endif
 576         }
 577         return ret;
 578 }
 579
 580 /*
 581  * Check and possibly modify a memory region to enforce the memory limit.
 582  *
 583  * Returns the size the region should have to enforce the memory limit.
 584  * This will either be the original value of size, a truncated value,
 585  * or zero. If the returned value of size is 0 the region should be
 586  * discarded as it lies wholly above the memory limit.
 587  */
 588 static unsigned long __init numa_enforce_memory_limit(unsigned long start,
 589                                                       unsigned long size)
 590 {
 591         /*
 592          * We use memblock_end_of_DRAM() in here instead of memory_limit because
 593          * we've already adjusted it for the limit and it takes care of
 594          * having memory holes below the limit.  Also, in the case of
 595          * iommu_is_off, memory_limit is not set but is implicitly enforced.
 596          */
 597
 598         if (start + size <= memblock_end_of_DRAM())
 599                 return size;
 600
 601         if (start >= memblock_end_of_DRAM())
 602                 return 0;
 603
 604         return memblock_end_of_DRAM() - start;
 605 }
 606
 607 /*
 608  * Reads the counter for a given entry in
 609  * linux,drconf-usable-memory property
 610  */
 611 static inline int __init read_usm_ranges(const u32 **usm)
 612 {
 613         /*
 614          * For each lmb in ibm,dynamic-memory a corresponding
 615          * entry in linux,drconf-usable-memory property contains
 616          * a counter followed by that many (base, size) duple.
 617          * read the counter from linux,drconf-usable-memory
 618          */
 619         return read_n_cells(n_mem_size_cells, usm);
 620 }
 621
/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 *
 * Every LMB that is assigned and not reserved is given a node id, that
 * node is marked online, and the LMB's range (after enforcing the memory
 * limit) is handed to memblock_set_node().  When the
 * linux,drconf-usable-memory property is present (kexec/kdump), the
 * ranges listed there are used instead of the whole LMB.
 *
 * Returns silently if any of the required properties is missing or fails
 * validation.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
        const u32 *uninitialized_var(dm), *usm;
        unsigned int n, rc, ranges, is_kexec_kdump = 0;
        unsigned long lmb_size, base, size, sz;
        int nid;
        struct assoc_arrays aa = { .arrays = NULL };

        /* n is the number of LMB entries, dm points at the first one */
        n = of_get_drconf_memory(memory, &dm);
        if (!n)
                return;

        lmb_size = of_get_lmb_size(memory);
        if (!lmb_size)
                return;

        /* non-zero (the -1 error return) means no usable lookup arrays */
        rc = of_get_assoc_arrays(memory, &aa);
        if (rc)
                return;

        /* check if this is a kexec/kdump kernel */
        usm = of_get_usable_memory(memory);
        if (usm != NULL)
                is_kexec_kdump = 1;

        for (; n != 0; --n) {
                struct of_drconf_cell drmem;

                /* decodes one entry and advances dm to the next */
                read_drconf_cell(&drmem, &dm);

                /* skip this block if the reserved bit is set in flags (0x80)
                   or if the block is not assigned to this partition (0x8) */
                if ((drmem.flags & DRCONF_MEM_RESERVED)
                    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
                        continue;

                /* default: a single range covering the whole LMB */
                base = drmem.base_addr;
                size = lmb_size;
                ranges = 1;

                if (is_kexec_kdump) {
                        /* counter for this LMB; advances usm past it */
                        ranges = read_usm_ranges(&usm);
                        if (!ranges) /* there are no (base, size) duple */
                                continue;
                }
                do {
                        if (is_kexec_kdump) {
                                /* order matters: base cells precede size cells */
                                base = read_n_cells(n_mem_addr_cells, &usm);
                                size = read_n_cells(n_mem_size_cells, &usm);
                        }
                        nid = of_drconf_to_nid_single(&drmem, &aa);
                        /* may replace nid with a fake node id */
                        fake_numa_create_new_node(
                                ((base + size) >> PAGE_SHIFT),
                                           &nid);
                        node_set_online(nid);
                        /* sz == 0: range lies wholly above the memory limit */
                        sz = numa_enforce_memory_limit(base, size);
                        if (sz)
                                memblock_set_node(base, sz, nid);
                } while (--ranges);
        }
}
 687
 688 static int __init parse_numa_properties(void)
 689 {
 690         struct device_node *memory;
 691         int default_nid = 0;
 692         unsigned long i;
 693
 694         if (numa_enabled == 0) {
 695                 printk(KERN_WARNING "NUMA disabled by user\n");
 696                 return -1;
 697         }
 698
 699         min_common_depth = find_min_common_depth();
 700
 701         if (min_common_depth < 0)
 702                 return min_common_depth;
 703
 704         dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
 705
 706         /*
 707          * Even though we connect cpus to numa domains later in SMP
 708          * init, we need to know the node ids now. This is because
 709          * each node to be onlined must have NODE_DATA etc backing it.
 710          */
 711         for_each_present_cpu(i) {
 712                 struct device_node *cpu;
 713                 int nid;
 714
 715                 cpu = of_get_cpu_node(i, NULL);
 716                 BUG_ON(!cpu);
 717                 nid = of_node_to_nid_single(cpu);
 718                 of_node_put(cpu);
 719
 720                 /*
 721                  * Don't fall back to default_nid yet -- we will plug
 722                  * cpus into nodes once the memory scan has discovered
 723                  * the topology.
 724                  */
 725                 if (nid < 0)
 726                         continue;
 727                 node_set_online(nid);
 728         }
 729
 730         get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
 731
 732         for_each_node_by_type(memory, "memory") {
 733                 unsigned long start;
 734                 unsigned long size;
 735                 int nid;
 736                 int ranges;
 737                 const unsigned int *memcell_buf;
 738                 unsigned int len;
 739
 740                 memcell_buf = of_get_property(memory,
 741                         "linux,usable-memory", &len);
 742                 if (!memcell_buf || len <= 0)
 743                         memcell_buf = of_get_property(memory, "reg", &len);
 744                 if (!memcell_buf || len <= 0)
 745                         continue;
 746
 747                 /* ranges in cell */
 748                 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
 749 new_range:
 750                 /* these are order-sensitive, and modify the buffer pointer */
 751                 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
 752                 size = read_n_cells(n_mem_size_cells, &memcell_buf);
 753
 754                 /*
 755                  * Assumption: either all memory nodes or none will
 756                  * have associativity properties.  If none, then
 757                  * everything goes to default_nid.
 758                  */
 759                 nid = of_node_to_nid_single(memory);
 760                 if (nid < 0)
 761                         nid = default_nid;
 762
 763                 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
 764                 node_set_online(nid);
 765
 766                 if (!(size = numa_enforce_memory_limit(start, size))) {
 767                         if (--ranges)
 768                                 goto new_range;
 769                         else
 770                                 continue;
 771                 }
 772
 773                 memblock_set_node(start, size, nid);
 774
 775                 if (--ranges)
 776                         goto new_range;
 777         }
 778
 779         /*
 780          * Now do the same thing for each MEMBLOCK listed in the
 781          * ibm,dynamic-memory property in the
 782          * ibm,dynamic-reconfiguration-memory node.
 783          */
 784         memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
 785         if (memory)
 786                 parse_drconf_memory(memory);
 787
 788         return 0;
 789 }
 790
 791 static void __init setup_nonnuma(void)
 792 {
 793         unsigned long top_of_ram = memblock_end_of_DRAM();
 794         unsigned long total_ram = memblock_phys_mem_size();
 795         unsigned long start_pfn, end_pfn;
 796         unsigned int nid = 0;
 797         struct memblock_region *reg;
 798
 799         printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 800                top_of_ram, total_ram);
 801         printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 802                (top_of_ram - total_ram) >> 20);
 803
 804         for_each_memblock(memory, reg) {
 805                 start_pfn = memblock_region_memory_base_pfn(reg);
 806                 end_pfn = memblock_region_memory_end_pfn(reg);
 807
 808                 fake_numa_create_new_node(end_pfn, &nid);
 809                 memblock_set_node(PFN_PHYS(start_pfn),
 810                                   PFN_PHYS(end_pfn - start_pfn), nid);
 811                 node_set_online(nid);
 812         }
 813 }
 814
 815 void __init dump_numa_cpu_topology(void)
 816 {
 817         unsigned int node;
 818         unsigned int cpu, count;
 819
 820         if (min_common_depth == -1 || !numa_enabled)
 821                 return;
 822
 823         for_each_online_node(node) {
 824                 printk(KERN_DEBUG "Node %d CPUs:", node);
 825
 826                 count = 0;
 827                 /*
 828                  * If we used a CPU iterator here we would miss printing
 829                  * the holes in the cpumap.
 830                  */
 831                 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
 832                         if (cpumask_test_cpu(cpu,
 833                                         node_to_cpumask_map[node])) {
 834                                 if (count == 0)
 835                                         printk(" %u", cpu);
 836                                 ++count;
 837                         } else {
 838                                 if (count > 1)
 839                                         printk("-%u", cpu - 1);
 840                                 count = 0;
 841                         }
 842                 }
 843
 844                 if (count > 1)
 845                         printk("-%u", nr_cpu_ids - 1);
 846                 printk("\n");
 847         }
 848 }
 849
 850 static void __init dump_numa_memory_topology(void)
 851 {
 852         unsigned int node;
 853         unsigned int count;
 854
 855         if (min_common_depth == -1 || !numa_enabled)
 856                 return;
 857
 858         for_each_online_node(node) {
 859                 unsigned long i;
 860
 861                 printk(KERN_DEBUG "Node %d Memory:", node);
 862
 863                 count = 0;
 864
 865                 for (i = 0; i < memblock_end_of_DRAM();
 866                      i += (1 << SECTION_SIZE_BITS)) {
 867                         if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
 868                                 if (count == 0)
 869                                         printk(" 0x%lx", i);
 870                                 ++count;
 871                         } else {
 872                                 if (count > 0)
 873                                         printk("-0x%lx", i);
 874                                 count = 0;
 875                         }
 876                 }
 877
 878                 if (count > 0)
 879                         printk("-0x%lx", i);
 880                 printk("\n");
 881         }
 882 }
 883
 884 /*
 885  * Allocate some memory, satisfying the memblock or bootmem allocator where
 886  * required. nid is the preferred node and end is the physical address of
 887  * the highest address in the node.
 888  *
 889  * Returns the virtual address of the memory.
 890  */
 891 static void __init *careful_zallocation(int nid, unsigned long size,
 892                                        unsigned long align,
 893                                        unsigned long end_pfn)
 894 {
 895         void *ret;
 896         int new_nid;
 897         unsigned long ret_paddr;
 898
 899         ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);
 900
 901         /* retry over all memory */
 902         if (!ret_paddr)
 903                 ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());
 904
 905         if (!ret_paddr)
 906                 panic("numa.c: cannot allocate %lu bytes for node %d",
 907                       size, nid);
 908
 909         ret = __va(ret_paddr);
 910
 911         /*
 912          * We initialize the nodes in numeric order: 0, 1, 2...
 913          * and hand over control from the MEMBLOCK allocator to the
 914          * bootmem allocator.  If this function is called for
 915          * node 5, then we know that all nodes <5 are using the
 916          * bootmem allocator instead of the MEMBLOCK allocator.
 917          *
 918          * So, check the nid from which this allocation came
 919          * and double check to see if we need to use bootmem
 920          * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
 921          * since it would be useless.
 922          */
 923         new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
 924         if (new_nid < nid) {
 925                 ret = __alloc_bootmem_node(NODE_DATA(new_nid),
 926                                 size, align, 0);
 927
 928                 dbg("alloc_bootmem %p %lx\n", ret, size);
 929         }
 930
 931         memset(ret, 0, size);
 932         return ret;
 933 }
 934
 935 static struct notifier_block __cpuinitdata ppc64_numa_nb = {
 936         .notifier_call = cpu_numa_callback,
 937         .priority = 1 /* Must run before sched domains notifier. */
 938 };
 939
 940 static void __init mark_reserved_regions_for_nid(int nid)
 941 {
 942         struct pglist_data *node = NODE_DATA(nid);
 943         struct memblock_region *reg;
 944
 945         for_each_memblock(reserved, reg) {
 946                 unsigned long physbase = reg->base;
 947                 unsigned long size = reg->size;
 948                 unsigned long start_pfn = physbase >> PAGE_SHIFT;
 949                 unsigned long end_pfn = PFN_UP(physbase + size);
 950                 struct node_active_region node_ar;
 951                 unsigned long node_end_pfn = node->node_start_pfn +
 952                                              node->node_spanned_pages;
 953
 954                 /*
 955                  * Check to make sure that this memblock.reserved area is
 956                  * within the bounds of the node that we care about.
 957                  * Checking the nid of the start and end points is not
 958                  * sufficient because the reserved area could span the
 959                  * entire node.
 960                  */
 961                 if (end_pfn <= node->node_start_pfn ||
 962                     start_pfn >= node_end_pfn)
 963                         continue;
 964
 965                 get_node_active_region(start_pfn, &node_ar);
 966                 while (start_pfn < end_pfn &&
 967                         node_ar.start_pfn < node_ar.end_pfn) {
 968                         unsigned long reserve_size = size;
 969                         /*
 970                          * if reserved region extends past active region
 971                          * then trim size to active region
 972                          */
 973                         if (end_pfn > node_ar.end_pfn)
 974                                 reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
 975                                         - physbase;
 976                         /*
 977                          * Only worry about *this* node, others may not
 978                          * yet have valid NODE_DATA().
 979                          */
 980                         if (node_ar.nid == nid) {
 981                                 dbg("reserve_bootmem %lx %lx nid=%d\n",
 982                                         physbase, reserve_size, node_ar.nid);
 983                                 reserve_bootmem_node(NODE_DATA(node_ar.nid),
 984                                                 physbase, reserve_size,
 985                                                 BOOTMEM_DEFAULT);
 986                         }
 987                         /*
 988                          * if reserved region is contained in the active region
 989                          * then done.
 990                          */
 991                         if (end_pfn <= node_ar.end_pfn)
 992                                 break;
 993
 994                         /*
 995                          * reserved region extends past the active region
 996                          *   get next active region that contains this
 997                          *   reserved region
 998                          */
 999                         start_pfn = node_ar.end_pfn;
1000                         physbase = start_pfn << PAGE_SHIFT;
1001                         size = size - reserve_size;
1002                         get_node_active_region(start_pfn, &node_ar);
1003                 }
1004         }
1005 }
1006
1007
1008 void __init do_init_bootmem(void)
1009 {
1010         int nid;
1011
1012         min_low_pfn = 0;
1013         max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
1014         max_pfn = max_low_pfn;
1015
1016         if (parse_numa_properties())
1017                 setup_nonnuma();
1018         else
1019                 dump_numa_memory_topology();
1020
1021         for_each_online_node(nid) {
1022                 unsigned long start_pfn, end_pfn;
1023                 void *bootmem_vaddr;
1024                 unsigned long bootmap_pages;
1025
1026                 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
1027
1028                 /*
1029                  * Allocate the node structure node local if possible
1030                  *
1031                  * Be careful moving this around, as it relies on all
1032                  * previous nodes' bootmem to be initialized and have
1033                  * all reserved areas marked.
1034                  */
1035                 NODE_DATA(nid) = careful_zallocation(nid,
1036                                         sizeof(struct pglist_data),
1037                                         SMP_CACHE_BYTES, end_pfn);
1038
1039                 dbg("node %d\n", nid);
1040                 dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
1041
1042                 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
1043                 NODE_DATA(nid)->node_start_pfn = start_pfn;
1044                 NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
1045
1046                 if (NODE_DATA(nid)->node_spanned_pages == 0)
1047                         continue;
1048
1049                 dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
1050                 dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);
1051
1052                 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
1053                 bootmem_vaddr = careful_zallocation(nid,
1054                                         bootmap_pages << PAGE_SHIFT,
1055                                         PAGE_SIZE, end_pfn);
1056
1057                 dbg("bootmap_vaddr = %p\n", bootmem_vaddr);
1058
1059                 init_bootmem_node(NODE_DATA(nid),
1060                                   __pa(bootmem_vaddr) >> PAGE_SHIFT,
1061                                   start_pfn, end_pfn);
1062
1063                 free_bootmem_with_active_regions(nid, end_pfn);
1064                 /*
1065                  * Be very careful about moving this around.  Future
1066                  * calls to careful_zallocation() depend on this getting
1067                  * done correctly.
1068                  */
1069                 mark_reserved_regions_for_nid(nid);
1070                 sparse_memory_present_with_active_regions(nid);
1071         }
1072
1073         init_bootmem_done = 1;
1074
1075         /*
1076          * Now bootmem is initialised we can create the node to cpumask
1077          * lookup tables and setup the cpu callback to populate them.
1078          */
1079         setup_node_to_cpumask_map();
1080
1081         register_cpu_notifier(&ppc64_numa_nb);
1082         cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
1083                           (void *)(unsigned long)boot_cpuid);
1084 }
1085
1086 void __init paging_init(void)
1087 {
1088         unsigned long max_zone_pfns[MAX_NR_ZONES];
1089         memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
1090         max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
1091         free_area_init_nodes(max_zone_pfns);
1092 }
1093
1094 static int __init early_numa(char *p)
1095 {
1096         if (!p)
1097                 return 0;
1098
1099         if (strstr(p, "off"))
1100                 numa_enabled = 0;
1101
1102         if (strstr(p, "debug"))
1103                 numa_debug = 1;
1104
1105         p = strstr(p, "fake=");
1106         if (p)
1107                 cmdline = p + strlen("fake=");
1108
1109         return 0;
1110 }
1111 early_param("numa", early_numa);
1112
1113 #ifdef CONFIG_MEMORY_HOTPLUG
1114 /*
1115  * Find the node associated with a hot added memory section for
1116  * memory represented in the device tree by the property
1117  * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
1118  */
1119 static int hot_add_drconf_scn_to_nid(struct device_node *memory,
1120                                      unsigned long scn_addr)
1121 {
1122         const u32 *dm;
1123         unsigned int drconf_cell_cnt, rc;
1124         unsigned long lmb_size;
1125         struct assoc_arrays aa;
1126         int nid = -1;
1127
1128         drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1129         if (!drconf_cell_cnt)
1130                 return -1;
1131
1132         lmb_size = of_get_lmb_size(memory);
1133         if (!lmb_size)
1134                 return -1;
1135
1136         rc = of_get_assoc_arrays(memory, &aa);
1137         if (rc)
1138                 return -1;
1139
1140         for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
1141                 struct of_drconf_cell drmem;
1142
1143                 read_drconf_cell(&drmem, &dm);
1144
1145                 /* skip this block if it is reserved or not assigned to
1146                  * this partition */
1147                 if ((drmem.flags & DRCONF_MEM_RESERVED)
1148                     || !(drmem.flags & DRCONF_MEM_ASSIGNED))
1149                         continue;
1150
1151                 if ((scn_addr < drmem.base_addr)
1152                     || (scn_addr >= (drmem.base_addr + lmb_size)))
1153                         continue;
1154
1155                 nid = of_drconf_to_nid_single(&drmem, &aa);
1156                 break;
1157         }
1158
1159         return nid;
1160 }
1161
1162 /*
1163  * Find the node associated with a hot added memory section for memory
1164  * represented in the device tree as a node (i.e. memory@XXXX) for
1165  * each memblock.
1166  */
1167 int hot_add_node_scn_to_nid(unsigned long scn_addr)
1168 {
1169         struct device_node *memory;
1170         int nid = -1;
1171
1172         for_each_node_by_type(memory, "memory") {
1173                 unsigned long start, size;
1174                 int ranges;
1175                 const unsigned int *memcell_buf;
1176                 unsigned int len;
1177
1178                 memcell_buf = of_get_property(memory, "reg", &len);
1179                 if (!memcell_buf || len <= 0)
1180                         continue;
1181
1182                 /* ranges in cell */
1183                 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
1184
1185                 while (ranges--) {
1186                         start = read_n_cells(n_mem_addr_cells, &memcell_buf);
1187                         size = read_n_cells(n_mem_size_cells, &memcell_buf);
1188
1189                         if ((scn_addr < start) || (scn_addr >= (start + size)))
1190                                 continue;
1191
1192                         nid = of_node_to_nid_single(memory);
1193                         break;
1194                 }
1195
1196                 if (nid >= 0)
1197                         break;
1198         }
1199
1200         of_node_put(memory);
1201
1202         return nid;
1203 }
1204
1205 /*
1206  * Find the node associated with a hot added memory section.  Section
1207  * corresponds to a SPARSEMEM section, not an MEMBLOCK.  It is assumed that
1208  * sections are fully contained within a single MEMBLOCK.
1209  */
1210 int hot_add_scn_to_nid(unsigned long scn_addr)
1211 {
1212         struct device_node *memory = NULL;
1213         int nid, found = 0;
1214
1215         if (!numa_enabled || (min_common_depth < 0))
1216                 return first_online_node;
1217
1218         memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1219         if (memory) {
1220                 nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
1221                 of_node_put(memory);
1222         } else {
1223                 nid = hot_add_node_scn_to_nid(scn_addr);
1224         }
1225
1226         if (nid < 0 || !node_online(nid))
1227                 nid = first_online_node;
1228
1229         if (NODE_DATA(nid)->node_spanned_pages)
1230                 return nid;
1231
1232         for_each_online_node(nid) {
1233                 if (NODE_DATA(nid)->node_spanned_pages) {
1234                         found = 1;
1235                         break;
1236                 }
1237         }
1238
1239         BUG_ON(!found);
1240         return nid;
1241 }
1242
1243 static u64 hot_add_drconf_memory_max(void)
1244 {
1245         struct device_node *memory = NULL;
1246         unsigned int drconf_cell_cnt = 0;
1247         u64 lmb_size = 0;
1248         const u32 *dm = 0;
1249
1250         memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
1251         if (memory) {
1252                 drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
1253                 lmb_size = of_get_lmb_size(memory);
1254                 of_node_put(memory);
1255         }
1256         return lmb_size * drconf_cell_cnt;
1257 }
1258
1259 /*
1260  * memory_hotplug_max - return max address of memory that may be added
1261  *
1262  * This is currently only used on systems that support drconfig memory
1263  * hotplug.
1264  */
1265 u64 memory_hotplug_max(void)
1266 {
1267         return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
1268 }
1269 #endif /* CONFIG_MEMORY_HOTPLUG */
1270
1271 /* Virtual Processor Home Node (VPHN) support */
1272 #ifdef CONFIG_PPC_SPLPAR
/* Per-cpu copy of the hypervisor's associativity change counters. */
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
/* cpus whose counters differed from the stored copy at the last check. */
static cpumask_t cpu_associativity_changes_mask;
/* Non-zero while polling is on; the timer callback bails out when it is 0. */
static int vphn_enabled;
/* Defined further down; needed earlier by the timer callback. */
static void set_topology_timer(void);
1277
1278 /*
1279  * Store the current values of the associativity change counters in the
1280  * hypervisor.
1281  */
1282 static void setup_cpu_associativity_change_counters(void)
1283 {
1284         int cpu;
1285
1286         /* The VPHN feature supports a maximum of 8 reference points */
1287         BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);
1288
1289         for_each_possible_cpu(cpu) {
1290                 int i;
1291                 u8 *counts = vphn_cpu_change_counts[cpu];
1292                 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1293
1294                 for (i = 0; i < distance_ref_points_depth; i++)
1295                         counts[i] = hypervisor_counts[i];
1296         }
1297 }
1298
1299 /*
1300  * The hypervisor maintains a set of 8 associativity change counters in
1301  * the VPA of each cpu that correspond to the associativity levels in the
1302  * ibm,associativity-reference-points property. When an associativity
1303  * level changes, the corresponding counter is incremented.
1304  *
1305  * Set a bit in cpu_associativity_changes_mask for each cpu whose home
1306  * node associativity levels have changed.
1307  *
1308  * Returns the number of cpus with unhandled associativity changes.
1309  */
1310 static int update_cpu_associativity_changes_mask(void)
1311 {
1312         int cpu, nr_cpus = 0;
1313         cpumask_t *changes = &cpu_associativity_changes_mask;
1314
1315         cpumask_clear(changes);
1316
1317         for_each_possible_cpu(cpu) {
1318                 int i, changed = 0;
1319                 u8 *counts = vphn_cpu_change_counts[cpu];
1320                 volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;
1321
1322                 for (i = 0; i < distance_ref_points_depth; i++) {
1323                         if (hypervisor_counts[i] != counts[i]) {
1324                                 counts[i] = hypervisor_counts[i];
1325                                 changed = 1;
1326                         }
1327                 }
1328                 if (changed) {
1329                         cpumask_set_cpu(cpu, changes);
1330                         nr_cpus++;
1331                 }
1332         }
1333
1334         return nr_cpus;
1335 }
1336
/*
 * Number of 32-bit cells in an unpacked associativity array: the 6 64-bit
 * registers unpack into 12 32-bit associativity values, plus one leading
 * cell holding the length so the result has the shape of the property.
 */
#define VPHN_ASSOC_BUFSIZE ((6 * sizeof(u64)) / sizeof(u32) + 1)
1342
1343 /*
1344  * Convert the associativity domain numbers returned from the hypervisor
1345  * to the sequence they would appear in the ibm,associativity property.
1346  */
1347 static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
1348 {
1349         int i, nr_assoc_doms = 0;
1350         const u16 *field = (const u16*) packed;
1351
1352 #define VPHN_FIELD_UNUSED       (0xffff)
1353 #define VPHN_FIELD_MSB          (0x8000)
1354 #define VPHN_FIELD_MASK         (~VPHN_FIELD_MSB)
1355
1356         for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
1357                 if (*field == VPHN_FIELD_UNUSED) {
1358                         /* All significant fields processed, and remaining
1359                          * fields contain the reserved value of all 1's.
1360                          * Just store them.
1361                          */
1362                         unpacked[i] = *((u32*)field);
1363                         field += 2;
1364                 } else if (*field & VPHN_FIELD_MSB) {
1365                         /* Data is in the lower 15 bits of this field */
1366                         unpacked[i] = *field & VPHN_FIELD_MASK;
1367                         field++;
1368                         nr_assoc_doms++;
1369                 } else {
1370                         /* Data is in the lower 15 bits of this field
1371                          * concatenated with the next 16 bit field
1372                          */
1373                         unpacked[i] = *((u32*)field);
1374                         field += 2;
1375                         nr_assoc_doms++;
1376                 }
1377         }
1378
1379         /* The first cell contains the length of the property */
1380         unpacked[0] = nr_assoc_doms;
1381
1382         return nr_assoc_doms;
1383 }
1384
1385 /*
1386  * Retrieve the new associativity information for a virtual processor's
1387  * home node.
1388  */
1389 static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
1390 {
1391         long rc;
1392         long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
1393         u64 flags = 1;
1394         int hwcpu = get_hard_smp_processor_id(cpu);
1395
1396         rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
1397         vphn_unpack_associativity(retbuf, associativity);
1398
1399         return rc;
1400 }
1401
1402 static long vphn_get_associativity(unsigned long cpu,
1403                                         unsigned int *associativity)
1404 {
1405         long rc;
1406
1407         rc = hcall_vphn(cpu, associativity);
1408
1409         switch (rc) {
1410         case H_FUNCTION:
1411                 printk(KERN_INFO
1412                         "VPHN is not supported. Disabling polling...\n");
1413                 stop_topology_update();
1414                 break;
1415         case H_HARDWARE:
1416                 printk(KERN_ERR
1417                         "hcall_vphn() experienced a hardware fault "
1418                         "preventing VPHN. Disabling polling...\n");
1419                 stop_topology_update();
1420         }
1421
1422         return rc;
1423 }
1424
1425 /*
1426  * Update the node maps and sysfs entries for each cpu whose home node
1427  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
1428  */
1429 int arch_update_cpu_topology(void)
1430 {
1431         int cpu, nid, old_nid, changed = 0;
1432         unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
1433         struct device *dev;
1434
1435         for_each_cpu(cpu,&cpu_associativity_changes_mask) {
1436                 vphn_get_associativity(cpu, associativity);
1437                 nid = associativity_to_nid(associativity);
1438
1439                 if (nid < 0 || !node_online(nid))
1440                         nid = first_online_node;
1441
1442                 old_nid = numa_cpu_lookup_table[cpu];
1443
1444                 /* Disable hotplug while we update the cpu
1445                  * masks and sysfs.
1446                  */
1447                 get_online_cpus();
1448                 unregister_cpu_under_node(cpu, old_nid);
1449                 unmap_cpu_from_node(cpu);
1450                 map_cpu_to_node(cpu, nid);
1451                 register_cpu_under_node(cpu, nid);
1452                 put_online_cpus();
1453
1454                 dev = get_cpu_device(cpu);
1455                 if (dev)
1456                         kobject_uevent(&dev->kobj, KOBJ_CHANGE);
1457                 changed = 1;
1458         }
1459
1460         return changed;
1461 }
1462
/* Work item body: rebuild the scheduler domains. @work is unused. */
static void topology_work_fn(struct work_struct *work)
{
        rebuild_sched_domains();
}

static DECLARE_WORK(topology_work, topology_work_fn);
1468
1469 void topology_schedule_update(void)
1470 {
1471         schedule_work(&topology_work);
1472 }
1473
1474 static void topology_timer_fn(unsigned long ignored)
1475 {
1476         if (!vphn_enabled)
1477                 return;
1478         if (update_cpu_associativity_changes_mask() > 0)
1479                 topology_schedule_update();
1480         set_topology_timer();
1481 }
1482 static struct timer_list topology_timer =
1483         TIMER_INITIALIZER(topology_timer_fn, 0, 0);
1484
1485 static void set_topology_timer(void)
1486 {
1487         topology_timer.data = 0;
1488         topology_timer.expires = jiffies + 60 * HZ;
1489         add_timer(&topology_timer);
1490 }
1491
1492 /*
1493  * Start polling for VPHN associativity changes.
1494  */
1495 int start_topology_update(void)
1496 {
1497         int rc = 0;
1498
1499         /* Disabled until races with load balancing are fixed */
1500         if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
1501             get_lppaca()->shared_proc) {
1502                 vphn_enabled = 1;
1503                 setup_cpu_associativity_change_counters();
1504                 init_timer_deferrable(&topology_timer);
1505                 set_topology_timer();
1506                 rc = 1;
1507         }
1508
1509         return rc;
1510 }
1511 __initcall(start_topology_update);
1512
1513 /*
1514  * Disable polling for VPHN associativity changes.
1515  */
1516 int stop_topology_update(void)
1517 {
1518         vphn_enabled = 0;
1519         return del_timer_sync(&topology_timer);
1520 }
1521 #endif /* CONFIG_PPC_SPLPAR */