Merge branches 'x86-alternatives-for-linus', 'x86-fpu-for-linus', 'x86-hwmon-for...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)
diff --git a/CREDITS b/CREDITS

index 41d8e63d5165b5b786db6ab7d8c14fbc49fc0107..494b6e4746d7b9d08f0334bab61651341d285068 100644 (file)
--- a/CREDITS
+++ b/CREDITS
@@ -2365,8 +2365,6 @@ E: acme@redhat.com
  W: http://oops.ghostprotocols.net:81/blog/
  P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD  841A B6AB 4681 9224 DF01
  D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks
-S: R. Brasílio Itiberê, 4270/1010 - Água Verde
-S: 80240-060 - Curitiba - Paraná
  S: Brazil
  
  N: Karsten Merker
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt

index a851118775d84c7a1d2356ba6a6c8e6208292887..6a8c73f55b80ca38601ba96f179565fe8b0b7ea0 100644 (file)
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,18 +1,22 @@
  CONFIG_RCU_TRACE debugfs Files and Formats
  
  
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state.  This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
  
  
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
  
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provide five debugfs files under the
  top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
  
  The output of "cat rcu/rcudata" looks as follows:
  
@@ -130,7 +134,8 @@ o   "ci" is the number of RCU callbacks that have been invoked for
         been registered in absence of CPU-hotplug activity.
  
  o      "co" is the number of RCU callbacks that have been orphaned due to
-       this CPU going offline.
+       this CPU going offline.  These orphaned callbacks have been moved
+       to an arbitrarily chosen online CPU.
  
  o      "ca" is the number of RCU callbacks that have been adopted due to
         other CPUs going offline.  Note that ci+co-ca+ql is the number of
@@ -168,12 +173,12 @@ o "gpnum" is the number of grace periods that have started.  It is
  
  The output of "cat rcu/rcuhier" looks as follows, with very long lines:
  
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
  1/1 .>. 0:127 ^0    
  3/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
  3/3f .>. 0:5 ^0    2/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3    
  rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
  0/1 .>. 0:127 ^0    
  0/3 .>. 0:35 ^0    0/0 .>. 36:71 ^1    0/0 .>. 72:107 ^2    0/0 .>. 108:127 ^3    
  0/3f .>. 0:5 ^0    0/3 .>. 6:11 ^1    0/0 .>. 12:17 ^2    0/0 .>. 18:23 ^3    0/0 .>. 24:29 ^4    0/0 .>. 30:35 ^5    0/0 .>. 36:41 ^0    0/0 .>. 42:47 ^1    0/0 .>. 48:53 ^2    0/0 .>. 54:59 ^3    0/0 .>. 60:65 ^4    0/0 .>. 66:71 ^5    0/0 .>. 72:77 ^0    0/0 .>. 78:83 ^1    0/0 .>. 84:89 ^2    0/0 .>. 90:95 ^3    0/0 .>. 96:101 ^4    0/0 .>. 102:107 ^5    0/0 .>. 108:113 ^0    0/0 .>. 114:119 ^1    0/0 .>. 120:125 ^2    0/0 .>. 126:127 ^3
@@ -212,11 +217,6 @@ o  "fqlh" is the number of calls to force_quiescent_state() that
         exited immediately (without even being counted in nfqs above)
         due to contention on ->fqslock.
  
-o      "oqlen" is the number of callbacks on the "orphan" callback
-       list.  RCU callbacks are placed on this list by CPUs going
-       offline, and are "adopted" either by the CPU helping the outgoing
-       CPU or by the next rcu_barrier*() call, whichever comes first.
-
  o      Each element of the form "1/1 0:127 ^0" represents one struct
         rcu_node.  Each line represents one level of the hierarchy, from
         root to leaves.  It is best to think of the rcu_data structures
@@ -326,3 +326,115 @@ o "nn" is the number of times that this CPU needed nothing.  Alert
         readers will note that the rcu "nn" number for a given CPU very
         closely matches the rcu_bh "np" number for that same CPU.  This
         is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provide a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+             ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+             normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+             exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds.  The fields are as follows:
+
+o      "qlen" is the number of RCU callbacks currently waiting either
+       for an RCU grace period or waiting to be invoked.  This is the
+       only field present for rcu_sched and rcu_bh, due to the
+       short-circuiting of grace periods in those two cases.
+
+o      "gp" is the number of grace periods that have completed.
+
+o      "g197/p197/c197" displays the grace-period state, with the
+       "g" number being the number of grace periods that have started
+       (mod 256), the "p" number being the number of grace periods
+       that the CPU has responded to (also mod 256), and the "c"
+       number being the number of grace periods that have completed
+       (once again mod 256).
+
+       Why have both "gp" and "g"?  Because the data flowing into
+       "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o      "tasks" is a set of bits.  The first bit is "T" if there are
+       currently tasks that have recently blocked within an RCU
+       read-side critical section, the second bit is "N" if any of the
+       aforementioned tasks are blocking the current RCU grace period,
+       and the third bit is "E" if any of the aforementioned tasks are
+       blocking the current expedited grace period.  Each bit is "."
+       if the corresponding condition does not hold.
+
+o      "ttb" is a single bit.  It is "B" if any of the blocked tasks
+       need to be priority boosted and "." otherwise.
+
+o      "btg" indicates whether boosting has been carried out during
+       the current grace period, with "exp" indicating that boosting
+       is in progress for an expedited grace period, "no" indicating
+       that boosting has not yet started for a normal grace period,
+       "begun" indicating that boosting has begun for a normal grace
+       period, and "done" indicating that boosting has completed for
+       a normal grace period.
+
+o      "ntb" is the total number of tasks subjected to RCU priority boosting
+       since boot.
+
+o      "neb" is the number of expedited grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "nnb" is the number of normal grace periods that have had
+       to resort to RCU priority boosting since boot.
+
+o      "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o      "bt" is the low-order 12 bits of the value that the jiffies counter
+       will have at the next time that boosting is scheduled to begin.
+
+o      In the line beginning with "normal balk", the fields are as follows:
+
+       o       "nt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+               Note that the system will balk from boosting even if the
+               grace period is overdue when the currently running task
+               is looping within an RCU read-side critical section.
+               There is no point in boosting in this case, because
+               boosting a running task won't make it run any faster.
+
+       o       "gt" is the number of times that the system balked
+               from boosting because, although there were blocked tasks,
+               none of them were preventing the current grace period
+               from completing.
+
+       o       "bt" is the number of times that the system balked
+               from boosting because boosting was already in progress.
+
+       o       "b" is the number of times that the system balked from
+               boosting because boosting had already completed for
+               the grace period in question.
+
+       o       "ny" is the number of times that the system balked from
+               boosting because it was not yet time to start boosting
+               the grace period in question.
+
+       o       "nos" is the number of times that the system balked from
+               boosting for inexplicable ("not otherwise specified")
+               reasons.  This can actually happen due to races involving
+               increments of the jiffies counter.
+
+o      In the line beginning with "exp balk", the fields are as follows:
+
+       o       "bt" is the number of times that the system balked from
+               boosting because there were no blocked tasks to boost.
+
+       o       "nos" is the number of times that the system balked from
+                boosting for inexplicable ("not otherwise specified")
+                reasons.
diff --git a/Documentation/dontdiff b/Documentation/dontdiff

index d9bcffd594331d7b52a5608269327a7c6642af31..470d3dba1a69aa48c55d98bea3c70d094f458de6 100644 (file)
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -62,6 +62,10 @@ aic7*reg_print.c*
  aic7*seq.h*
  aicasm
  aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
  asm-offsets.h
  asm_offsets.h
  autoconf.h*
@@ -76,6 +80,7 @@ btfixupprep
  build
  bvmlinux
  bzImage*
+capflags.c
  classlist.h*
  comp*.log
  compile.h*
@@ -94,6 +99,7 @@ devlist.h*
  docproc
  elf2ecoff
  elfconfig.h*
+evergreen_reg_safe.h
  fixdep
  flask.h
  fore200e_mkfirm
@@ -108,9 +114,16 @@ genksyms
  *_gray256.c
  ihex2fw
  ikconfig.h*
+inat-tables.c
  initramfs_data.cpio
  initramfs_data.cpio.gz
  initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
  kallsyms
  kconfig
  keywords.c
@@ -140,6 +153,7 @@ mkprep
  mktables
  mktree
  modpost
+modules.builtin
  modules.order
  modversions.h*
  ncscope.*
@@ -153,14 +167,23 @@ pca200e.bin
  pca200e_ecd.bin2
  piggy.gz
  piggyback
+piggy.S
  pnmtologo
  ppc_defs.h*
  pss_boot.h
  qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
  raid6altivec*.c
  raid6int*.c
  raid6tables.c
  relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
  series
  setup
  setup.bin
@@ -169,6 +192,7 @@ sImage
  sm_tbl*
  split-include
  syscalltab.h
+tables.c
  tags
  tftpboot.img
  timeconst.h
@@ -190,6 +214,7 @@ vmlinux
  vmlinux-*
  vmlinux.aout
  vmlinux.lds
+voffset.h
  vsyscall.lds
  vsyscall_32.lds
  wanxlfw.inc
@@ -200,3 +225,4 @@ wakeup.elf
  wakeup.lds
  zImage*
  zconf.hash.c
+zoffset.h
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index b6426f15b4ae85f5469b962806004237b9b5607e..33fa3e5d38fd7480d2ddd136b68f77fef7734c1d 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -18,7 +18,6 @@ prototypes:
         char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
  
  locking rules:
-       none have BKL
                 dcache_lock     rename_lock     ->d_lock        may block
  d_revalidate:  no              no              no              yes
  d_hash         no              no              no              yes
@@ -42,18 +41,23 @@ ata *);
         int (*rename) (struct inode *, struct dentry *,
                         struct inode *, struct dentry *);
         int (*readlink) (struct dentry *, char __user *,int);
-       int (*follow_link) (struct dentry *, struct nameidata *);
+       void * (*follow_link) (struct dentry *, struct nameidata *);
+       void (*put_link) (struct dentry *, struct nameidata *, void *);
         void (*truncate) (struct inode *);
         int (*permission) (struct inode *, int, struct nameidata *);
+       int (*check_acl)(struct inode *, int);
         int (*setattr) (struct dentry *, struct iattr *);
         int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
         int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
         ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
         ssize_t (*listxattr) (struct dentry *, char *, size_t);
         int (*removexattr) (struct dentry *, const char *);
+       void (*truncate_range)(struct inode *, loff_t, loff_t);
+       long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
+       int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
  
  locking rules:
-       all may block, none have BKL
+       all may block
                 i_mutex(inode)
  lookup:                yes
  create:                yes
@@ -66,19 +70,24 @@ rmdir:              yes (both)      (see below)
  rename:                yes (all)       (see below)
  readlink:      no
  follow_link:   no
+put_link:      no
  truncate:      yes             (see below)
  setattr:       yes
  permission:    no
+check_acl:     no
  getattr:       no
  setxattr:      yes
  getxattr:      no
  listxattr:     no
  removexattr:   yes
+truncate_range:        yes
+fallocate:     no
+fiemap:                no
         Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
  victim.
         cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
         ->truncate() is never called directly - it's a callback, not a
-method. It's called by vmtruncate() - library function normally used by
+method. It's called by vmtruncate() - deprecated library function used by
  ->setattr(). Locking information above applies to that call (i.e. is
  inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
  passed).
@@ -91,7 +100,7 @@ prototypes:
         struct inode *(*alloc_inode)(struct super_block *sb);
         void (*destroy_inode)(struct inode *);
         void (*dirty_inode) (struct inode *);
-       int (*write_inode) (struct inode *, int);
+       int (*write_inode) (struct inode *, struct writeback_control *wbc);
         int (*drop_inode) (struct inode *);
         void (*evict_inode) (struct inode *);
         void (*put_super) (struct super_block *);
@@ -105,10 +114,10 @@ prototypes:
         int (*show_options)(struct seq_file *, struct vfsmount *);
         ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
         ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+       int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
  
  locking rules:
         All may block [not true, see below]
-       None have BKL
                         s_umount
  alloc_inode:
  destroy_inode:
@@ -127,6 +136,7 @@ umount_begin:               no
  show_options:          no              (namespace_sem)
  quota_read:            no              (see below)
  quota_write:           no              (see below)
+bdev_try_to_free_page: no              (see below)
  
  ->statfs() has s_umount (shared) when called by ustat(2) (native or
  compat), but that's an accident of bad API; s_umount is used to pin
@@ -139,19 +149,25 @@ be the only ones operating on the quota file by the quota code (via
  dqio_sem) (unless an admin really wants to screw up something and
  writes to quota files with quotas on). For other details about locking
  see also dquot_operations section.
+->bdev_try_to_free_page is called from the ->releasepage handler of
+the block device inode.  See there for more details.
  
  --------------------------- file_system_type ---------------------------
  prototypes:
         int (*get_sb) (struct file_system_type *, int,
                        const char *, void *, struct vfsmount *);
+       struct dentry *(*mount) (struct file_system_type *, int,
+                      const char *, void *);
         void (*kill_sb) (struct super_block *);
  locking rules:
-               may block       BKL
-get_sb         yes             no
-kill_sb                yes             no
+               may block
+get_sb         yes
+mount          yes
+kill_sb                yes
  
  ->get_sb() returns error or 0 with locked superblock attached to the vfsmount
  (exclusive on ->s_umount).
+->mount() returns ERR_PTR or the root dentry.
  ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
  unlocks and drops the reference.
  
@@ -176,27 +192,35 @@ prototypes:
         void (*freepage)(struct page *);
         int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
                         loff_t offset, unsigned long nr_segs);
-       int (*launder_page) (struct page *);
+       int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
+                               unsigned long *);
+       int (*migratepage)(struct address_space *, struct page *, struct page *);
+       int (*launder_page)(struct page *);
+       int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+       int (*error_remove_page)(struct address_space *, struct page *);
  
  locking rules:
         All except set_page_dirty and freepage may block
  
-                       BKL     PageLocked(page)        i_mutex
-writepage:             no      yes, unlocks (see below)
-readpage:              no      yes, unlocks
-sync_page:             no      maybe
-writepages:            no
-set_page_dirty         no      no
-readpages:             no
-write_begin:           no      locks the page          yes
-write_end:             no      yes, unlocks            yes
-perform_write:         no      n/a                     yes
-bmap:                  no
-invalidatepage:                no      yes
-releasepage:           no      yes
-freepage:              no      yes
-direct_IO:             no
-launder_page:          no      yes
+                       PageLocked(page)        i_mutex
+writepage:             yes, unlocks (see below)
+readpage:              yes, unlocks
+sync_page:             maybe
+writepages:
+set_page_dirty         no
+readpages:
+write_begin:           locks the page          yes
+write_end:             yes, unlocks            yes
+bmap:
+invalidatepage:                yes
+releasepage:           yes
+freepage:              yes
+direct_IO:
+get_xip_mem:                                   maybe
+migratepage:           yes (both)
+launder_page:          yes
+is_partially_uptodate: yes
+error_remove_page:     yes
  
         ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
  may be called from the request handler (/dev/loop).
@@ -276,9 +300,8 @@ under spinlock (it cannot block) and is sometimes called with the page
  not locked.
  
         ->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
-filesystems and by the swapper. The latter will eventually go away. All
-instances do not actually need the BKL. Please, keep it that way and don't
-breed new callers.
+filesystems and by the swapper. The latter will eventually go away.  Please,
+keep it that way and don't breed new callers.
  
         ->invalidatepage() is called when the filesystem must attempt to drop
  some or all of the buffers from the page when it is being truncated.  It
@@ -299,47 +322,37 @@ cleaned, or an error value if not. Note that in order to prevent the page
  getting mapped back in and redirtied, it needs to be kept locked
  across the entire operation.
  
-       Note: currently almost all instances of address_space methods are
-using BKL for internal serialization and that's one of the worst sources
-of contention. Normally they are calling library functions (in fs/buffer.c)
-and pass foo_get_block() as a callback (on local block-based filesystems,
-indeed). BKL is not needed for library stuff and is usually taken by
-foo_get_block(). It's an overkill, since block bitmaps can be protected by
-internal fs locking and real critical areas are much smaller than the areas
-filesystems protect now.
-
  ----------------------- file_lock_operations ------------------------------
  prototypes:
-       void (*fl_insert)(struct file_lock *);  /* lock insertion callback */
-       void (*fl_remove)(struct file_lock *);  /* lock removal callback */
         void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
         void (*fl_release_private)(struct file_lock *);
  
  
  locking rules:
-                       BKL     may block
-fl_insert:             yes     no
-fl_remove:             yes     no
-fl_copy_lock:          yes     no
-fl_release_private:    yes     yes
+                       file_lock_lock  may block
+fl_copy_lock:          yes             no
+fl_release_private:    maybe           no
  
  ----------------------- lock_manager_operations ---------------------------
  prototypes:
         int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
         void (*fl_notify)(struct file_lock *);  /* unblock callback */
+       int (*fl_grant)(struct file_lock *, struct file_lock *, int);
         void (*fl_release_private)(struct file_lock *);
         void (*fl_break)(struct file_lock *); /* break_lease callback */
+       int (*fl_mylease)(struct file_lock *, struct file_lock *);
+       int (*fl_change)(struct file_lock **, int);
  
  locking rules:
-                       BKL     may block
-fl_compare_owner:      yes     no
-fl_notify:             yes     no
-fl_release_private:    yes     yes
-fl_break:              yes     no
-
-       Currently only NFSD and NLM provide instances of this class. None of the
-them block. If you have out-of-tree instances - please, show up. Locking
-in that area will change.
+                       file_lock_lock  may block
+fl_compare_owner:      yes             no
+fl_notify:             yes             no
+fl_grant:              no              no
+fl_release_private:    maybe           no
+fl_break:              yes             no
+fl_mylease:            yes             no
+fl_change              yes             no
+
  --------------------------- buffer_head -----------------------------------
  prototypes:
         void (*b_end_io)(struct buffer_head *bh, int uptodate);
@@ -364,17 +377,17 @@ prototypes:
         void (*swap_slot_free_notify) (struct block_device *, unsigned long);
  
  locking rules:
-                       BKL     bd_mutex
-open:                  no      yes
-release:               no      yes
-ioctl:                 no      no
-compat_ioctl:          no      no
-direct_access:         no      no
-media_changed:         no      no
-unlock_native_capacity:        no      no
-revalidate_disk:       no      no
-getgeo:                        no      no
-swap_slot_free_notify: no      no      (see below)
+                       bd_mutex
+open:                  yes
+release:               yes
+ioctl:                 no
+compat_ioctl:          no
+direct_access:         no
+media_changed:         no
+unlock_native_capacity:        no
+revalidate_disk:       no
+getgeo:                        no
+swap_slot_free_notify: no      (see below)
  
  media_changed, unlock_native_capacity and revalidate_disk are called only from
  check_disk_change().
@@ -413,34 +426,21 @@ prototypes:
         unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                         unsigned long, unsigned long, unsigned long);
         int (*check_flags)(int);
+       int (*flock) (struct file *, int, struct file_lock *);
+       ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
+                       size_t, unsigned int);
+       ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
+                       size_t, unsigned int);
+       int (*setlease)(struct file *, long, struct file_lock **);
  };
  
  locking rules:
-       All may block.
-                       BKL
-llseek:                        no      (see below)
-read:                  no
-aio_read:              no
-write:                 no
-aio_write:             no
-readdir:               no
-poll:                  no
-unlocked_ioctl:                no
-compat_ioctl:          no
-mmap:                  no
-open:                  no
-flush:                 no
-release:               no
-fsync:                 no      (see below)
-aio_fsync:             no
-fasync:                        no
-lock:                  yes
-readv:                 no
-writev:                        no
-sendfile:              no
-sendpage:              no
-get_unmapped_area:     no
-check_flags:           no
+       All may block except for ->setlease.
+       No VFS locks held on entry except for ->fsync and ->setlease.
+
+->fsync() has i_mutex on inode.
+
+->setlease has the file_list_lock held and must not sleep.
  
  ->llseek() locking has moved from llseek to the individual llseek
  implementations.  If your fs is not using generic_file_llseek, you
@@ -450,17 +450,10 @@ mutex or just to use i_size_read() instead.
  Note: this does not protect the file->f_pos against concurrent modifications
  since this is something the userspace has to take care about.
  
-Note: ext2_release() was *the* source of contention on fs-intensive
-loads and dropping BKL on ->release() helps to get rid of that (we still
-grab BKL for cases when we close a file that had been opened r/w, but that
-can and should be done using the internal locking with smaller critical areas).
-Current worst offender is ext2_get_block()...
-
-->fasync() is called without BKL protection, and is responsible for
-maintaining the FASYNC bit in filp->f_flags.  Most instances call
-fasync_helper(), which does that maintenance, so it's not normally
-something one needs to worry about.  Return values > 0 will be mapped to
-zero in the VFS layer.
+->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
+Most instances call fasync_helper(), which does that maintenance, so it's
+not normally something one needs to worry about.  Return values > 0 will be
+mapped to zero in the VFS layer.
  
  ->readdir() and ->ioctl() on directories must be changed. Ideally we would
  move ->readdir() to inode_operations and use a separate method for directory
@@ -471,8 +464,6 @@ components. And there are other reasons why the current interface is a mess...
  ->read on directories probably must go away - we should just enforce -EISDIR
  in sys_read() and friends.
  
-->fsync() has i_mutex on inode.
-
  --------------------------- dquot_operations -------------------------------
  prototypes:
         int (*write_dquot) (struct dquot *);
@@ -507,12 +498,12 @@ prototypes:
         int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
  
  locking rules:
-               BKL     mmap_sem        PageLocked(page)
-open:          no      yes
-close:         no      yes
-fault:         no      yes             can return with page locked
-page_mkwrite:  no      yes             can return with page locked
-access:                no      yes
+               mmap_sem        PageLocked(page)
+open:          yes
+close:         yes
+fault:         yes             can return with page locked
+page_mkwrite:  yes             can return with page locked
+access:                yes
  
         ->fault() is called when a previously not present pte is about
  to be faulted in. The filesystem must find and return the page associated
@@ -539,6 +530,3 @@ VM_IO | VM_PFNMAP VMAs.
  
  (if you break something or notice that it is broken and do not fix it yourself
  - at least put it here)
-
-ipc/shm.c::shm_delete() - may need BKL.
-->read() and ->write() in many drivers are (probably) missing BKL.
diff --git a/Documentation/kernel-docs.txt b/Documentation/kernel-docs.txt

index 715eaaf1519dd25fa0d4011684cd263bc12b2d2f..9a8674629a07598eb553970e1207b31b42ec0316 100644 (file)
--- a/Documentation/kernel-docs.txt
+++ b/Documentation/kernel-docs.txt
@@ -537,7 +537,7 @@
         Notes: Further information in
         http://www.oreilly.com/catalog/linuxdrive2/
  
-     * Title: "Linux Device Drivers, 3nd Edition"
+     * Title: "Linux Device Drivers, 3rd Edition"
         Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
         Publisher: O'Reilly & Associates.
         Date: 2005.
@@ -592,14 +592,6 @@
         Pages: 600.
         ISBN: 0-13-101908-2
  
-     * Title:  "The  Design  and Implementation of the 4.4 BSD UNIX
-       Operating System"
-       Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
-       John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1996.
-       ISBN: 0-201-54979-4
-
       * Title: "Programming for the real world - POSIX.4"
         Author: Bill O. Gallmeister.
         Publisher: O'Reilly & Associates, Inc..
@@ -610,28 +602,13 @@
         POSIX. Good reference.
  
       * Title:  "UNIX  Systems  for  Modern Architectures: Symmetric
-       Multiprocesssing and Caching for Kernel Programmers"
+       Multiprocessing and Caching for Kernel Programmers"
         Author: Curt Schimmel.
         Publisher: Addison Wesley.
         Date: June, 1994.
         Pages: 432.
         ISBN: 0-201-63338-8
  
-     * Title:  "The  Design  and Implementation of the 4.3 BSD UNIX
-       Operating System"
-       Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
-       Karels, John S. Quarterman.
-       Publisher: Addison-Wesley.
-       Date: 1989 (reprinted with corrections on October, 1990).
-       ISBN: 0-201-06196-1
-
-     * Title: "The Design of the UNIX Operating System"
-       Author: Maurice J. Bach.
-       Publisher: Prentice Hall.
-       Date: 1986.
-       Pages: 471.
-       ISBN: 0-13-201757-1
-
       MISCELLANEOUS:
  
       * Name: linux/Documentation
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt

index 8b61c93609994dd91e36c25e1b29647ad084eaff..f3dc951e949f04255d90f35b5da4b78c7d015a67 100644 (file)
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1579,20 +1579,12 @@ and is between 256 and 4096 characters. It is defined in the file
  
         nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
                         Format: [panic,][num]
-                       Valid num: 0,1,2
+                       Valid num: 0
                         0 - turn nmi_watchdog off
-                       1 - use the IO-APIC timer for the NMI watchdog
-                       2 - use the local APIC for the NMI watchdog using
-                       a performance counter. Note: This will use one
-                       performance counter and the local APIC's performance
-                       vector.
                         When panic is specified, panic when an NMI watchdog
                         timeout occurs.
                         This is useful when you use a panic=... timeout and
                         need the box quickly up again.
-                       Instead of 1 and 2 it is possible to use the following
-                       symbolic names: lapic and ioapic
-                       Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
  
         netpoll.carrier_timeout=
                         [NET] Specifies amount of time (in seconds) that
@@ -1622,6 +1614,8 @@ and is between 256 and 4096 characters. It is defined in the file
         noapic          [SMP,APIC] Tells the kernel to not make use of any
                         IOAPICs that may be present in the system.
  
+       noautogroup     Disable scheduler automatic task group creation.
+
         nobats          [PPC] Do not use BATs for mapping kernel lowmem
                         on "Classic" PPC cores.
  
@@ -1759,7 +1753,7 @@ and is between 256 and 4096 characters. It is defined in the file
  
         nousb           [USB] Disable the USB subsystem
  
-       nowatchdog      [KNL] Disable the lockup detector.
+       nowatchdog      [KNL] Disable the lockup detector (NMI watchdog).
  
         nowb            [ARM]
  
@@ -2467,12 +2461,13 @@ and is between 256 and 4096 characters. It is defined in the file
                         to facilitate early boot debugging.
                         See also Documentation/trace/events.txt
  
-       tsc=            Disable clocksource-must-verify flag for TSC.
+       tsc=            Disable clocksource stability checks for TSC.
                         Format: <string>
                         [x86] reliable: mark tsc clocksource as reliable, this
-                       disables clocksource verification at runtime.
-                       Used to enable high-resolution timer mode on older
-                       hardware, and in virtualized environment.
+                       disables clocksource verification at runtime, as well
+                       as the stability checks done at bootup. Used to enable
+                       high-resolution timer mode on older hardware, and in
+                       virtualized environment.
                         [x86] noirqtime: Do not use TSC to do irq accounting.
                         Used to run time disable IRQ_TIME_ACCOUNTING on any
                         platforms where RDTSC is slow and this accounting
diff --git a/Documentation/trace/events-power.txt b/Documentation/trace/events-power.txt

new file mode 100644 (file)

index 0000000..96d87b6
--- /dev/null
+++ b/Documentation/trace/events-power.txt
@@ -0,0 +1,90 @@
+
+                       Subsystem Trace Points: power
+
+The power tracing system captures events related to power transitions
+within the kernel. Broadly speaking there are three major subheadings:
+
+  o Power state switch which reports events related to suspend (S-states),
+     cpuidle (C-states) and cpufreq (P-states)
+  o System clock related changes
+  o Power domains related changes and transitions
+
+This document describes what each of the tracepoints is and why they
+might be useful.
+
+Cf. include/trace/events/power.h for the events definitions.
+
+1. Power state switch events
+============================
+
+1.1 New trace API
+-----------------
+
+A 'cpu' event class gathers the CPU-related events: cpuidle and
+cpufreq.
+
+cpu_idle               "state=%lu cpu_id=%lu"
+cpu_frequency          "state=%lu cpu_id=%lu"
+
+A suspend event is used to indicate the system going in and out of the
+suspend mode:
+
+machine_suspend                "state=%lu"
+
+
+Note: the value of '-1' or '4294967295' for state means an exit from the current state,
+i.e. trace_cpu_idle(4, smp_processor_id()) means that the system
+enters the idle state 4, while trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id())
+means that the system exits the previous idle state.
+
+The event which has 'state=4294967295' in the trace is very important to the user
+space tools which are using it to detect the end of the current state, and so to
+correctly draw the states diagrams and to calculate accurate statistics etc.
+
+1.2 DEPRECATED trace API
+------------------------
+
+A new Kconfig option CONFIG_EVENT_POWER_TRACING_DEPRECATED with the default value of
+'y' has been created. This allows the legacy trace power API to be used conjointly
+with the new trace API.
+The Kconfig option, the old trace API (in include/trace/events/power.h) and the
+old trace points will disappear in a future release (namely 2.6.41).
+
+power_start            "type=%lu state=%lu cpu_id=%lu"
+power_frequency                "type=%lu state=%lu cpu_id=%lu"
+power_end              "cpu_id=%lu"
+
+The 'type' parameter takes one of these macros:
+ . POWER_NONE  = 0,
+ . POWER_CSTATE        = 1,    /* C-State */
+ . POWER_PSTATE        = 2,    /* Frequency change or DVFS */
+
+The 'state' parameter is set depending on the type:
+ . Target C-state for type=POWER_CSTATE,
+ . Target frequency for type=POWER_PSTATE,
+
+power_end is used to indicate the exit of a state, corresponding to the latest
+power_start event.
+
+2. Clocks events
+================
+The clock events are used for clock enable/disable and for
+clock rate change.
+
+clock_enable           "%s state=%lu cpu_id=%lu"
+clock_disable          "%s state=%lu cpu_id=%lu"
+clock_set_rate         "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the clock name (e.g. "gpio1_iclk").
+The second parameter is '1' for enable, '0' for disable, the target
+clock rate for set_rate.
+
+3. Power domains events
+=======================
+The power domain events are used for power domain transitions.
+
+power_domain_target    "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the power domain name (e.g. "mpu_pwrdm").
+The second parameter is the power domain target state.
+
diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt

index 30b43e1b26979cee024aa7636e250520b8ba235f..bdeb81ccb5f61b973280f9704f28a9f9cd8cc2e1 100644 (file)
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -600,6 +600,7 @@ Protocol:   2.07+
    0x00000001   lguest
    0x00000002   Xen
    0x00000003   Moorestown MID
+  0x00000004   CE4100 TV Platform
  
  Field name:    hardware_subarch_data
  Type:          write (subarch-dependent)
diff --git a/MAINTAINERS b/MAINTAINERS

index 71e40f9118df5d9d1f002a84f48e8600ac533176..c5c7292daba076e439843e3b8c5c09a5d96d2285 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -792,11 +792,14 @@ S:        Maintained
  
  ARM/NOMADIK ARCHITECTURE
  M:     Alessandro Rubini <rubini@unipv.it>
+M:     Linus Walleij <linus.walleij@stericsson.com>
  M:     STEricsson <STEricsson_nomadik_linux@list.st.com>
  L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:     Maintained
  F:     arch/arm/mach-nomadik/
  F:     arch/arm/plat-nomadik/
+F:     drivers/i2c/busses/i2c-nomadik.c
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
  
  ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
  M:     Nelson Castillo <arhuaco@freaks-unidos.net>
@@ -998,12 +1001,24 @@ F:       drivers/i2c/busses/i2c-stu300.c
  F:     drivers/rtc/rtc-coh901331.c
  F:     drivers/watchdog/coh901327_wdt.c
  F:     drivers/dma/coh901318*
+F:     drivers/mfd/ab3100*
+F:     drivers/rtc/rtc-ab3100.c
+F:     drivers/rtc/rtc-coh901331.c
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
  
-ARM/U8500 ARM ARCHITECTURE
+ARM/Ux500 ARM ARCHITECTURE
  M:     Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
+M:     Linus Walleij <linus.walleij@stericsson.com>
  L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:     Maintained
  F:     arch/arm/mach-ux500/
+F:     drivers/dma/ste_dma40*
+F:     drivers/mfd/ab3550*
+F:     drivers/mfd/abx500*
+F:     drivers/mfd/ab8500*
+F:     drivers/mfd/stmpe*
+F:     drivers/rtc/rtc-ab8500.c
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
  
  ARM/VFP SUPPORT
  M:     Russell King <linux@arm.linux.org.uk>
@@ -2797,6 +2812,10 @@ M:       Thomas Gleixner <tglx@linutronix.de>
  S:     Maintained
  F:     Documentation/timers/
  F:     kernel/hrtimer.c
+F:     kernel/time/clockevents.c
+F:     kernel/time/tick*.*
+F:     kernel/time/timer_*.c
+F:     include/linux/clockevents.h
  F:     include/linux/hrtimer.h
  
  HIGH-SPEED SCC DRIVER FOR AX.25
@@ -4612,7 +4631,7 @@ PERFORMANCE EVENTS SUBSYSTEM
  M:     Peter Zijlstra <a.p.zijlstra@chello.nl>
  M:     Paul Mackerras <paulus@samba.org>
  M:     Ingo Molnar <mingo@elte.hu>
-M:     Arnaldo Carvalho de Melo <acme@redhat.com>
+M:     Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
  S:     Supported
  F:     kernel/perf_event*.c
  F:     include/linux/perf_event.h
@@ -5127,6 +5146,18 @@ L:       alsa-devel@alsa-project.org (moderated for non-subscribers)
  S:     Supported
  F:     sound/soc/s3c24xx
  
+TIMEKEEPING, NTP
+M:     John Stultz <johnstul@us.ibm.com>
+M:     Thomas Gleixner <tglx@linutronix.de>
+S:     Supported
+F:     include/linux/clocksource.h
+F:     include/linux/time.h
+F:     include/linux/timex.h
+F:     include/linux/timekeeping.h
+F:     kernel/time/clocksource.c
+F:     kernel/time/time*.c
+F:     kernel/time/ntp.c
+
  TLG2300 VIDEO4LINUX-2 DRIVER
  M:     Huang Shijie <shijie8@gmail.com>
  M:     Kang Yong <kangyong@telegent.com>
diff --git a/Makefile b/Makefile

index e7c41f1344e5e8064e796149024255ec47ccab5f..74b25559f831c5c48150789ed3230921635e0ae0 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  VERSION = 2
  PATCHLEVEL = 6
  SUBLEVEL = 37
-EXTRAVERSION = -rc8
+EXTRAVERSION =
  NAME = Flesh-Eating Bats with Fangs
  
  # *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig

index 8bf0fa652eb63c57dec1ebfec1a93a407be4ed32..f78c2be4242b437ced3308795952102bf1359763 100644 (file)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -175,4 +175,7 @@ config HAVE_PERF_EVENTS_NMI
  config HAVE_ARCH_JUMP_LABEL
         bool
  
+config HAVE_ARCH_MUTEX_CPU_RELAX
+       bool
+
  source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/include/asm/perf_event.h b/arch/alpha/include/asm/perf_event.h

index fe792ca818f64c4d9954e8b62e5f8064a5d5777e..5996e7a6757e4e058d4d7e7f9d10f76fbf6b6d20 100644 (file)
--- a/arch/alpha/include/asm/perf_event.h
+++ b/arch/alpha/include/asm/perf_event.h
@@ -1,10 +1,4 @@
  #ifndef __ASM_ALPHA_PERF_EVENT_H
  #define __ASM_ALPHA_PERF_EVENT_H
  
-#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
-#else
-static inline void init_hw_perf_events(void)    { }
-#endif
-
  #endif /* __ASM_ALPHA_PERF_EVENT_H */
diff --git a/arch/alpha/kernel/irq_alpha.c b/arch/alpha/kernel/irq_alpha.c

index 5f77afb88e898b849f57e5c115f4688282e71c62..4c8bb374eb0a288d03d2cf5ad124e914638527f2 100644 (file)
--- a/arch/alpha/kernel/irq_alpha.c
+++ b/arch/alpha/kernel/irq_alpha.c
@@ -112,8 +112,6 @@ init_IRQ(void)
         wrent(entInt, 0);
  
         alpha_mv.init_irq();
-
-       init_hw_perf_events();
  }
  
  /*
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c

index 1cc49683fb69b2a5f96639e71a2f1af821479e77..90561c45e7d8928e8e137e33164d2d2d661a28b8 100644 (file)
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -14,6 +14,7 @@
  #include <linux/kernel.h>
  #include <linux/kdebug.h>
  #include <linux/mutex.h>
+#include <linux/init.h>
  
  #include <asm/hwrpb.h>
  #include <asm/atomic.h>
@@ -863,13 +864,13 @@ static void alpha_perf_event_irq_handler(unsigned long la_ptr,
  /*
   * Init call to initialise performance events at kernel startup.
   */
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
  {
         pr_info("Performance events: ");
  
         if (!supported_cpu()) {
                 pr_cont("No support for your CPU.\n");
-               return;
+               return 0;
         }
  
         pr_cont("Supported CPU type!\n");
@@ -881,6 +882,8 @@ void __init init_hw_perf_events(void)
         /* And set up PMU specification */
         alpha_pmu = &ev67_pmu;
  
-       perf_pmu_register(&pmu);
-}
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
  
+       return 0;
+}
+early_initcall(init_hw_perf_events);
diff --git a/arch/arm/include/asm/hardware/it8152.h b/arch/arm/include/asm/hardware/it8152.h

index 21fa272301f804b9bad3218b74147f6e450fa026..b2f95c72287c861dc03c09e110242982bb51a5c5 100644 (file)
--- a/arch/arm/include/asm/hardware/it8152.h
+++ b/arch/arm/include/asm/hardware/it8152.h
@@ -76,6 +76,7 @@ extern unsigned long it8152_base_address;
    IT8152_PD_IRQ(0)  Audio controller (ACR)
   */
  #define IT8152_IRQ(x)   (IRQ_BOARD_START + (x))
+#define IT8152_LAST_IRQ        (IRQ_BOARD_START + 40)
  
  /* IRQ-sources in 3 groups - local devices, LPC (serial), and external PCI */
  #define IT8152_LD_IRQ_COUNT     9
diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h

index 1fc684e70ab6a6c9915ad5fbd302066ccb050a29..7080e2c8fa6209e0445b59a7b33ba8bd825797af 100644 (file)
--- a/arch/arm/include/asm/highmem.h
+++ b/arch/arm/include/asm/highmem.h
@@ -25,9 +25,6 @@ extern void *kmap_high(struct page *page);
  extern void *kmap_high_get(struct page *page);
  extern void kunmap_high(struct page *page);
  
-extern void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte);
-extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte);
-
  /*
   * The following functions are already defined by <linux/highmem.h>
   * when CONFIG_HIGHMEM is not set.
diff --git a/arch/arm/include/asm/sizes.h b/arch/arm/include/asm/sizes.h

index 4fc1565e4f930860722bca331330d0606e494d1c..316bb2b2be3dd7ca64638ea2d13d502e66ee5fde 100644 (file)
--- a/arch/arm/include/asm/sizes.h
+++ b/arch/arm/include/asm/sizes.h
@@ -13,9 +13,6 @@
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   */
-/* DO NOT EDIT!! - this file automatically generated
- *                 from .s file by awk -f s2h.awk
- */
  /*  Size definitions
   *  Copyright (C) ARM Limited 1998. All rights reserved.
   */
@@ -25,6 +22,9 @@
  
  /* handy sizes */
  #define SZ_16                          0x00000010
+#define SZ_32                          0x00000020
+#define SZ_64                          0x00000040
+#define SZ_128                         0x00000080
  #define SZ_256                         0x00000100
  #define SZ_512                         0x00000200
  
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h

index 1120f18a6b17695e48c37a6b4d1d7e50d306e478..80025948b8ad378e29d162030e962bb1dd462255 100644 (file)
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -150,6 +150,7 @@ extern unsigned int user_debug;
  #define rmb()          dmb()
  #define wmb()          mb()
  #else
+#include <asm/memory.h>
  #define mb()   do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
  #define rmb()  do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
  #define wmb()  do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S

index 8bfa98757cd2f3fc9ef128011944a55fa7838ffd..80bf8cd88d7c522c319354c75b13d5484b9ede96 100644 (file)
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -29,6 +29,9 @@ ret_fast_syscall:
         ldr     r1, [tsk, #TI_FLAGS]
         tst     r1, #_TIF_WORK_MASK
         bne     fast_work_pending
+#if defined(CONFIG_IRQSOFF_TRACER)
+       asm_trace_hardirqs_on
+#endif
  
         /* perform architecture specific actions before user return */
         arch_ret_to_user r1, lr
@@ -65,6 +68,9 @@ ret_slow_syscall:
         tst     r1, #_TIF_WORK_MASK
         bne     work_pending
  no_work_pending:
+#if defined(CONFIG_IRQSOFF_TRACER)
+       asm_trace_hardirqs_on
+#endif
         /* perform architecture specific actions before user return */
         arch_ret_to_user r1, lr
  
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c

index 07a50357492ac6858bc21d0b7913aefc93cbb1cf..fdfa4976b0bfeca637178609bb5e8d51e63ac672 100644 (file)
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -3034,11 +3034,11 @@ init_hw_perf_events(void)
                 pr_info("no hardware support available\n");
         }
  
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
  
         return 0;
  }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
  
  /*
   * Callchain handling code.
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c

index 8c1959590252e7161f1da38497eddba9b0538afb..9066473c0ebc3991a2f793e8c78616e26b31decb 100644 (file)
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -310,7 +310,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
          * All kernel threads share the same mm context; grab a
          * reference and switch to it.
          */
-       atomic_inc(&mm->mm_users);
         atomic_inc(&mm->mm_count);
         current->active_mm = mm;
         cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig

index dd235ecc9d6c5946d6610bb7adef42cb57476210..c93e73d54dd1e7c5cebf41b13bc31b37a600391d 100644 (file)
--- a/arch/arm/mach-pxa/Kconfig
+++ b/arch/arm/mach-pxa/Kconfig
@@ -540,6 +540,7 @@ config MACH_ICONTROL
  config ARCH_PXA_ESERIES
         bool "PXA based Toshiba e-series PDAs"
         select PXA25x
+       select FB_W100
  
  config MACH_E330
         bool "Toshiba e330"
diff --git a/arch/arm/mach-pxa/sleep.S b/arch/arm/mach-pxa/sleep.S

index 52c30b01a67139e88fdb71af7dd7371525167bfd..ae008110db4edd934716e7b0a07d26aab1d84813 100644 (file)
--- a/arch/arm/mach-pxa/sleep.S
+++ b/arch/arm/mach-pxa/sleep.S
@@ -353,8 +353,8 @@ resume_turn_on_mmu:
  
         @ Let us ensure we jump to resume_after_mmu only when the mcr above
         @ actually took effect.  They call it the "cpwait" operation.
-       mrc     p15, 0, r1, c2, c0, 0           @ queue a dependency on CP15
-       sub     pc, r2, r1, lsr #32             @ jump to virtual addr
+       mrc     p15, 0, r0, c2, c0, 0           @ queue a dependency on CP15
+       sub     pc, r2, r0, lsr #32             @ jump to virtual addr
         nop
         nop
         nop
diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c

index 6e77c042d8e9417ad5b9141c6eab37767693e3ed..e0b0e7a4ec68a3c577959e9116b9c7fa5d0be09d 100644 (file)
--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -13,13 +13,9 @@
   */
  
  #include <linux/init.h>
+#include <linux/highmem.h>
  #include <asm/cacheflush.h>
-#include <asm/kmap_types.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
  #include <plat/cache-feroceon-l2.h>
-#include "mm.h"
  
  /*
   * Low-level cache maintenance operations.
@@ -39,27 +35,30 @@
   * between which we don't want to be preempted.
   */
  
-static inline unsigned long l2_start_va(unsigned long paddr)
+static inline unsigned long l2_get_va(unsigned long paddr)
  {
  #ifdef CONFIG_HIGHMEM
         /*
-        * Let's do our own fixmap stuff in a minimal way here.
          * Because range ops can't be done on physical addresses,
          * we simply install a virtual mapping for it only for the
          * TLB lookup to occur, hence no need to flush the untouched
-        * memory mapping.  This is protected with the disabling of
-        * interrupts by the caller.
+        * memory mapping afterwards (note: a cache flush may happen
+        * in some circumstances depending on the path taken in kunmap_atomic).
          */
-       unsigned long idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id();
-       unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       set_pte_ext(TOP_PTE(vaddr), pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL), 0);
-       local_flush_tlb_kernel_page(vaddr);
-       return vaddr + (paddr & ~PAGE_MASK);
+       void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT);
+       return (unsigned long)vaddr + (paddr & ~PAGE_MASK);
  #else
         return __phys_to_virt(paddr);
  #endif
  }
  
+static inline void l2_put_va(unsigned long vaddr)
+{
+#ifdef CONFIG_HIGHMEM
+       kunmap_atomic((void *)vaddr);
+#endif
+}
+
  static inline void l2_clean_pa(unsigned long addr)
  {
         __asm__("mcr p15, 1, %0, c15, c9, 3" : : "r" (addr));
@@ -76,13 +75,14 @@ static inline void l2_clean_pa_range(unsigned long start, unsigned long end)
          */
         BUG_ON((start ^ end) >> PAGE_SHIFT);
  
-       raw_local_irq_save(flags);
-       va_start = l2_start_va(start);
+       va_start = l2_get_va(start);
         va_end = va_start + (end - start);
+       raw_local_irq_save(flags);
         __asm__("mcr p15, 1, %0, c15, c9, 4\n\t"
                 "mcr p15, 1, %1, c15, c9, 5"
                 : : "r" (va_start), "r" (va_end));
         raw_local_irq_restore(flags);
+       l2_put_va(va_start);
  }
  
  static inline void l2_clean_inv_pa(unsigned long addr)
@@ -106,13 +106,14 @@ static inline void l2_inv_pa_range(unsigned long start, unsigned long end)
          */
         BUG_ON((start ^ end) >> PAGE_SHIFT);
  
-       raw_local_irq_save(flags);
-       va_start = l2_start_va(start);
+       va_start = l2_get_va(start);
         va_end = va_start + (end - start);
+       raw_local_irq_save(flags);
         __asm__("mcr p15, 1, %0, c15, c11, 4\n\t"
                 "mcr p15, 1, %1, c15, c11, 5"
                 : : "r" (va_start), "r" (va_end));
         raw_local_irq_restore(flags);
+       l2_put_va(va_start);
  }
  
  static inline void l2_inv_all(void)
diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c

index c3154928bccdf9750bef6da4e4fc29c2aad732f5..5a32020471e3bab2fc1e966a6e968c17a87f1d37 100644 (file)
--- a/arch/arm/mm/cache-xsc3l2.c
+++ b/arch/arm/mm/cache-xsc3l2.c
@@ -17,14 +17,10 @@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
   */
  #include <linux/init.h>
+#include <linux/highmem.h>
  #include <asm/system.h>
  #include <asm/cputype.h>
  #include <asm/cacheflush.h>
-#include <asm/kmap_types.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include "mm.h"
  
  #define CR_L2  (1 << 26)
  
@@ -71,16 +67,15 @@ static inline void xsc3_l2_inv_all(void)
         dsb();
  }
  
+static inline void l2_unmap_va(unsigned long va)
+{
  #ifdef CONFIG_HIGHMEM
-#define l2_map_save_flags(x)           raw_local_save_flags(x)
-#define l2_map_restore_flags(x)                raw_local_irq_restore(x)
-#else
-#define l2_map_save_flags(x)           ((x) = 0)
-#define l2_map_restore_flags(x)                ((void)(x))
+       if (va != -1)
+               kunmap_atomic((void *)va);
  #endif
+}
  
-static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va,
-                                     unsigned long flags)
+static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va)
  {
  #ifdef CONFIG_HIGHMEM
         unsigned long va = prev_va & PAGE_MASK;
@@ -89,17 +84,10 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va,
                 /*
                  * Switching to a new page.  Because cache ops are
                  * using virtual addresses only, we must put a mapping
-                * in place for it.  We also enable interrupts for a
-                * short while and disable them again to protect this
-                * mapping.
+                * in place for it.
                  */
-               unsigned long idx;
-               raw_local_irq_restore(flags);
-               idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id();
-               va = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-               raw_local_irq_restore(flags | PSR_I_BIT);
-               set_pte_ext(TOP_PTE(va), pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL), 0);
-               local_flush_tlb_kernel_page(va);
+               l2_unmap_va(prev_va);
+               va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT);
         }
         return va + (pa_offset >> (32 - PAGE_SHIFT));
  #else
@@ -109,7 +97,7 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va,
  
  static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
  {
-       unsigned long vaddr, flags;
+       unsigned long vaddr;
  
         if (start == 0 && end == -1ul) {
                 xsc3_l2_inv_all();
@@ -117,13 +105,12 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
         }
  
         vaddr = -1;  /* to force the first mapping */
-       l2_map_save_flags(flags);
  
         /*
          * Clean and invalidate partial first cache line.
          */
         if (start & (CACHE_LINE_SIZE - 1)) {
-               vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr, flags);
+               vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr);
                 xsc3_l2_clean_mva(vaddr);
                 xsc3_l2_inv_mva(vaddr);
                 start = (start | (CACHE_LINE_SIZE - 1)) + 1;
@@ -133,7 +120,7 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
          * Invalidate all full cache lines between 'start' and 'end'.
          */
         while (start < (end & ~(CACHE_LINE_SIZE - 1))) {
-               vaddr = l2_map_va(start, vaddr, flags);
+               vaddr = l2_map_va(start, vaddr);
                 xsc3_l2_inv_mva(vaddr);
                 start += CACHE_LINE_SIZE;
         }
@@ -142,31 +129,30 @@ static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
          * Clean and invalidate partial last cache line.
          */
         if (start < end) {
-               vaddr = l2_map_va(start, vaddr, flags);
+               vaddr = l2_map_va(start, vaddr);
                 xsc3_l2_clean_mva(vaddr);
                 xsc3_l2_inv_mva(vaddr);
         }
  
-       l2_map_restore_flags(flags);
+       l2_unmap_va(vaddr);
  
         dsb();
  }
  
  static void xsc3_l2_clean_range(unsigned long start, unsigned long end)
  {
-       unsigned long vaddr, flags;
+       unsigned long vaddr;
  
         vaddr = -1;  /* to force the first mapping */
-       l2_map_save_flags(flags);
  
         start &= ~(CACHE_LINE_SIZE - 1);
         while (start < end) {
-               vaddr = l2_map_va(start, vaddr, flags);
+               vaddr = l2_map_va(start, vaddr);
                 xsc3_l2_clean_mva(vaddr);
                 start += CACHE_LINE_SIZE;
         }
  
-       l2_map_restore_flags(flags);
+       l2_unmap_va(vaddr);
  
         dsb();
  }
@@ -193,7 +179,7 @@ static inline void xsc3_l2_flush_all(void)
  
  static void xsc3_l2_flush_range(unsigned long start, unsigned long end)
  {
-       unsigned long vaddr, flags;
+       unsigned long vaddr;
  
         if (start == 0 && end == -1ul) {
                 xsc3_l2_flush_all();
@@ -201,17 +187,16 @@ static void xsc3_l2_flush_range(unsigned long start, unsigned long end)
         }
  
         vaddr = -1;  /* to force the first mapping */
-       l2_map_save_flags(flags);
  
         start &= ~(CACHE_LINE_SIZE - 1);
         while (start < end) {
-               vaddr = l2_map_va(start, vaddr, flags);
+               vaddr = l2_map_va(start, vaddr);
                 xsc3_l2_clean_mva(vaddr);
                 xsc3_l2_inv_mva(vaddr);
                 start += CACHE_LINE_SIZE;
         }
  
-       l2_map_restore_flags(flags);
+       l2_unmap_va(vaddr);
  
         dsb();
  }
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c

index ac6a36142fcd5a28084b3c669fac9e800fd65497..809f1bf9fa29ef0fda15cf7563cffeed7a5bb98d 100644 (file)
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -17,6 +17,7 @@
  #include <linux/init.h>
  #include <linux/device.h>
  #include <linux/dma-mapping.h>
+#include <linux/highmem.h>
  
  #include <asm/memory.h>
  #include <asm/highmem.h>
@@ -480,10 +481,10 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset,
                                 op(vaddr, len, dir);
                                 kunmap_high(page);
                         } else if (cache_is_vipt()) {
-                               pte_t saved_pte;
-                               vaddr = kmap_high_l1_vipt(page, &saved_pte);
+                               /* unmapped pages might still be cached */
+                               vaddr = kmap_atomic(page);
                                 op(vaddr + offset, len, dir);
-                               kunmap_high_l1_vipt(page, saved_pte);
+                               kunmap_atomic(vaddr);
                         }
                 } else {
                         vaddr = page_address(page) + offset;
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c

index 391ffae750986404df8658d53e93fdc69b970ba4..c29f2839f1d2b72c8aa99e17db44e120a42bc7ee 100644 (file)
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -10,6 +10,7 @@
  #include <linux/module.h>
  #include <linux/mm.h>
  #include <linux/pagemap.h>
+#include <linux/highmem.h>
  
  #include <asm/cacheflush.h>
  #include <asm/cachetype.h>
@@ -180,10 +181,10 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
                         __cpuc_flush_dcache_area(addr, PAGE_SIZE);
                         kunmap_high(page);
                 } else if (cache_is_vipt()) {
-                       pte_t saved_pte;
-                       addr = kmap_high_l1_vipt(page, &saved_pte);
+                       /* unmapped pages might still be cached */
+                       addr = kmap_atomic(page);
                         __cpuc_flush_dcache_area(addr, PAGE_SIZE);
-                       kunmap_high_l1_vipt(page, saved_pte);
+                       kunmap_atomic(addr);
                 }
         }
  
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c

index c435fd9e1da95c9fdc9d7fab83b3a42caef1b905..807c0573abbe82533a884f87ea7ea243051a228d 100644 (file)
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -140,90 +140,3 @@ struct page *kmap_atomic_to_page(const void *ptr)
         pte = TOP_PTE(vaddr);
         return pte_page(*pte);
  }
-
-#ifdef CONFIG_CPU_CACHE_VIPT
-
-#include <linux/percpu.h>
-
-/*
- * The VIVT cache of a highmem page is always flushed before the page
- * is unmapped. Hence unmapped highmem pages need no cache maintenance
- * in that case.
- *
- * However unmapped pages may still be cached with a VIPT cache, and
- * it is not possible to perform cache maintenance on them using physical
- * addresses unfortunately.  So we have no choice but to set up a temporary
- * virtual mapping for that purpose.
- *
- * Yet this VIPT cache maintenance may be triggered from DMA support
- * functions which are possibly called from interrupt context. As we don't
- * want to keep interrupt disabled all the time when such maintenance is
- * taking place, we therefore allow for some reentrancy by preserving and
- * restoring the previous fixmap entry before the interrupted context is
- * resumed.  If the reentrancy depth is 0 then there is no need to restore
- * the previous fixmap, and leaving the current one in place allow it to
- * be reused the next time without a TLB flush (common with DMA).
- */
-
-static DEFINE_PER_CPU(int, kmap_high_l1_vipt_depth);
-
-void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte)
-{
-       unsigned int idx, cpu;
-       int *depth;
-       unsigned long vaddr, flags;
-       pte_t pte, *ptep;
-
-       if (!in_interrupt())
-               preempt_disable();
-
-       cpu = smp_processor_id();
-       depth = &per_cpu(kmap_high_l1_vipt_depth, cpu);
-
-       idx = KM_L1_CACHE + KM_TYPE_NR * cpu;
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       ptep = TOP_PTE(vaddr);
-       pte = mk_pte(page, kmap_prot);
-
-       raw_local_irq_save(flags);
-       (*depth)++;
-       if (pte_val(*ptep) == pte_val(pte)) {
-               *saved_pte = pte;
-       } else {
-               *saved_pte = *ptep;
-               set_pte_ext(ptep, pte, 0);
-               local_flush_tlb_kernel_page(vaddr);
-       }
-       raw_local_irq_restore(flags);
-
-       return (void *)vaddr;
-}
-
-void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte)
-{
-       unsigned int idx, cpu = smp_processor_id();
-       int *depth = &per_cpu(kmap_high_l1_vipt_depth, cpu);
-       unsigned long vaddr, flags;
-       pte_t pte, *ptep;
-
-       idx = KM_L1_CACHE + KM_TYPE_NR * cpu;
-       vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-       ptep = TOP_PTE(vaddr);
-       pte = mk_pte(page, kmap_prot);
-
-       BUG_ON(pte_val(*ptep) != pte_val(pte));
-       BUG_ON(*depth <= 0);
-
-       raw_local_irq_save(flags);
-       (*depth)--;
-       if (*depth != 0 && pte_val(pte) != pte_val(saved_pte)) {
-               set_pte_ext(ptep, saved_pte, 0);
-               local_flush_tlb_kernel_page(vaddr);
-       }
-       raw_local_irq_restore(flags);
-
-       if (!in_interrupt())
-               preempt_enable();
-}
-
-#endif  /* CONFIG_CPU_CACHE_VIPT */
diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c

index 5c7c6fc07565bd468a9f78b327073f0212795a42..183e0d226669193700c72f405f1e4e43303612a4 100644 (file)
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -1047,6 +1047,6 @@ init_hw_perf_events(void)
  
         return 0;
  }
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
  
  #endif /* defined(CONFIG_CPU_MIPS32)... */
diff --git a/arch/mn10300/kernel/irq.c b/arch/mn10300/kernel/irq.c

index c2e44597c22b1fc5b69ebf13cca307d43c2d1a52..ac11754ecec544c965c196f9dc857b2b398b22e0 100644 (file)
--- a/arch/mn10300/kernel/irq.c
+++ b/arch/mn10300/kernel/irq.c
@@ -459,7 +459,7 @@ void migrate_irqs(void)
                         tmp = CROSS_GxICR(irq, new);
  
                         x &= GxICR_LEVEL | GxICR_ENABLE;
-                       if (GxICR(irq) & GxICR_REQUEST) {
+                       if (GxICR(irq) & GxICR_REQUEST)
                                 x |= GxICR_REQUEST | GxICR_DETECT;
                         CROSS_GxICR(irq, new) = x;
                         tmp = CROSS_GxICR(irq, new);
diff --git a/arch/powerpc/kernel/e500-pmu.c b/arch/powerpc/kernel/e500-pmu.c

index 7c07de0d89436ea4bac7350de09ab73d5f89b2e4..b150b510510f167d2782f645999303dfa297e2ad 100644 (file)
--- a/arch/powerpc/kernel/e500-pmu.c
+++ b/arch/powerpc/kernel/e500-pmu.c
@@ -126,4 +126,4 @@ static int init_e500_pmu(void)
         return register_fsl_emb_pmu(&e500_pmu);
  }
  
-arch_initcall(init_e500_pmu);
+early_initcall(init_e500_pmu);
diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c

index 09d72028f317755428865f5105ee6b76b829ad7a..2cc5e0301d0b532a2291e400cb7bdc0a87aa89cd 100644 (file)
--- a/arch/powerpc/kernel/mpc7450-pmu.c
+++ b/arch/powerpc/kernel/mpc7450-pmu.c
@@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void)
         return register_power_pmu(&mpc7450_pmu);
  }
  
-arch_initcall(init_mpc7450_pmu);
+early_initcall(init_mpc7450_pmu);
diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c

index 3129c855933c2a3857b0c4b3321b259b851279b8..5674807057899cd09dc366d128b1769afc6cd3e9 100644 (file)
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -1379,7 +1379,7 @@ int register_power_pmu(struct power_pmu *pmu)
                 freeze_events_kernel = MMCR0_FCHV;
  #endif /* CONFIG_PPC64 */
  
-       perf_pmu_register(&power_pmu);
+       perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
         perf_cpu_notifier(power_pmu_notifier);
  
         return 0;
diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c

index 7ecca59ddf77fe20bd46b470d9392cdd16fd5ba9..4dcf5f831e9d01f8694443ac7d56146b6f7a9777 100644 (file)
--- a/arch/powerpc/kernel/perf_event_fsl_emb.c
+++ b/arch/powerpc/kernel/perf_event_fsl_emb.c
@@ -681,7 +681,7 @@ int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu)
         pr_info("%s performance monitor hardware support registered\n",
                 pmu->name);
  
-       perf_pmu_register(&fsl_emb_pmu);
+       perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW);
  
         return 0;
  }
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c

index 2a361cdda635881ba4e96bf33f0a3dd6b8c273ea..ead8b3c2649ebba98c00423727e7dae54c6f5a7f 100644 (file)
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -613,4 +613,4 @@ static int init_power4_pmu(void)
         return register_power_pmu(&power4_pmu);
  }
  
-arch_initcall(init_power4_pmu);
+early_initcall(init_power4_pmu);
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c

index 199de527d411446918651bd374bc5e9586ace46f..eca0ac595cb6c5b790ea4fd37d77ebb3a8ee474b 100644 (file)
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -682,4 +682,4 @@ static int init_power5p_pmu(void)
         return register_power_pmu(&power5p_pmu);
  }
  
-arch_initcall(init_power5p_pmu);
+early_initcall(init_power5p_pmu);
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c

index 98b6a729a9dd127cc2c88e799b58bdbea2fa313c..d5ff0f64a5e645e01ddc6f9b56c91c60e14b204a 100644 (file)
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -621,4 +621,4 @@ static int init_power5_pmu(void)
         return register_power_pmu(&power5_pmu);
  }
  
-arch_initcall(init_power5_pmu);
+early_initcall(init_power5_pmu);
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c

index 84a607bda8fbc129562943d7d2ce7fe0ee0f11bd..31603927e376e7e8854bf6a0faf86bc2bc9e56f7 100644 (file)
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -544,4 +544,4 @@ static int init_power6_pmu(void)
         return register_power_pmu(&power6_pmu);
  }
  
-arch_initcall(init_power6_pmu);
+early_initcall(init_power6_pmu);
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c

index 852f7b7f6b4045801df807b997c7b110426ce7a6..593740fcb799d6fc9c29faca49425ad97b15b19a 100644 (file)
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -369,4 +369,4 @@ static int init_power7_pmu(void)
         return register_power_pmu(&power7_pmu);
  }
  
-arch_initcall(init_power7_pmu);
+early_initcall(init_power7_pmu);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c

index 3fee685de4df49e01a3a85ff069f3d409354c924..9a6e093858fe13fd30d2a79adb82e38e8c424b84 100644 (file)
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -494,4 +494,4 @@ static int init_ppc970_pmu(void)
         return register_power_pmu(&ppc970_pmu);
  }
  
-arch_initcall(init_ppc970_pmu);
+early_initcall(init_ppc970_pmu);
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig

index e0b98e71ff4797e5807f9e5d99f674aeb90456b8..6c6d7b339aae4f49fac84b7b3edc069f4ccf3adf 100644 (file)
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -99,6 +99,7 @@ config S390
         select HAVE_KERNEL_LZMA
         select HAVE_KERNEL_LZO
         select HAVE_GET_USER_PAGES_FAST
+       select HAVE_ARCH_MUTEX_CPU_RELAX
         select ARCH_INLINE_SPIN_TRYLOCK
         select ARCH_INLINE_SPIN_TRYLOCK_BH
         select ARCH_INLINE_SPIN_LOCK
diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h

index 458c1f7fbc1808d48982aa0c5fe89bfe3df2098c..688271f5f2e452b9951599550f33ed0ddcfe0a7c 100644 (file)
--- a/arch/s390/include/asm/mutex.h
+++ b/arch/s390/include/asm/mutex.h
@@ -7,3 +7,5 @@
   */
  
  #include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax() barrier()
diff --git a/arch/sh/kernel/cpu/sh4/perf_event.c b/arch/sh/kernel/cpu/sh4/perf_event.c

index dbf3b4bb71febb0ba38e9ec2f6c27731ea0970be..748955df018d801db05137f1831cf18f01938b55 100644 (file)
--- a/arch/sh/kernel/cpu/sh4/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4/perf_event.c
@@ -250,4 +250,4 @@ static int __init sh7750_pmu_init(void)
  
         return register_sh_pmu(&sh7750_pmu);
  }
-arch_initcall(sh7750_pmu_init);
+early_initcall(sh7750_pmu_init);
diff --git a/arch/sh/kernel/cpu/sh4a/perf_event.c b/arch/sh/kernel/cpu/sh4a/perf_event.c

index 580276525731531643c9165d5c45ce28f5eade20..17e6bebfede067c26379efde6b8e0a69fc6565e9 100644 (file)
--- a/arch/sh/kernel/cpu/sh4a/perf_event.c
+++ b/arch/sh/kernel/cpu/sh4a/perf_event.c
@@ -284,4 +284,4 @@ static int __init sh4a_pmu_init(void)
  
         return register_sh_pmu(&sh4a_pmu);
  }
-arch_initcall(sh4a_pmu_init);
+early_initcall(sh4a_pmu_init);
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c

index 5a4b33435650c8ea108668d8e7e30786a20bd335..2ee21a47b5af6e1aac2889712c69848b7194f317 100644 (file)
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -389,7 +389,7 @@ int __cpuinit register_sh_pmu(struct sh_pmu *_pmu)
  
         WARN_ON(_pmu->num_events > MAX_HWEVENTS);
  
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
         perf_cpu_notifier(sh_pmu_notifier);
         return 0;
  }
diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h

index 6e8bfa1786dab1f45d3dff5a1dcacc31a08d4844..4d3dbe3703e9001f53f65f64d38629bda6dedb63 100644 (file)
--- a/arch/sparc/include/asm/perf_event.h
+++ b/arch/sparc/include/asm/perf_event.h
@@ -4,8 +4,6 @@
  #ifdef CONFIG_PERF_EVENTS
  #include <asm/ptrace.h>
  
-extern void init_hw_perf_events(void);
-
  #define perf_arch_fetch_caller_regs(regs, ip)          \
  do {                                                   \
         unsigned long _pstate, _asi, _pil, _i7, _fp;    \
@@ -26,8 +24,6 @@ do {                                                  \
         (regs)->u_regs[UREG_I6] = _fp;                  \
         (regs)->u_regs[UREG_I7] = _i7;                  \
  } while (0)
-#else
-static inline void init_hw_perf_events(void)   { }
  #endif
  
  #endif
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c

index a4bd7ba74c89d9f25221f29e616f49f26517aad5..300f810142f57e82cce5984a56d9e5e920aebb7d 100644 (file)
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -270,8 +270,6 @@ int __init nmi_init(void)
                         atomic_set(&nmi_active, -1);
                 }
         }
-       if (!err)
-               init_hw_perf_events();
  
         return err;
  }
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c

index 0d6deb55a2ae7e4189b5ab60aec81cd8df28adb6..760578687e7ca86cb0bb63cc7dc68721d51bee90 100644 (file)
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1307,20 +1307,23 @@ static bool __init supported_pmu(void)
         return false;
  }
  
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
  {
         pr_info("Performance events: ");
  
         if (!supported_pmu()) {
                 pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
-               return;
+               return 0;
         }
  
         pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
  
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
         register_die_notifier(&perf_event_nmi_notifier);
+
+       return 0;
  }
+early_initcall(init_hw_perf_events);
  
  void perf_callchain_kernel(struct perf_callchain_entry *entry,
                            struct pt_regs *regs)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig

index e330da21b84f0636751b7e18e921ecff9cc31f55..b6fccb07123e206e23a4b73534d4891f66753b90 100644 (file)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -377,6 +377,18 @@ config X86_ELAN
  
           If unsure, choose "PC-compatible" instead.
  
+config X86_INTEL_CE
+       bool "CE4100 TV platform"
+       depends on PCI
+       depends on PCI_GODIRECT
+       depends on X86_32
+       depends on X86_EXTENDED_PLATFORM
+       select X86_REBOOTFIXUPS
+       ---help---
+         Select for the Intel CE media processor (CE4100) SOC.
+         This option compiles in support for the CE4100 SOC for set-top
+         boxes and media devices.
+
  config X86_MRST
         bool "Moorestown MID platform"
         depends on PCI
@@ -385,6 +397,10 @@ config X86_MRST
         depends on X86_EXTENDED_PLATFORM
         depends on X86_IO_APIC
         select APB_TIMER
+       select I2C
+       select SPI
+       select INTEL_SCU_IPC
+       select X86_PLATFORM_DEVICES
         ---help---
           Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
           Internet Device(MID) platform. Moorestown consists of two chips:
@@ -466,6 +482,19 @@ config X86_ES7000
           Support for Unisys ES7000 systems.  Say 'Y' here if this kernel is
           supposed to run on an IA32-based Unisys ES7000 system.
  
+config X86_32_IRIS
+       tristate "Eurobraille/Iris poweroff module"
+       depends on X86_32
+       ---help---
+         The Iris machines from EuroBraille do not have APM or ACPI support
+         to shut themselves down properly.  A special I/O sequence is
+         needed to do so, which is what this module does at
+         kernel shutdown.
+
+         This is only for Iris machines from EuroBraille.
+
+         If unused, say N.
+
  config SCHED_OMIT_FRAME_POINTER
         def_bool y
         prompt "Single-depth WCHAN output"
@@ -1141,16 +1170,16 @@ config NUMA
  comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
         depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
  
-config K8_NUMA
+config AMD_NUMA
         def_bool y
         prompt "Old style AMD Opteron NUMA detection"
         depends on X86_64 && NUMA && PCI
         ---help---
-         Enable K8 NUMA node topology detection.  You should say Y here if
-         you have a multi processor AMD K8 system. This uses an old
-         method to read the NUMA configuration directly from the builtin
-         Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-         instead, which also takes priority if both are compiled in.
+         Enable AMD NUMA node topology detection.  You should say Y here if
+         you have a multi processor AMD system. This uses an old method to
+         read the NUMA configuration directly from the builtin Northbridge
+         of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+         which also takes priority if both are compiled in.
  
  config X86_64_ACPI_NUMA
         def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug

index b59ee765414ea3891d6d4914485ba366fcee5663..45143bbcfe5e487d53e33bddaa7193ab68a5275e 100644 (file)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
           feature as well as for the change_page_attr() infrastructure.
           If in doubt, say "N"
  
+config DEBUG_SET_MODULE_RONX
+       bool "Set loadable kernel module data as NX and text as RO"
+       depends on MODULES
+       ---help---
+         This option helps catch unintended modifications to loadable
+         kernel modules' text and read-only data. It also prevents execution
+         of module data. Such protection may interfere with run-time code
+         patching and dynamic kernel tracing - and it might also protect
+         against certain classes of kernel exploits.
+         If in doubt, say "N".
+
  config DEBUG_NX_TEST
         tristate "Testcase for the NX non-executable stack feature"
         depends on DEBUG_KERNEL && m
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S

index 52f85a196fa033df961d20349ce6b7437409e843..35af09d13dc13b5d41ec7e19e066c7b5b676f30a 100644 (file)
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
         hlt
         jmp     1b
  
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
  
         /*
          * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h

index 76561d20ea2f27f0edfd0eee6d043b98c6aa6e90..13009d1af99a33e2bbee39fbe80a194fac85ece4 100644 (file)
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
  extern void alternatives_smp_module_del(struct module *mod);
  extern void alternatives_smp_switch(int smp);
  extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
  #else
  static inline void alternatives_smp_module_add(struct module *mod, char *name,
                                                void *locks, void *locks_end,
@@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
   * On the local CPU you need to be protected again NMI or MCE handlers seeing an
   * inconsistent instruction while you patch.
   */
+struct text_poke_param {
+       void *addr;
+       const void *opcode;
+       size_t len;
+};
+
  extern void *text_poke(void *addr, const void *opcode, size_t len);
  extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
  
  #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
  #define IDEAL_NOP_SIZE_5 5
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h

index c8517f81b21e73f9f2c428a26f2fb8995f73011f..6aee50d655d12f6792e495c5d5d004aaef9377e2 100644 (file)
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,53 @@
  
  #include <linux/pci.h>
  
-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
  struct bootnode;
  
-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);
  
-struct k8_northbridge_info {
+struct amd_northbridge {
+       struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
         u16 num;
-       u8 gart_supported;
-       struct pci_dev **nb_misc;
+       u64 flags;
+       struct amd_northbridge *nb;
  };
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART                    0x1
+#define AMD_NB_L3_INDEX_DISABLE                0x2
  
  #ifdef CONFIG_AMD_NB
  
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
  {
-       return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+       return amd_northbridges.num;
  }
  
-#else
+static inline int amd_nb_has_feature(int feature)
+{
+       return ((amd_northbridges.flags & feature) == feature);
+}
  
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
  {
-       return NULL;
+       return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
  }
+
+#else
+
+#define amd_nb_num(x)          0
+#define amd_nb_has_feature(x)  false
+#define node_to_amd_nb(x)      NULL
+
  #endif
  
  
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h

index f6ce0bda3b98a74906cb1c8297f4ba150430699d..cf12007796db95f1a48635a55016ff01a0f811c2 100644 (file)
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void);
  extern void setup_secondary_APIC_clock(void);
  extern int APIC_init_uniprocessor(void);
  extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);
  
  /*
   * On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h

index a859ca461fb0432585f952e08337610b1165a204..47a30ff8e51782a31f146c78b458ea6a89bcacc3 100644 (file)
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@
  
  #ifdef CONFIG_X86_32
  # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
  #else
  # define MAX_IO_APICS 128
  # define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h

index 8e6218550e774b56fd30f3171bc163bfb511b947..c8bfe63a06de289057321af73c114e3521d9088a 100644 (file)
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -124,6 +124,7 @@ enum {
         X86_SUBARCH_LGUEST,
         X86_SUBARCH_XEN,
         X86_SUBARCH_MRST,
+       X86_SUBARCH_CE4100,
         X86_NR_SUBARCHS,
  };
  
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h

index 9479a037419fe1358a96cece0d877a269c71e365..0141b234406fb01f8418320ea49c997fdbd14cbb 100644 (file)
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,6 +117,10 @@ enum fixed_addresses {
         FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
         FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
         __end_of_permanent_fixed_addresses,
+
+#ifdef CONFIG_X86_MRST
+       FIX_LNW_VRTC,
+#endif
         /*
          * 256 temporary boot-time mappings, used by early_ioremap(),
          * before ioremap() is functional.
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h

index 4aa2bb3b242ab76733e0f7e5ba95454471297c1a..ef328901c80240f4a1471d3e4bdd795daffc6621 100644 (file)
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
         int err;
  
         /* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+       asm volatile("1:  fxrstorq %[fx]\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    _ASM_EXTABLE(1b, 3b)
+                    : [err] "=r" (err)
+                    : [fx] "m" (*fx), "0" (0));
+#else
         asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
                      "2:\n"
                      ".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err)
                      : [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
         return err;
  }
  
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                 return -EFAULT;
  
         /* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+       asm volatile("1:  fxsaveq %[fx]\n\t"
+                    "2:\n"
+                    ".section .fixup,\"ax\"\n"
+                    "3:  movl $-1,%[err]\n"
+                    "    jmp  2b\n"
+                    ".previous\n"
+                    _ASM_EXTABLE(1b, 3b)
+                    : [err] "=r" (err), [fx] "=m" (*fx)
+                    : "0" (0));
+#else
         asm volatile("1:  rex64/fxsave (%[fx])\n\t"
                      "2:\n"
                      ".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                      _ASM_EXTABLE(1b, 3b)
                      : [err] "=r" (err), "=m" (*fx)
                      : [fx] "R" (fx), "0" (0));
+#endif
         if (unlikely(err) &&
             __clear_user(fx, sizeof(struct i387_fxsave_struct)))
                 err = -EFAULT;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h

index a6b28d017c2fb9aae9e5f5d2bae03db4dcadbd43..0c5ca4e30d7bda949a3470ad6c89f391d19ac623 100644 (file)
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
  extern int io_apic_set_pci_routing(struct device *dev, int irq,
                  struct io_apic_irq_attr *irq_attr);
  void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
  extern void ioapic_insert_resources(void);
  
  extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
  
-extern void probe_nr_irqs_gsi(void);
  extern int get_nr_irqs_gsi(void);
-
  extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
  
  struct mp_ioapic_gsi{
         u32 gsi_base;
@@ -189,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void);
  #define io_apic_assign_pci_irqs 0
  #define setup_ioapic_ids_from_mpc x86_init_noop
  static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void)  { }
+static inline void ioapic_and_gsi_init(void) { }
  static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void)     { }
  #define gsi_top (NR_IRQS_LEGACY)
  static inline int mp_find_ioapic(u32 gsi) { return 0; }
  
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h

index 13b0ebaa512f77764e06956632b32755f8ca2bfa..ba870bb6dd8ef30ab81a317a8eb43dcb83066630 100644 (file)
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -15,10 +15,6 @@ static inline int irq_canonicalize(int irq)
         return ((irq == 2) ? 9 : irq);
  }
  
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
  #ifdef CONFIG_X86_32
  extern void irq_ctx_init(int cpu);
  #else
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h

index 5bdfca86581beb3b45c60fd1f8d900a5daa68bf9..f23eb2528464f4a51ad6d14d70db0f7c33d92e0a 100644 (file)
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long);
  extern int __must_check __die(const char *, struct pt_regs *, long);
  extern void show_registers(struct pt_regs *regs);
  extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-                      unsigned long *sp, unsigned long bp);
+                      unsigned long *sp);
  extern void __show_regs(struct pt_regs *regs, int all);
  extern void show_regs(struct pt_regs *regs);
  extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h

index ef51b501e22a6e53bf4ae7e2d9e2566760f72ee1..24215072d0e1e5894d4643634bfe1ef9786eef55 100644 (file)
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
  
  #ifdef CONFIG_MICROCODE_AMD
  extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+       memcpy(to, from, n);
+}
+
  #else
  static inline struct microcode_ops * __init init_amd_microcode(void)
  {
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h

index c82868e9f905f04779778542298ce5560ae2e865..0c90dd9f05053c83591df6e04ec5d7979fee779f 100644 (file)
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
  
  #include <asm/mpspec_def.h>
  #include <asm/x86_init.h>
+#include <asm/apicdef.h>
  
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
  extern int pic_mode;
  
  #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
                                  int active_high_low);
  #endif /* CONFIG_ACPI */
  
-#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE      BITS_TO_LONGS(MAX_LOCAL_APIC)
  
  struct physid_mask {
         unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
         test_and_set_bit(physid, (map).mask)
  
  #define physids_and(dst, src1, src2)                                   \
-       bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+       bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
  
  #define physids_or(dst, src1, src2)                                    \
-       bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+       bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
  
  #define physids_clear(map)                                     \
-       bitmap_zero((map).mask, MAX_APICS)
+       bitmap_zero((map).mask, MAX_LOCAL_APIC)
  
  #define physids_complement(dst, src)                           \
-       bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+       bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
  
  #define physids_empty(map)                                     \
-       bitmap_empty((map).mask, MAX_APICS)
+       bitmap_empty((map).mask, MAX_LOCAL_APIC)
  
  #define physids_equal(map1, map2)                              \
-       bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+       bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
  
  #define physids_weight(map)                                    \
-       bitmap_weight((map).mask, MAX_APICS)
+       bitmap_weight((map).mask, MAX_LOCAL_APIC)
  
  #define physids_shift_right(d, s, n)                           \
-       bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+       bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
  
  #define physids_shift_left(d, s, n)                            \
-       bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+       bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
  
  static inline unsigned long physids_coerce(physid_mask_t *map)
  {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
         map->mask[0] = physids;
  }
  
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)                                  \
-       ({                                                              \
-               physid_mask_t __physid_mask = PHYSID_MASK_NONE;         \
-               physid_set(physid, __physid_mask);                      \
-               __physid_mask;                                          \
-       })
-
  static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
  {
         physids_clear(*map);
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h

index 4a7f96d7c188edd92387cdec4a3be36e23028f35..c0a955a9a08784f662a071d976ef57bc0637c8c6 100644 (file)
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
  
  #ifdef CONFIG_X86_32
  # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
  #endif
  
  /* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h

new file mode 100644 (file)

index 0000000..73668ab
--- /dev/null
+++ b/arch/x86/include/asm/mrst-vrtc.h
@@ -0,0 +1,9 @@
+#ifndef _MRST_VRTC_H
+#define _MRST_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern unsigned long vrtc_get_time(void);
+extern int vrtc_set_mmss(unsigned long nowtime);
+
+#endif /* _MRST_VRTC_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h

index 4a711a684b174435bd5aae838515a836101eb389..719f00b28ff5358caf87d736ed5b4100dafdce9e 100644 (file)
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -14,7 +14,9 @@
  #include <linux/sfi.h>
  
  extern int pci_mrst_init(void);
-int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];
  
  /*
   * Medfield is the follow-up of Moorestown, it combines two chip solution into
@@ -50,4 +52,14 @@ extern void mrst_early_console_init(void);
  
  extern struct console early_hsu_console;
  extern void hsu_early_console_init(void);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ       (1024)
+/*#define MRST_VRTC_PGOFFSET   (0xc00) */
+
+extern void mrst_rtc_init(void);
+
  #endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h

index 622c80b7dbee2aad4f2cbf20a121fbe3a10f2956..4d0dfa0d998e9f80ce244d86e1fd583513aaaaca 100644 (file)
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -123,6 +123,10 @@
  #define MSR_AMD64_IBSCTL               0xc001103a
  #define MSR_AMD64_IBSBRTARGET          0xc001103b
  
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL              0xc0010200
+#define MSR_F15H_PERF_CTR              0xc0010201
+
  /* Fam 10h MSRs */
  #define MSR_FAM10H_MMIO_CONF_BASE      0xc0010058
  #define FAM10H_MMIO_CONF_ENABLE                (1<<0)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h

index 932f0f86b4b76252e6e6434ab9c15d81c3b17004..c4021b9535102547712c92203e8afe5809e567fc 100644 (file)
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,15 @@
  #include <asm/irq.h>
  #include <asm/io.h>
  
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
  
  extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
  extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
  extern int reserve_perfctr_nmi(unsigned int);
  extern void release_perfctr_nmi(unsigned int);
  extern int reserve_evntsel_nmi(unsigned int);
  extern void release_evntsel_nmi(unsigned int);
  
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE       0
-#define NMI_IO_APIC    1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID    3
-
  struct ctl_table;
  extern int proc_nmi_enabled(struct ctl_table *, int ,
                         void __user *, size_t *, loff_t *);
@@ -47,33 +21,8 @@ extern int unknown_nmi_panic;
  
  void arch_trigger_all_cpu_backtrace(void);
  #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
-       if (nmi_watchdog == NMI_IO_APIC)
-               nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
-       /*
-        * actually it should be:
-        *      return (nmi_watchdog == NMI_LOCAL_APIC ||
-        *              nmi_watchdog == NMI_IO_APIC)
-        * but since they are power of two we could use a
-        * cheaper way --cvg
-        */
-       return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
  #endif
  
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
  void stop_nmi(void);
  void restart_nmi(void);
  
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h

index ef9975812c77f0702cc9fb2a1fb616b9cb1b8053..7709c12431b8075761ee36aadf22149fe4adfa4b 100644 (file)
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)
  
  static inline void halt(void)
  {
-       PVOP_VCALL0(pv_irq_ops.safe_halt);
+       PVOP_VCALL0(pv_irq_ops.halt);
  }
  
  static inline void wbinvd(void)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h

index ca0437c714b2aa3c94195a67dfe49e5eeb349b19..6761292296307163a0f5cbb4af1e27642d33da10 100644 (file)
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
  
  #define PCIBIOS_MIN_CARDBUS_IO 0x4000
  
+extern int pcibios_enabled;
  void pcibios_config_init(void);
  struct pci_bus *pcibios_scan_root(int bus);
  
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h

index 550e26b1dbb3593f324910f0197402966ae91299..d9d4dae305f6991efa446ec0ef4e7fad8054bdc3 100644 (file)
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
  #define IBS_OP_MAX_CNT_EXT     0x007FFFFFULL   /* not a register bit mask */
  
  #ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
  extern void perf_events_lapic_init(void);
  
  #define PERF_EVENT_INDEX_OFFSET                        0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
  }
  
  #else
-static inline void init_hw_perf_events(void)           { }
  static inline void perf_events_lapic_init(void)        { }
  #endif
  
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h

index a70cd216be5d729db1f364340f911d632819f18d..295e2ff18a6a80be6ec3425d91e73239c76f1198 100644 (file)
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS {
  };
  
  /*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- *   0-6: metric from P4_PEBS_METRIC enum
- *    7 : reserved
- *    8 : reserved
- * 9-11 : reserved
- *
   * Note we have UOP and PEBS bits reserved for now
   * just in case if we will need them once
   */
@@ -788,5 +780,60 @@ enum P4_PEBS_METRIC {
         P4_PEBS_METRIC__max
  };
  
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite a different architecture of
+ * performance registers compared with the "architectural"
+ * ones, and we have only 64 bits to keep the configuration
+ * of a performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ *    32 bits valuable, we pack them into a single 64 bit
+ *    configuration. Low 32 bits of such config correspond
+ *    to low 32 bits of CCCR register and high 32 bits
+ *    correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such a config field can
+ *    be found in the Intel SDM, but it should be noted that
+ *    we "borrow" some reserved bits for our own usage and
+ *    clear them or set them to a proper value when we do
+ *    a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ *    and should be either 0 or set to some predefined
+ *    values:
+ *
+ *    Low 32 bits
+ *    -----------
+ *      0-6: P4_PEBS_METRIC enum
+ *     7-11:                    reserved
+ *       12:                    reserved (Enable)
+ *    13-15:                    reserved (ESCR select)
+ *    16-17: Active Thread
+ *       18: Compare
+ *       19: Complement
+ *    20-23: Threshold
+ *       24: Edge
+ *       25:                    reserved (FORCE_OVF)
+ *       26:                    reserved (OVF_PMI_T0)
+ *       27:                    reserved (OVF_PMI_T1)
+ *    28-29:                    reserved
+ *       30:                    reserved (Cascade)
+ *       31:                    reserved (OVF)
+ *
+ *    High 32 bits
+ *    ------------
+ *        0:                    reserved (T1_USR)
+ *        1:                    reserved (T1_OS)
+ *        2:                    reserved (T0_USR)
+ *        3:                    reserved (T0_OS)
+ *        4: Tag Enable
+ *      5-8: Tag Value
+ *     9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ *    25-30: enum P4_EVENTS
+ *       31:                    reserved (HT thread)
+ */
+
  #endif /* PERF_EVENT_P4_H */
  
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h

index d6763b139a844243b9fbb8dc620e633fe7b5825a..db8aa19a08a22d35e608625b251bcee7fa5fe1ee 100644 (file)
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void);
  static inline void x86_mrst_early_setup(void) { }
  #endif
  
+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
  #ifndef _SETUP
  
  /*
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h

index 1def60114906bf1b1dfa98ef3bd8ac177f65010f..6c22bf353f26495b1fa71dc5a92cdaa05e5b1d8e 100644 (file)
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
                 setup_IO_APIC();
         else {
                 nr_ioapics = 0;
-               localise_nmi_watchdog();
         }
  #endif
  }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h

index 2b16a2ad23dc6b9647028c0808f8f45b094e74ac..52b5c7ed3608d9fc439c5ca69bf58ca1b8ebef88 100644 (file)
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
  #define _ASM_X86_STACKTRACE_H
  
  #include <linux/uaccess.h>
+#include <linux/ptrace.h>
  
  extern int kstack_depth_to_print;
  
@@ -46,7 +47,7 @@ struct stacktrace_ops {
  };
  
  void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+               unsigned long *stack,
                 const struct stacktrace_ops *ops, void *data);
  
  #ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
  #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
  #endif
  
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+       unsigned long bp;
+
+       if (regs)
+               return regs->bp;
+
+       if (task == current) {
+               /* Tracing ourselves: read the live bp register via get_bp() */
+               get_bp(bp);
+               return bp;
+       }
+
+       /* bp is the last reg pushed by switch_to */
+       return *(unsigned long *)task->thread.sp;
+}
+#else /* !CONFIG_FRAME_POINTER */
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+       return 0; /* no frame pointer to report */
+}
+#endif /* CONFIG_FRAME_POINTER */
+
  extern void
  show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl);
+                  unsigned long *stack, char *log_lvl);
  
  extern void
  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *sp, unsigned long bp, char *log_lvl);
+                  unsigned long *sp, char *log_lvl);
  
  extern unsigned int code_bytes;
  
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h

index 5469630b27f56d732b10036ba381df39f28ca52b..fa7b9176b76cb33820034403fd8f4a50dc49709c 100644 (file)
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
  unsigned long long native_sched_clock(void);
  extern int recalibrate_cpu_khz(void);
  
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
  extern int no_timer_check;
  
  /* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h

index 42d412fd8b02cdd369b5cc8db5a67aa5cc7a0770..ce1d54c8a433a6b866977beeb9c6681e0db64746 100644 (file)
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
   * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
   * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
   *
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
   * cpu's on the uvhub.
   *
   * TLB shootdown will use the first of the 8 descriptors of each set.
   * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
   */
  
+#define MAX_CPUS_PER_UVHUB             64
+#define MAX_CPUS_PER_SOCKET            32
+#define UV_ADP_SIZE                    64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS         32 /* hardware-provided max. */
  #define UV_ITEMS_PER_DESCRIPTOR                8
  /* the 'throttle' to prevent the hardware stay-busy bug */
  #define MAX_BAU_CONCURRENT             3
-#define UV_CPUS_PER_ACT_STATUS         32
  #define UV_ACT_STATUS_MASK             0x3
  #define UV_ACT_STATUS_SIZE             2
-#define UV_ADP_SIZE                    32
  #define UV_DISTRIBUTION_SIZE           256
  #define UV_SW_ACK_NPENDING             8
  #define UV_NET_ENDPOINT_INTD           0x38
@@ -100,7 +102,6 @@
   * number of destination side software ack resources
   */
  #define DEST_NUM_RESOURCES             8
-#define MAX_CPUS_PER_NODE              32
  /*
   * completion statuses for sending a TLB flush message
   */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile

index 1e994754d323f400b85c5d1d7350c6e52a702ff9..34244b2cd880cff373e744ec34193adb8287ab2a 100644 (file)
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -85,7 +85,6 @@ obj-$(CONFIG_DOUBLEFAULT)     += doublefault_32.o
  obj-$(CONFIG_KGDB)             += kgdb.o
  obj-$(CONFIG_VM86)             += vm86_32.o
  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
  
  obj-$(CONFIG_HPET_TIMER)       += hpet.o
  obj-$(CONFIG_APB_TIMER)                += apb_timer.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c

index 71232b941b6c9c6409fd14e9479f3d1625eeaeb0..17c8090fabd4703d324240328e2a15a55299bb70 100644 (file)
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
  {
         unsigned int ver = 0;
  
+       if (id >= (MAX_LOCAL_APIC-1)) {
+               printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+               return;
+       }
+
         if (!enabled) {
                 ++disabled_cpus;
                 return;
@@ -910,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void)
         acpi_register_lapic_address(acpi_lapic_addr);
  
         count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-                                     acpi_parse_sapic, MAX_APICS);
+                                     acpi_parse_sapic, MAX_LOCAL_APIC);
  
         if (!count) {
                 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-                                               acpi_parse_x2apic, MAX_APICS);
+                                       acpi_parse_x2apic, MAX_LOCAL_APIC);
                 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-                                             acpi_parse_lapic, MAX_APICS);
+                                       acpi_parse_lapic, MAX_LOCAL_APIC);
         }
         if (!count && !x2count) {
                 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c

index 5079f24c955a2d3b9b66532cd2c8cd45e5116d14..123608531c8f933b819a3fc7c135748137c8eb5b 100644 (file)
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
         mutex_unlock(&smp_alt);
  }
  
+bool skip_smp_alternatives;
  void alternatives_smp_switch(int smp)
  {
         struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
         printk("lockdep: fixing up alternatives.\n");
  #endif
  
-       if (noreplace_smp || smp_alt_once)
+       if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
                 return;
         BUG_ON(!smp && (num_online_cpus() > 1));
  
@@ -591,17 +592,21 @@ static atomic_t stop_machine_first;
  static int wrote_text;
  
  struct text_poke_params {
-       void *addr;
-       const void *opcode;
-       size_t len;
+       struct text_poke_param *params;
+       int nparams;
  };
  
  static int __kprobes stop_machine_text_poke(void *data)
  {
         struct text_poke_params *tpp = data;
+       struct text_poke_param *p;
+       int i;
  
         if (atomic_dec_and_test(&stop_machine_first)) {
-               text_poke(tpp->addr, tpp->opcode, tpp->len);
+               for (i = 0; i < tpp->nparams; i++) {
+                       p = &tpp->params[i];
+                       text_poke(p->addr, p->opcode, p->len);
+               }
                 smp_wmb();      /* Make sure other cpus see that this has run */
                 wrote_text = 1;
         } else {
@@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data)
                 smp_mb();       /* Load wrote_text before following execution */
         }
  
-       flush_icache_range((unsigned long)tpp->addr,
-                          (unsigned long)tpp->addr + tpp->len);
+       for (i = 0; i < tpp->nparams; i++) {
+               p = &tpp->params[i];
+               flush_icache_range((unsigned long)p->addr,
+                                  (unsigned long)p->addr + p->len);
+       }
+
         return 0;
  }
  
@@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data)
  void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
  {
         struct text_poke_params tpp;
+       struct text_poke_param p;
  
-       tpp.addr = addr;
-       tpp.opcode = opcode;
-       tpp.len = len;
+       p.addr = addr;
+       p.opcode = opcode;
+       p.len = len;
+       tpp.params = &p;
+       tpp.nparams = 1;
         atomic_set(&stop_machine_first, 1);
         wrote_text = 0;
         /* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
         return addr;
  }
  
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instructions using stop_machine() on SMP. stop_machine()
+ * is heavy, so aggregate text_poke requests and apply them in a single call.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex. Because the
+ * caller already holds online_cpus, __stop_machine() is used here.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+       struct text_poke_params tpp = {.params = params, .nparams = n};
+
+       atomic_set(&stop_machine_first, 1);
+       wrote_text = 0;
+       __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
  #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
  
  #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c

index 8f6463d8ed0de1ebfece6cb1138a15697f657197..affacb5e0065a1392713da260ecf8abfbab2c405 100644 (file)
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,95 +12,116 @@
  
  static u32 *flush_words;
  
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
         { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
         {}
  };
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
  
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
  
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+                                       struct pci_device_id *ids)
  {
         do {
                 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
                 if (!dev)
                         break;
-       } while (!pci_match_id(&k8_nb_ids[0], dev));
+       } while (!pci_match_id(ids, dev));
         return dev;
  }
  
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
  {
-       int i;
-       struct pci_dev *dev;
+       int i = 0;
+       struct amd_northbridge *nb;
+       struct pci_dev *misc;
  
-       if (k8_northbridges.num)
+       if (amd_nb_num())
                 return 0;
  
-       dev = NULL;
-       while ((dev = next_k8_northbridge(dev)) != NULL)
-               k8_northbridges.num++;
+       misc = NULL;
+       while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+               i++;
  
-       /* some CPU families (e.g. family 0x11) do not support GART */
-       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-           boot_cpu_data.x86 == 0x15)
-               k8_northbridges.gart_supported = 1;
+       if (i == 0)
+               return 0;
  
-       k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
-                                         sizeof(void *), GFP_KERNEL);
-       if (!k8_northbridges.nb_misc)
+       nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+       if (!nb)
                 return -ENOMEM;
  
-       if (!k8_northbridges.num) {
-               k8_northbridges.nb_misc[0] = NULL;
-               return 0;
-       }
+       amd_northbridges.nb = nb;
+       amd_northbridges.num = i;
  
-       if (k8_northbridges.gart_supported) {
-               flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
-                                     GFP_KERNEL);
-               if (!flush_words) {
-                       kfree(k8_northbridges.nb_misc);
-                       return -ENOMEM;
-               }
-       }
+       misc = NULL;
+       for (i = 0; i != amd_nb_num(); i++) {
+               node_to_amd_nb(i)->misc = misc =
+                       next_northbridge(misc, amd_nb_misc_ids);
+        }
+
+       /* some CPU families (e.g. family 0x11) do not support GART */
+       if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+           boot_cpu_data.x86 == 0x15)
+               amd_northbridges.flags |= AMD_NB_GART;
+
+       /*
+        * Some CPU families support L3 Cache Index Disable. There are some
+        * limitations because of E382 and E388 on family 0x10.
+        */
+       if (boot_cpu_data.x86 == 0x10 &&
+           boot_cpu_data.x86_model >= 0x8 &&
+           (boot_cpu_data.x86_model > 0x9 ||
+            boot_cpu_data.x86_mask >= 0x1))
+               amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
  
-       dev = NULL;
-       i = 0;
-       while ((dev = next_k8_northbridge(dev)) != NULL) {
-               k8_northbridges.nb_misc[i] = dev;
-               if (k8_northbridges.gart_supported)
-                       pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
-       }
-       k8_northbridges.nb_misc[i] = NULL;
         return 0;
  }
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
  
  /* Ignores subdevice/subvendor but as far as I can figure out
     they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
  {
         struct pci_device_id *id;
         u32 vendor = device & 0xffff;
         device >>= 16;
-       for (id = k8_nb_ids; id->vendor; id++)
+       for (id = amd_nb_misc_ids; id->vendor; id++)
                 if (vendor == id->vendor && device == id->device)
                         return 1;
         return 0;
  }
  
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+       int i;
+
+       if (!amd_nb_has_feature(AMD_NB_GART))
+               return 0;
+
+       flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+       if (!flush_words) {
+               amd_northbridges.flags &= ~AMD_NB_GART;
+               return -ENOMEM;
+       }
+
+       for (i = 0; i != amd_nb_num(); i++)
+               pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+                                     &flush_words[i]);
+
+       return 0;
+}
+
+void amd_flush_garts(void)
  {
         int flushed, i;
         unsigned long flags;
         static DEFINE_SPINLOCK(gart_lock);
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return;
  
         /* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +130,16 @@ void k8_flush_garts(void)
            that it doesn't matter to serialize more. -AK */
         spin_lock_irqsave(&gart_lock, flags);
         flushed = 0;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
-                                      flush_words[i]|1);
+       for (i = 0; i < amd_nb_num(); i++) {
+               pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+                                      flush_words[i] | 1);
                 flushed++;
         }
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                 u32 w;
                 /* Make sure the hardware actually executed the flush*/
                 for (;;) {
-                       pci_read_config_dword(k8_northbridges.nb_misc[i],
+                       pci_read_config_dword(node_to_amd_nb(i)->misc,
                                               0x9c, &w);
                         if (!(w & 1))
                                 break;
@@ -129,19 +150,23 @@ void k8_flush_garts(void)
         if (!flushed)
                 printk("nothing to flush?\n");
  }
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
  
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
  {
         int err = 0;
  
-       err = cache_k8_northbridges();
+       err = amd_cache_northbridges();
  
         if (err < 0)
-               printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+               printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+       if (amd_cache_gart() < 0)
+               printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+                      "GART support disabled.\n");
  
         return err;
  }
  
  /* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c

index 92543c73cf8ed8d085dc581fe8171b3bbb6f939e..7c9ab59653e8bc5e229ba9e96734d20d4db50db5 100644 (file)
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -315,6 +315,7 @@ static void apbt_setup_irq(struct apbt_dev *adev)
  
         if (system_state == SYSTEM_BOOTING) {
                 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+               irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
                 /* APB timer irqs are set up as mp_irqs, timer is edge type */
                 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
                 if (request_irq(adev->irq, apbt_interrupt_handler,
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c

index b3a16e8f0703d47f50a354223bfe8c6e9382126e..dcd7c83e1659212ea5bab9d1ba0b1f0d8e4942c5 100644 (file)
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
   * Do an PCI bus scan by hand because we're running before the PCI
   * subsystem.
   *
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
   * generically. It's probably overkill to always scan all slots because
   * the AGP bridges should be always an own bus on the HT hierarchy,
   * but do it here for future safety.
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void)
                 dev_limit = bus_dev_ranges[i].dev_limit;
  
                 for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void)
                 dev_limit = bus_dev_ranges[i].dev_limit;
  
                 for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
                         ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void)
                 dev_limit = bus_dev_ranges[i].dev_limit;
  
                 for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
                         iommu_detected = 1;
@@ -518,7 +518,7 @@ out:
                 dev_base = bus_dev_ranges[i].dev_base;
                 dev_limit = bus_dev_ranges[i].dev_limit;
                 for (slot = dev_base; slot < dev_limit; slot++) {
-                       if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+                       if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
                                 continue;
  
                         write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile

index 910f20b457c464d34f1e9874269d652ea0da325e..3966b564ea478746bc77d66886249a294177b21a 100644 (file)
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,7 @@
  #
  
  obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC)   += nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR)      += hw_nmi.o
+obj-y                          += hw_nmi.o
  
  obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
  obj-$(CONFIG_SMP)              += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c

index 78218135b48e6169d155fb4a097e5b6c8e30e53a..879999a5230fc613a0815cd056b4820ebb5cf95a 100644 (file)
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -31,7 +31,6 @@
  #include <linux/init.h>
  #include <linux/cpu.h>
  #include <linux/dmi.h>
-#include <linux/nmi.h>
  #include <linux/smp.h>
  #include <linux/mm.h>
  
@@ -432,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
         reserved = reserve_eilvt_offset(offset, new);
  
         if (reserved != new) {
-               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
-                      "vector 0x%x was already reserved by another core, "
-                      "APIC%lX=0x%x\n",
-                      smp_processor_id(), new, reserved, reg, old);
+               pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+                      "vector 0x%x, but the register is already in use for "
+                      "vector 0x%x on another cpu\n",
+                      smp_processor_id(), reg, offset, new, reserved);
                 return -EINVAL;
         }
  
         if (!eilvt_entry_is_changeable(old, new)) {
-               pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
-                      "register already in use, APIC%lX=0x%x\n",
-                      smp_processor_id(), new, reg, old);
+               pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+                      "vector 0x%x, but the register is already in use for "
+                      "vector 0x%x on this cpu\n",
+                      smp_processor_id(), reg, offset, new, old);
                 return -EBUSY;
         }
  
@@ -799,11 +799,7 @@ void __init setup_boot_APIC_clock(void)
          * PIT/HPET going.  Otherwise register lapic as a dummy
          * device.
          */
-       if (nmi_watchdog != NMI_IO_APIC)
-               lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-       else
-               pr_warning("APIC timer registered as dummy,"
-                       " due to nmi_watchdog=%d!\n", nmi_watchdog);
+       lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
  
         /* Setup the lapic or request the broadcast */
         setup_APIC_timer();
@@ -1387,7 +1383,6 @@ void __cpuinit end_local_APIC_setup(void)
         }
  #endif
  
-       setup_apic_nmi_watchdog(NULL);
         apic_pm_activate();
  
         /*
@@ -1538,13 +1533,60 @@ static int __init detect_init_APIC(void)
         return 0;
  }
  #else
+
+static int apic_verify(void)
+{
+       u32 features, h, l;
+
+       /*
+        * The APIC feature bit should now be enabled
+        * in `cpuid'
+        */
+       features = cpuid_edx(1);
+       if (!(features & (1 << X86_FEATURE_APIC))) {
+               pr_warning("Could not enable APIC!\n");
+               return -1;
+       }
+       set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /* The BIOS may have set up the APIC at some other address */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       if (l & MSR_IA32_APICBASE_ENABLE)
+               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+       pr_info("Found and enabled local APIC!\n");
+       return 0;
+}
+
+int apic_force_enable(void)
+{
+       u32 h, l;
+
+       if (disable_apic)
+               return -1;
+
+       /*
+        * Some BIOSes disable the local APIC in the APIC_BASE
+        * MSR. This can only be done in software for Intel P6 or later
+        * and AMD K7 (Model > 1) or later.
+        */
+       rdmsr(MSR_IA32_APICBASE, l, h);
+       if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+               pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+               l &= ~MSR_IA32_APICBASE_BASE;
+               l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+               wrmsr(MSR_IA32_APICBASE, l, h);
+               enabled_via_apicbase = 1;
+       }
+       return apic_verify();
+}
+
  /*
   * Detect and initialize APIC
   */
  static int __init detect_init_APIC(void)
  {
-       u32 h, l, features;
-
         /* Disabled by kernel option? */
         if (disable_apic)
                 return -1;
@@ -1574,38 +1616,12 @@ static int __init detect_init_APIC(void)
                                 "you can enable it with \"lapic\"\n");
                         return -1;
                 }
-               /*
-                * Some BIOSes disable the local APIC in the APIC_BASE
-                * MSR. This can only be done in software for Intel P6 or later
-                * and AMD K7 (Model > 1) or later.
-                */
-               rdmsr(MSR_IA32_APICBASE, l, h);
-               if (!(l & MSR_IA32_APICBASE_ENABLE)) {
-                       pr_info("Local APIC disabled by BIOS -- reenabling.\n");
-                       l &= ~MSR_IA32_APICBASE_BASE;
-                       l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
-                       wrmsr(MSR_IA32_APICBASE, l, h);
-                       enabled_via_apicbase = 1;
-               }
-       }
-       /*
-        * The APIC feature bit should now be enabled
-        * in `cpuid'
-        */
-       features = cpuid_edx(1);
-       if (!(features & (1 << X86_FEATURE_APIC))) {
-               pr_warning("Could not enable APIC!\n");
-               return -1;
+               if (apic_force_enable())
+                       return -1;
+       } else {
+               if (apic_verify())
+                       return -1;
         }
-       set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-       /* The BIOS may have set up the APIC at some other address */
-       rdmsr(MSR_IA32_APICBASE, l, h);
-       if (l & MSR_IA32_APICBASE_ENABLE)
-               mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
-       pr_info("Found and enabled local APIC!\n");
  
         apic_pm_activate();
  
@@ -1693,7 +1709,7 @@ void __init init_apic_mappings(void)
   * This initializes the IO-APIC and APIC hardware if this is
   * a UP kernel.
   */
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
  
  int __init APIC_init_uniprocessor(void)
  {
@@ -1758,17 +1774,10 @@ int __init APIC_init_uniprocessor(void)
                 setup_IO_APIC();
         else {
                 nr_ioapics = 0;
-               localise_nmi_watchdog();
         }
-#else
-       localise_nmi_watchdog();
  #endif
  
         x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
-       check_nmi_watchdog();
-#endif
-
         return 0;
  }
  
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c

index 62f6e1e55b90d7f9a2bc460e73ba8895da23c4ff..72ec29e1ae0605990dc7f3b9fc1511bf5cab3cbe 100644 (file)
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,20 +17,31 @@
  #include <linux/nmi.h>
  #include <linux/module.h>
  
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
  u64 hw_nmi_get_sample_period(void)
  {
         return (u64)(cpu_khz) * 1000 * 60;
  }
+#endif
  
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
+#ifdef arch_trigger_all_cpu_backtrace
  /* For reliability, we're prepared to waste bits here. */
  static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
  
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
  void arch_trigger_all_cpu_backtrace(void)
  {
         int i;
  
+       if (test_and_set_bit(0, &backtrace_flag))
+               /*
+                * If there is already a trigger_all_cpu_backtrace() in progress
+                * (backtrace_flag == 1), don't output double cpu dump infos.
+                */
+               return;
+
         cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
  
         printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -42,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void)
                         break;
                 mdelay(1);
         }
+
+       clear_bit(0, &backtrace_flag);
+       smp_mb__after_clear_bit();
  }
  
  static int __kprobes
@@ -50,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
  {
         struct die_args *args = __args;
         struct pt_regs *regs;
-       int cpu = smp_processor_id();
+       int cpu;
  
         switch (cmd) {
         case DIE_NMI:
@@ -62,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
         }
  
         regs = args->regs;
+       cpu = smp_processor_id();
  
         if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
                 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -91,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
  }
  early_initcall(register_trigger_all_cpu_backtrace);
  #endif
-
-/* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c

index fadcd743a74f8bdcd5effbaf7e28b01ea3003532..f6cd5b41077034405045fec84fcde39a4b0b3212 100644 (file)
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -54,7 +54,6 @@
  #include <asm/dma.h>
  #include <asm/timer.h>
  #include <asm/i8259.h>
-#include <asm/nmi.h>
  #include <asm/msidef.h>
  #include <asm/hypertransport.h>
  #include <asm/setup.h>
@@ -1934,8 +1933,7 @@ void disable_IO_APIC(void)
   *
   * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
   */
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
  {
         union IO_APIC_reg_00 reg_00;
         physid_mask_t phys_id_present_map;
@@ -1944,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void)
         unsigned char old_id;
         unsigned long flags;
  
-       if (acpi_ioapic)
-               return;
-       /*
-        * Don't check I/O APIC IDs for xAPIC systems.  They have
-        * no meaning without the serial APIC bus.
-        */
-       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-               return;
         /*
          * This is broken; anything with a real cpu count has to
          * circumvent this idiocy regardless.
@@ -2006,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void)
                         physids_or(phys_id_present_map, phys_id_present_map, tmp);
                 }
  
-
                 /*
                  * We need to adjust the IRQ routing table
                  * if the ID changed.
@@ -2042,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void)
                         apic_printk(APIC_VERBOSE, " ok.\n");
         }
  }
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+       if (acpi_ioapic)
+               return;
+       /*
+        * Don't check I/O APIC IDs for xAPIC systems.  They have
+        * no meaning without the serial APIC bus.
+        */
+       if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+               || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+               return;
+       setup_ioapic_ids_from_mpc_nocheck();
+}
  #endif
  
  int no_timer_check __initdata;
@@ -2642,24 +2645,6 @@ static void lapic_register_intr(int irq)
                                       "edge");
  }
  
-static void __init setup_nmi(void)
-{
-       /*
-        * Dirty trick to enable the NMI watchdog ...
-        * We put the 8259A master into AEOI mode and
-        * unmask on all local APICs LVT0 as NMI.
-        *
-        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-        * is from Maciej W. Rozycki - so we do not have to EOI from
-        * the NMI handler or the timer interrupt.
-        */
-       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
-       enable_NMI_through_LVT0();
-
-       apic_printk(APIC_VERBOSE, " done.\n");
-}
-
  /*
   * This looks a bit hackish but it's about the only one way of sending
   * a few INTA cycles to 8259As and any associated glue logic.  ICR does
@@ -2765,15 +2750,6 @@ static inline void __init check_timer(void)
          */
         apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
         legacy_pic->init(1);
-#ifdef CONFIG_X86_32
-       {
-               unsigned int ver;
-
-               ver = apic_read(APIC_LVR);
-               ver = GET_APIC_VERSION(ver);
-               timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-       }
-#endif
  
         pin1  = find_isa_irq_pin(0, mp_INT);
         apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2821,10 +2797,6 @@ static inline void __init check_timer(void)
                                 unmask_ioapic(cfg);
                 }
                 if (timer_irq_works()) {
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               setup_nmi();
-                               legacy_pic->unmask(0);
-                       }
                         if (disable_timer_pin_1 > 0)
                                 clear_IO_APIC_pin(0, pin1);
                         goto out;
@@ -2850,11 +2822,6 @@ static inline void __init check_timer(void)
                 if (timer_irq_works()) {
                         apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
                         timer_through_8259 = 1;
-                       if (nmi_watchdog == NMI_IO_APIC) {
-                               legacy_pic->mask(0);
-                               setup_nmi();
-                               legacy_pic->unmask(0);
-                       }
                         goto out;
                 }
                 /*
@@ -2866,15 +2833,6 @@ static inline void __init check_timer(void)
                 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
         }
  
-       if (nmi_watchdog == NMI_IO_APIC) {
-               apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
-                           "through the IO-APIC - disabling NMI Watchdog!\n");
-               nmi_watchdog = NMI_NONE;
-       }
-#ifdef CONFIG_X86_32
-       timer_ack = 0;
-#endif
-
         apic_printk(APIC_QUIET, KERN_INFO
                     "...trying to set up timer as Virtual Wire IRQ...\n");
  
@@ -3639,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic)
         return reg_01.bits.entries + 1;
  }
  
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
  {
         int nr;
  
@@ -3956,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
         return res;
  }
  
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
  {
         unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
         struct resource *ioapic_res;
@@ -3994,6 +3952,8 @@ fake_ioapic_page:
                 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
                 ioapic_res++;
         }
+
+       probe_nr_irqs_gsi();
  }
  
  void __init ioapic_insert_resources(void)
@@ -4103,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void)
  
         printk(KERN_INFO "Early APIC setup for system timer0\n");
  #ifndef CONFIG_SMP
-       phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+       physid_set_mask_of_physid(boot_cpu_physical_apicid,
+                                        &phys_cpu_present_map);
  #endif
         /* Make sure the irq descriptor is set up */
         cfg = alloc_irq_and_cfg_at(0, 0);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c

deleted file mode 100644 (file)

index c90041c..0000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- *  NMI watchdog support on APIC systems
- *
- *  Started by Ingo Molnar <mingo@redhat.com>
- *
- *  Fixes:
- *  Mikael Pettersson  : AMD K7 support for local APIC NMI watchdog.
- *  Mikael Pettersson  : Power Management for local APIC NMI watchdog.
- *  Mikael Pettersson  : Pentium 4 support for local APIC NMI watchdog.
- *  Pavel Machek and
- *  Mikael Pettersson  : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- *     be enabled
- *  0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0);          /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
-       return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
-       return atomic_read(&mce_entry) > 0;
-#endif
-       return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
-       return per_cpu(irq_stat, cpu).apic_timer_irqs +
-               per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
-       local_irq_enable_in_hardirq();
-       /*
-        * Intentionally don't use cpu_relax here. This is
-        * to make sure that the performance counter really ticks,
-        * even if there is a simulator or similar that catches the
-        * pause instruction. On a real HT machine this is fine because
-        * all other CPUs are busy with "useless" delay loops and don't
-        * care if they get somewhat less cycles.
-        */
-       while (endflag == 0)
-               mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
-       printk(KERN_CONT "\n");
-
-       printk(KERN_WARNING
-               "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
-                       cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
-       printk(KERN_WARNING
-               "Please report this to bugzilla.kernel.org,\n");
-       printk(KERN_WARNING
-               "and attach the output of the 'dmesg' command.\n");
-
-       per_cpu(wd_enabled, cpu) = 0;
-       atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
-       unsigned int *prev_nmi_count;
-       int cpu;
-
-       if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
-               return 0;
-
-       prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
-       if (!prev_nmi_count)
-               goto error;
-
-       printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
-       for_each_possible_cpu(cpu)
-               prev_nmi_count[cpu] = get_nmi_count(cpu);
-       local_irq_enable();
-       mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
-       for_each_online_cpu(cpu) {
-               if (!per_cpu(wd_enabled, cpu))
-                       continue;
-               if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
-                       report_broken_nmi(cpu, prev_nmi_count);
-       }
-       endflag = 1;
-       if (!atomic_read(&nmi_active)) {
-               kfree(prev_nmi_count);
-               atomic_set(&nmi_active, -1);
-               goto error;
-       }
-       printk("OK.\n");
-
-       /*
-        * now that we know it works we can reduce NMI frequency to
-        * something more reasonable; makes a difference in some configs
-        */
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               nmi_hz = lapic_adjust_nmi_hz(1);
-
-       kfree(prev_nmi_count);
-       return 0;
-error:
-       if (nmi_watchdog == NMI_IO_APIC) {
-               if (!timer_through_8259)
-                       legacy_pic->mask(0);
-               on_each_cpu(__acpi_nmi_disable, NULL, 1);
-       }
-
-#ifdef CONFIG_X86_32
-       timer_ack = 0;
-#endif
-       return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
-       unsigned int nmi;
-
-       if (!strncmp(str, "panic", 5)) {
-               panic_on_timeout = 1;
-               str = strchr(str, ',');
-               if (!str)
-                       return 1;
-               ++str;
-       }
-
-       if (!strncmp(str, "lapic", 5))
-               nmi_watchdog = NMI_LOCAL_APIC;
-       else if (!strncmp(str, "ioapic", 6))
-               nmi_watchdog = NMI_IO_APIC;
-       else {
-               get_option(&str, &nmi);
-               if (nmi >= NMI_INVALID)
-                       return 0;
-               nmi_watchdog = nmi;
-       }
-
-       return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       nmi_pm_active = atomic_read(&nmi_active);
-       stop_apic_nmi_watchdog(NULL);
-       BUG_ON(atomic_read(&nmi_active) != 0);
-       return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
-       /* only CPU0 goes here, other CPUs should be offline */
-       if (nmi_pm_active > 0) {
-               setup_apic_nmi_watchdog(NULL);
-               touch_nmi_watchdog();
-       }
-       return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
-       .name           = "lapic_nmi",
-       .resume         = lapic_nmi_resume,
-       .suspend        = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
-       .id     = 0,
-       .cls    = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
-       int error;
-
-       /*
-        * should really be a BUG_ON but b/c this is an
-        * init call, it just doesn't work.  -dcz
-        */
-       if (nmi_watchdog != NMI_LOCAL_APIC)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0)
-               return 0;
-
-       error = sysdev_class_register(&nmi_sysclass);
-       if (!error)
-               error = sysdev_register(&device_lapic_nmi);
-       return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
-       apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
-       if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
-               on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
-       __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
-       if (__get_cpu_var(wd_enabled))
-               return;
-
-       /* cheap hack to support suspend/resume */
-       /* if cpu0 is not active neither should the other cpus */
-       if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
-               return;
-
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               if (lapic_watchdog_init(nmi_hz) < 0) {
-                       __get_cpu_var(wd_enabled) = 0;
-                       return;
-               }
-               /* FALL THROUGH */
-       case NMI_IO_APIC:
-               __get_cpu_var(wd_enabled) = 1;
-               atomic_inc(&nmi_active);
-       }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
-       /* only support LOCAL and IO APICs for now */
-       if (!nmi_watchdog_active())
-               return;
-       if (__get_cpu_var(wd_enabled) == 0)
-               return;
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               lapic_watchdog_stop();
-       else
-               __acpi_nmi_disable(NULL);
-       __get_cpu_var(wd_enabled) = 0;
-       atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
-       if (nmi_watchdog_active()) {
-               unsigned cpu;
-
-               /*
-                * Tell other CPUs to reset their alert counters. We cannot
-                * do it ourselves because the alert count increase is not
-                * atomic.
-                */
-               for_each_present_cpu(cpu) {
-                       if (per_cpu(nmi_touch, cpu) != 1)
-                               per_cpu(nmi_touch, cpu) = 1;
-               }
-       }
-
-       /*
-        * Tickle the softlockup detector too:
-        */
-       touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
-       /*
-        * Since current_thread_info()-> is always on the stack, and we
-        * always switch the stack NMI-atomically, it's safe to use
-        * smp_processor_id().
-        */
-       unsigned int sum;
-       int touched = 0;
-       int cpu = smp_processor_id();
-       int rc = 0;
-
-       sum = get_timer_irqs(cpu);
-
-       if (__get_cpu_var(nmi_touch)) {
-               __get_cpu_var(nmi_touch) = 0;
-               touched = 1;
-       }
-
-       /* We can be called before check_nmi_watchdog, hence NULL check. */
-       if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-               static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
-               raw_spin_lock(&lock);
-               printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
-               show_regs(regs);
-               dump_stack();
-               raw_spin_unlock(&lock);
-               cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
-               rc = 1;
-       }
-
-       /* Could check oops_in_progress here too, but it's safer not to */
-       if (mce_in_progress())
-               touched = 1;
-
-       /* if the none of the timers isn't firing, this cpu isn't doing much */
-       if (!touched && __get_cpu_var(last_irq_sum) == sum) {
-               /*
-                * Ayiee, looks like this CPU is stuck ...
-                * wait a few IRQs (5 seconds) before doing the oops ...
-                */
-               __this_cpu_inc(alert_counter);
-               if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
-                       /*
-                        * die_nmi will return ONLY if NOTIFY_STOP happens..
-                        */
-                       die_nmi("BUG: NMI Watchdog detected LOCKUP",
-                               regs, panic_on_timeout);
-       } else {
-               __get_cpu_var(last_irq_sum) = sum;
-               __this_cpu_write(alert_counter, 0);
-       }
-
-       /* see if the nmi watchdog went off */
-       if (!__get_cpu_var(wd_enabled))
-               return rc;
-       switch (nmi_watchdog) {
-       case NMI_LOCAL_APIC:
-               rc |= lapic_wd_event(nmi_hz);
-               break;
-       case NMI_IO_APIC:
-               /*
-                * don't know how to accurately check for this.
-                * just assume it was a watchdog timer interrupt
-                * This matches the old behaviour.
-                */
-               rc = 1;
-               break;
-       }
-       return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
-       __get_cpu_var(wd_enabled) = 1;
-       atomic_inc(&nmi_active);
-       __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
-       on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
-       touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
-       on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
-       unknown_nmi_panic = 1;
-       return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
-       unsigned char reason = get_nmi_reason();
-       char buf[64];
-
-       sprintf(buf, "NMI received for unknown reason %02x\n", reason);
-       die_nmi(buf, regs, 1); /* Always panic here */
-       return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
-                       void __user *buffer, size_t *length, loff_t *ppos)
-{
-       int old_state;
-
-       nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
-       old_state = nmi_watchdog_enabled;
-       proc_dointvec(table, write, buffer, length, ppos);
-       if (!!old_state == !!nmi_watchdog_enabled)
-               return 0;
-
-       if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
-               printk(KERN_WARNING
-                       "NMI watchdog is permanently disabled\n");
-               return -EIO;
-       }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_lapic_nmi_watchdog();
-               else
-                       disable_lapic_nmi_watchdog();
-       } else if (nmi_watchdog == NMI_IO_APIC) {
-               if (nmi_watchdog_enabled)
-                       enable_ioapic_nmi_watchdog();
-               else
-                       disable_ioapic_nmi_watchdog();
-       } else {
-               printk(KERN_WARNING
-                       "NMI watchdog doesn't know what hardware to touch\n");
-               return -EIO;
-       }
-       return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
-       if (unknown_nmi_panic)
-               return unknown_nmi_panic_callback(regs, cpu);
-#endif
-       return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
-       int i;
-
-       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
-       printk(KERN_INFO "sending NMI to all CPUs:\n");
-       apic->send_IPI_all(NMI_VECTOR);
-
-       /* Wait for up to 10 seconds for all CPUs to do the backtrace */
-       for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(to_cpumask(backtrace_mask)))
-                       break;
-               mdelay(1);
-       }
-}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c

index c1c52c341f40a607be5b252746db31f36b8dc561..2a3f2a7db243f8b846ef5d3032287037d0111a69 100644 (file)
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -48,6 +48,16 @@ unsigned int uv_apicid_hibits;
  EXPORT_SYMBOL_GPL(uv_apicid_hibits);
  static DEFINE_SPINLOCK(uv_nmi_lock);
  
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+       unsigned long val, *mmr;
+
+       mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+       val = *mmr;
+       early_iounmap(mmr, sizeof(*mmr));
+       return val;
+}
+
  static inline bool is_GRU_range(u64 start, u64 end)
  {
         return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -58,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
         return is_ISA_range(start, end) || is_GRU_range(start, end);
  }
  
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
  {
         union uvh_node_id_u node_id;
-       unsigned long *mmr;
-
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
-       node_id.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       union uvh_rh_gam_config_mmr_u  m_n_config;
+       int pnode;
  
         /* Currently, all blades have same revision number */
+       node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+       m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
         uv_min_hub_revision_id = node_id.s.revision;
  
-       return node_id.s.node_id;
+       pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+       return pnode;
  }
  
  static void __init early_get_apic_pnode_shift(void)
  {
-       unsigned long *mmr;
-
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
-       uvh_apicid.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
         if (!uvh_apicid.v)
                 /*
                  * Old bios, use default value
@@ -95,21 +101,17 @@ static void __init early_get_apic_pnode_shift(void)
  static void __init uv_set_apicid_hibit(void)
  {
         union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
-       unsigned long *mmr;
  
-       mmr = early_ioremap(UV_LOCAL_MMR_BASE |
-               UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
-       apicid_mask.v = *mmr;
-       early_iounmap(mmr, sizeof(*mmr));
+       apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
         uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
  }
  
  static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
  {
-       int nodeid;
+       int pnodeid;
  
         if (!strcmp(oem_id, "SGI")) {
-               nodeid = early_get_nodeid();
+               pnodeid = early_get_pnodeid();
                 early_get_apic_pnode_shift();
                 x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
                 x86_platform.nmi_init = uv_nmi_init;
@@ -119,7 +121,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
                         uv_system_type = UV_X2APIC;
                 else if (!strcmp(oem_table_id, "UVH")) {
                         __get_cpu_var(x2apic_extra_bits) =
-                               nodeid << (uvh_apicid.s.pnode_shift - 1);
+                               pnodeid << uvh_apicid.s.pnode_shift;
                         uv_system_type = UV_NON_UNIQUE_APIC;
                         uv_set_apicid_hibit();
                         return 1;
@@ -682,27 +684,32 @@ void uv_nmi_init(void)
  void __init uv_system_init(void)
  {
         union uvh_rh_gam_config_mmr_u  m_n_config;
+       union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
         union uvh_node_id_u node_id;
         unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
-       int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+       int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
         int gnode_extra, max_pnode = 0;
         unsigned long mmr_base, present, paddr;
-       unsigned short pnode_mask;
+       unsigned short pnode_mask, pnode_io_mask;
  
         map_low_mmrs();
  
         m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
         m_val = m_n_config.s.m_skt;
         n_val = m_n_config.s.n_skt;
+       mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+       n_io = mmioh.s.n_io;
         mmr_base =
             uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
             ~UV_MMR_ENABLE;
         pnode_mask = (1 << n_val) - 1;
+       pnode_io_mask = (1 << n_io) - 1;
+
         node_id.v = uv_read_local_mmr(UVH_NODE_ID);
         gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
         gnode_upper = ((unsigned long)gnode_extra  << m_val);
-       printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
-                       n_val, m_val, gnode_upper, gnode_extra);
+       printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+                       n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
  
         printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
  
@@ -735,7 +742,7 @@ void __init uv_system_init(void)
                 for (j = 0; j < 64; j++) {
                         if (!test_bit(j, &present))
                                 continue;
-                       pnode = (i * 64 + j);
+                       pnode = (i * 64 + j) & pnode_mask;
                         uv_blade_info[blade].pnode = pnode;
                         uv_blade_info[blade].nr_possible_cpus = 0;
                         uv_blade_info[blade].nr_online_cpus = 0;
@@ -756,6 +763,7 @@ void __init uv_system_init(void)
                 /*
                  * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
                  */
+               uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
                 pnode = uv_apicid_to_pnode(apicid);
                 blade = boot_pnode_to_blade(pnode);
@@ -772,7 +780,6 @@ void __init uv_system_init(void)
                 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
                 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
                 uv_cpu_hub_info(cpu)->pnode = pnode;
-               uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
                 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
                 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -796,7 +803,7 @@ void __init uv_system_init(void)
  
         map_gru_high(max_pnode);
         map_mmr_high(max_pnode);
-       map_mmioh_high(max_pnode);
+       map_mmioh_high(max_pnode & pnode_io_mask);
  
         uv_cpu_init();
         uv_scir_register_cpu_notifier();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c

index 4b68bda30938d0a55ed39eeaeff68157266a9ea0..1d59834396bdc145c630e671d1bccd7769689a88 100644 (file)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
  #else
         vgetcpu_set_mode();
  #endif
-       init_hw_perf_events();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c

index 17ad0336621135a2310f3e9a2e5d9bf3d2f668ba..9ecf81f9b90fb0c73416d958b1aa216b17e1ecfa 100644 (file)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx {
  };
  
  struct amd_l3_cache {
-       struct   pci_dev *dev;
-       bool     can_disable;
+       struct   amd_northbridge *nb;
         unsigned indices;
         u8       subcaches[4];
  };
@@ -311,14 +310,12 @@ struct _cache_attr {
  /*
   * L3 cache descriptors
   */
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
  static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
  {
         unsigned int sc0, sc1, sc2, sc3;
         u32 val = 0;
  
-       pci_read_config_dword(l3->dev, 0x1C4, &val);
+       pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
  
         /* calculate subcache sizes */
         l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +327,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
         l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
  }
  
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
-       struct amd_l3_cache *l3;
-       struct pci_dev *dev = node_to_k8_nb_misc(node);
-
-       l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-       if (!l3) {
-               printk(KERN_WARNING "Error allocating L3 struct\n");
-               return NULL;
-       }
-
-       l3->dev = dev;
-
-       amd_calc_l3_indices(l3);
-
-       return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
-                                          int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+                                       int index)
  {
+       static struct amd_l3_cache *__cpuinitdata l3_caches;
         int node;
  
-       if (boot_cpu_data.x86 != 0x10)
-               return;
-
-       if (index < 3)
-               return;
-
-       /* see errata #382 and #388 */
-       if (boot_cpu_data.x86_model < 0x8)
-               return;
-
-       if ((boot_cpu_data.x86_model == 0x8 ||
-            boot_cpu_data.x86_model == 0x9)
-               &&
-            boot_cpu_data.x86_mask < 0x1)
-                       return;
-
-       /* not in virtualized environments */
-       if (k8_northbridges.num == 0)
+       /* only for L3, and not in virtualized environments */
+       if (index < 3 || amd_nb_num() == 0)
                 return;
  
         /*
@@ -378,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
          * never freed but this is done only on shutdown so it doesn't matter.
          */
         if (!l3_caches) {
-               int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+               int size = amd_nb_num() * sizeof(struct amd_l3_cache);
  
                 l3_caches = kzalloc(size, GFP_ATOMIC);
                 if (!l3_caches)
@@ -387,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
  
         node = amd_get_nb_id(smp_processor_id());
  
-       if (!l3_caches[node]) {
-               l3_caches[node] = amd_init_l3_cache(node);
-               l3_caches[node]->can_disable = true;
+       if (!l3_caches[node].nb) {
+               l3_caches[node].nb = node_to_amd_nb(node);
+               amd_calc_l3_indices(&l3_caches[node]);
         }
  
-       WARN_ON(!l3_caches[node]);
-
-       this_leaf->l3 = l3_caches[node];
+       this_leaf->l3 = &l3_caches[node];
  }
  
  /*
@@ -408,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
  {
         unsigned int reg = 0;
  
-       pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+       pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
  
         /* check whether this slot is activated already */
         if (reg & (3UL << 30))
@@ -422,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
  {
         int index;
  
-       if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+       if (!this_leaf->l3 ||
+           !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
                 return -EINVAL;
  
         index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
                 if (!l3->subcaches[i])
                         continue;
  
-               pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+               pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
  
                 /*
                  * We need to WBINVD on a core on the node containing the L3
@@ -467,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
                 wbinvd_on_cpu(cpu);
  
                 reg |= BIT(31);
-               pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+               pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
         }
  }
  
@@ -524,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
  
-       if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+       if (!this_leaf->l3 ||
+           !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
                 return -EINVAL;
  
         cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +509,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
  #define STORE_CACHE_DISABLE(slot)                                      \
  static ssize_t                                                         \
  store_cache_disable_##slot(struct _cpuid4_info *this_leaf,             \
-                           const char *buf, size_t count)              \
+                          const char *buf, size_t count)               \
  {                                                                      \
         return store_cache_disable(this_leaf, buf, count, slot);        \
  }
@@ -558,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
                 show_cache_disable_1, store_cache_disable_1);
  
  #else  /* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
  #endif /* CONFIG_AMD_NB */
  
  static int
@@ -575,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
  
         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                 amd_cpuid4(index, &eax, &ebx, &ecx);
-               amd_check_l3_disable(this_leaf, index);
+               amd_init_l3_cache(this_leaf, index);
         } else {
                 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
         }
@@ -983,30 +944,48 @@ define_one_ro(size);
  define_one_ro(shared_cpu_map);
  define_one_ro(shared_cpu_list);
  
-#define DEFAULT_SYSFS_CACHE_ATTRS      \
-       &type.attr,                     \
-       &level.attr,                    \
-       &coherency_line_size.attr,      \
-       &physical_line_partition.attr,  \
-       &ways_of_associativity.attr,    \
-       &number_of_sets.attr,           \
-       &size.attr,                     \
-       &shared_cpu_map.attr,           \
-       &shared_cpu_list.attr
-
  static struct attribute *default_attrs[] = {
-       DEFAULT_SYSFS_CACHE_ATTRS,
+       &type.attr,
+       &level.attr,
+       &coherency_line_size.attr,
+       &physical_line_partition.attr,
+       &ways_of_associativity.attr,
+       &number_of_sets.attr,
+       &size.attr,
+       &shared_cpu_map.attr,
+       &shared_cpu_list.attr,
         NULL
  };
  
-static struct attribute *default_l3_attrs[] = {
-       DEFAULT_SYSFS_CACHE_ATTRS,
  #ifdef CONFIG_AMD_NB
-       &cache_disable_0.attr,
-       &cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+       static struct attribute **attrs;
+       int n;
+
+       if (attrs)
+               return attrs;
+
+       n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+               n += 2;
+
+       attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+       if (attrs == NULL)
+               return attrs = default_attrs;
+
+       for (n = 0; default_attrs[n]; n++)
+               attrs[n] = default_attrs[n];
+
+       if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+               attrs[n++] = &cache_disable_0.attr;
+               attrs[n++] = &cache_disable_1.attr;
+       }
+
+       return attrs;
+}
  #endif
-       NULL
-};
  
  static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
  {
@@ -1117,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
  
                 this_leaf = CPUID4_INFO_IDX(cpu, i);
  
-               if (this_leaf->l3 && this_leaf->l3->can_disable)
-                       ktype_cache.default_attrs = default_l3_attrs;
-               else
-                       ktype_cache.default_attrs = default_attrs;
-
+               ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+               if (this_leaf->l3)
+                       ktype_cache.default_attrs = amd_l3_attrs();
+#endif
                 retval = kobject_init_and_add(&(this_object->kobj),
                                               &ktype_cache,
                                               per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c

index 80c482382d5c95e06b71ffdf91b6f5d362bf2b45..5bf2fac52aca7771b6b7827117b9d2b2778fd8ad 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
  #include <asm/mce.h>
  #include <asm/msr.h>
  
-#define PFX               "mce_threshold: "
-#define VERSION           "version 1.1.1"
  #define NR_BANKS          6
  #define NR_BLOCKS         9
  #define THRESHOLD_MAX     0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
         struct list_head        miscj;
  };
  
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
-       .interrupt_enable       = 0,
-       .threshold_limit        = THRESHOLD_MAX,
-};
-
  struct threshold_bank {
         struct kobject          *kobj;
         struct threshold_block  *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
  struct thresh_restart {
         struct threshold_block  *b;
         int                     reset;
+       int                     set_lvt_off;
+       int                     lvt_off;
         u16                     old_limit;
  };
  
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+       int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+       if (apic < 0) {
+               pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+                      b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       if (apic != msr) {
+               pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+                      "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+                      b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+               return 0;
+       }
+
+       return 1;
+};
+
  /* must be called with correct cpu affinity */
  /* Called via smp_call_function_single() */
  static void threshold_restart_bank(void *_tr)
  {
         struct thresh_restart *tr = _tr;
-       u32 mci_misc_hi, mci_misc_lo;
+       u32 hi, lo;
  
-       rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       rdmsr(tr->b->address, lo, hi);
  
-       if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+       if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
                 tr->reset = 1;  /* limit cannot be lower than err count */
  
         if (tr->reset) {                /* reset err count and overflow bit */
-               mci_misc_hi =
-                   (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+               hi =
+                   (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
                     (THRESHOLD_MAX - tr->b->threshold_limit);
         } else if (tr->old_limit) {     /* change limit w/o reset */
-               int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+               int new_count = (hi & THRESHOLD_MAX) +
                     (tr->old_limit - tr->b->threshold_limit);
  
-               mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+               hi = (hi & ~MASK_ERR_COUNT_HI) |
                     (new_count & THRESHOLD_MAX);
         }
  
+       if (tr->set_lvt_off) {
+               if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+                       /* set new lvt offset */
+                       hi &= ~MASK_LVTOFF_HI;
+                       hi |= tr->lvt_off << 20;
+               }
+       }
+
         tr->b->interrupt_enable ?
-           (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
-           (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+           (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+           (hi &= ~MASK_INT_TYPE_HI);
  
-       mci_misc_hi |= MASK_COUNT_EN_HI;
-       wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       hi |= MASK_COUNT_EN_HI;
+       wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+       struct thresh_restart tr = {
+               .b                      = b,
+               .set_lvt_off            = 1,
+               .lvt_off                = offset,
+       };
+
+       b->threshold_limit              = THRESHOLD_MAX;
+       threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+       if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+                                             APIC_EILVT_MSG_FIX, 0))
+               return new;
+
+       return reserved;
  }
  
  /* cpu init entry point, called from mce.c with preempt off */
  void mce_amd_feature_init(struct cpuinfo_x86 *c)
  {
+       struct threshold_block b;
         unsigned int cpu = smp_processor_id();
         u32 low = 0, high = 0, address = 0;
         unsigned int bank, block;
-       struct thresh_restart tr;
-       int lvt_off = -1;
-       u8 offset;
+       int offset = -1;
  
         for (bank = 0; bank < NR_BANKS; ++bank) {
                 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
                         if (shared_bank[bank] && c->cpu_core_id)
                                 break;
  #endif
-                       offset = (high & MASK_LVTOFF_HI) >> 20;
-                       if (lvt_off < 0) {
-                               if (setup_APIC_eilvt(offset,
-                                                    THRESHOLD_APIC_VECTOR,
-                                                    APIC_EILVT_MSG_FIX, 0)) {
-                                       pr_err(FW_BUG "cpu %d, failed to "
-                                              "setup threshold interrupt "
-                                              "for bank %d, block %d "
-                                              "(MSR%08X=0x%x%08x)",
-                                              smp_processor_id(), bank, block,
-                                              address, high, low);
-                                       continue;
-                               }
-                               lvt_off = offset;
-                       } else if (lvt_off != offset) {
-                               pr_err(FW_BUG "cpu %d, invalid threshold "
-                                      "interrupt offset %d for bank %d,"
-                                      "block %d (MSR%08X=0x%x%08x)",
-                                      smp_processor_id(), lvt_off, bank,
-                                      block, address, high, low);
-                               continue;
-                       }
-
-                       high &= ~MASK_LVTOFF_HI;
-                       high |= lvt_off << 20;
-                       wrmsr(address, low, high);
+                       offset = setup_APIC_mce(offset,
+                                               (high & MASK_LVTOFF_HI) >> 20);
  
-                       threshold_defaults.address = address;
-                       tr.b = &threshold_defaults;
-                       tr.reset = 0;
-                       tr.old_limit = 0;
-                       threshold_restart_bank(&tr);
+                       memset(&b, 0, sizeof(b));
+                       b.cpu           = cpu;
+                       b.bank          = bank;
+                       b.block         = block;
+                       b.address       = address;
  
+                       mce_threshold_block_init(&b, offset);
                         mce_threshold_vector = amd_threshold_interrupt;
                 }
         }
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
  
         b->interrupt_enable = !!new;
  
+       memset(&tr, 0, sizeof(tr));
         tr.b            = b;
-       tr.reset        = 0;
-       tr.old_limit    = 0;
  
         smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
  
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
         if (new < 1)
                 new = 1;
  
+       memset(&tr, 0, sizeof(tr));
         tr.old_limit = b->threshold_limit;
         b->threshold_limit = new;
         tr.b = b;
-       tr.reset = 0;
  
         smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
  
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
                         continue;
                 err = threshold_create_bank(cpu, bank);
                 if (err)
-                       goto out;
+                       return err;
         }
-out:
+
         return err;
  }
  
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c

index 6d75b9145b13f0e68a106acd76b0d458c827d099..0a360d146596b6d01f8c833e655150faa3ae9605 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
  {
         int i;
  
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               disable_lapic_nmi_watchdog();
-
         for (i = 0; i < x86_pmu.num_counters; i++) {
                 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
                         goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
         for (i--; i >= 0; i--)
                 release_perfctr_nmi(x86_pmu.perfctr + i);
  
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
-
         return false;
  }
  
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
                 release_perfctr_nmi(x86_pmu.perfctr + i);
                 release_evntsel_nmi(x86_pmu.eventsel + i);
         }
-
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               enable_lapic_nmi_watchdog();
  }
  
  #else
@@ -384,15 +375,53 @@ static void release_pmc_hardware(void) {}
  static bool check_hw_exists(void)
  {
         u64 val, val_new = 0;
-       int ret = 0;
+       int i, reg, ret = 0;
+
+       /*
+        * Check to see if the BIOS enabled any of the counters, if so
+        * complain and bail.
+        */
+       for (i = 0; i < x86_pmu.num_counters; i++) {
+               reg = x86_pmu.eventsel + i;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+                       goto bios_fail;
+       }
  
+       if (x86_pmu.num_counters_fixed) {
+               reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+               ret = rdmsrl_safe(reg, &val);
+               if (ret)
+                       goto msr_fail;
+               for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+                       if (val & (0x03 << i*4))
+                               goto bios_fail;
+               }
+       }
+
+       /*
+        * Now write a value and read it back to see if it matches,
+        * this is needed to detect certain hardware emulators (qemu/kvm)
+        * that don't trap on the MSR access and always return 0s.
+        */
         val = 0xabcdUL;
-       ret |= checking_wrmsrl(x86_pmu.perfctr, val);
+       ret = checking_wrmsrl(x86_pmu.perfctr, val);
         ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
         if (ret || val != val_new)
-               return false;
+               goto msr_fail;
  
         return true;
+
+bios_fail:
+       printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
+       printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
+       return false;
+
+msr_fail:
+       printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+       return false;
  }
  
  static void reserve_ds_buffers(void);
@@ -451,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event)
         struct hw_perf_event *hwc = &event->hw;
         u64 config;
  
-       if (!hwc->sample_period) {
+       if (!is_sampling_event(event)) {
                 hwc->sample_period = x86_pmu.max_period;
                 hwc->last_period = hwc->sample_period;
                 local64_set(&hwc->period_left, hwc->sample_period);
@@ -1362,7 +1391,7 @@ static void __init pmu_check_apic(void)
         pr_info("no hardware sampling interrupt available.\n");
  }
  
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
  {
         struct event_constraint *c;
         int err;
@@ -1377,20 +1406,18 @@ void __init init_hw_perf_events(void)
                 err = amd_pmu_init();
                 break;
         default:
-               return;
+               return 0;
         }
         if (err != 0) {
                 pr_cont("no PMU driver, software events only.\n");
-               return;
+               return 0;
         }
  
         pmu_check_apic();
  
         /* sanity check that the hardware exists or is emulated */
-       if (!check_hw_exists()) {
-               pr_cont("Broken PMU hardware detected, software events only.\n");
-               return;
-       }
+       if (!check_hw_exists())
+               return 0;
  
         pr_cont("%s PMU driver.\n", x86_pmu.name);
  
@@ -1438,9 +1465,12 @@ void __init init_hw_perf_events(void)
         pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
         pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
  
-       perf_pmu_register(&pmu);
+       perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
         perf_cpu_notifier(x86_pmu_notifier);
+
+       return 0;
  }
+early_initcall(init_hw_perf_events);
  
  static inline void x86_pmu_read(struct perf_event *event)
  {
@@ -1686,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
  
         perf_callchain_store(entry, regs->ip);
  
-       dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+       dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
  }
  
  #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c

index e421b8cd6944af860c4b28a1a176a14320ac9f79..67e2202a60393cd48a0f2251862c59f65bf7e667 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
  #ifdef CONFIG_CPU_SUP_AMD
  
-static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-
  static __initconst const u64 amd_hw_cache_event_ids
                                 [PERF_COUNT_HW_CACHE_MAX]
                                 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -275,7 +273,7 @@ done:
         return &emptyconstraint;
  }
  
-static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+static struct amd_nb *amd_alloc_nb(int cpu)
  {
         struct amd_nb *nb;
         int i;
@@ -285,7 +283,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
         if (!nb)
                 return NULL;
  
-       nb->nb_id = nb_id;
+       nb->nb_id = -1;
  
         /*
          * initialize all possible NB constraints
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu)
         if (boot_cpu_data.x86_max_cores < 2)
                 return NOTIFY_OK;
  
-       cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+       cpuc->amd_nb = amd_alloc_nb(cpu);
         if (!cpuc->amd_nb)
                 return NOTIFY_BAD;
  
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu)
         nb_id = amd_get_nb_id(cpu);
         WARN_ON_ONCE(nb_id == BAD_APICID);
  
-       raw_spin_lock(&amd_nb_lock);
-
         for_each_online_cpu(i) {
                 nb = per_cpu(cpu_hw_events, i).amd_nb;
                 if (WARN_ON_ONCE(!nb))
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu)
  
         cpuc->amd_nb->nb_id = nb_id;
         cpuc->amd_nb->refcnt++;
-
-       raw_spin_unlock(&amd_nb_lock);
  }
  
  static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu)
  
         cpuhw = &per_cpu(cpu_hw_events, cpu);
  
-       raw_spin_lock(&amd_nb_lock);
-
         if (cpuhw->amd_nb) {
                 struct amd_nb *nb = cpuhw->amd_nb;
  
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu)
  
                 cpuhw->amd_nb = NULL;
         }
-
-       raw_spin_unlock(&amd_nb_lock);
  }
  
  static __initconst const struct x86_pmu amd_pmu = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c

index c8f5c088cad11ae3f245e1e7374bb43c915170d6..24e390e40f2e0b484d4b2b09084deb9d120d59b4 100644 (file)
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
         if (ret)
                 return ret;
  
+       if (event->attr.precise_ip &&
+           (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+               /*
+                * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+                * (0x003c) so that we can use it with PEBS.
+                *
+                * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+                * PEBS capable. However we can use INST_RETIRED.ANY_P
+                * (0x00c0), which is a PEBS capable event, to get the same
+                * count.
+                *
+                * With CNTMASK set, INST_RETIRED.ANY_P counts the cycles that
+                * retire at least CNTMASK instructions. By setting CNTMASK to
+                * a value (16) larger than the maximum number of instructions
+                * that can be retired per cycle (4) and then inverting the
+                * condition, we count all cycles that retire fewer than 16
+                * instructions, which is every cycle.
+                *
+                * Thereby we gain a PEBS capable cycle counter.
+                */
+               u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+
+               alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+               event->hw.config = alt_config;
+       }
+
         if (event->attr.type != PERF_TYPE_RAW)
                 return 0;
  
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c

index d9f4ff8fcd693c509b2d079b381a8e8683d10d9c..d5a236615501fd6a41fb6f6bc76bfd2369a47bc5 100644 (file)
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
  #include <linux/kernel.h>
  #include <linux/bitops.h>
  #include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
  #include <linux/kprobes.h>
  
  #include <asm/apic.h>
  #include <asm/perf_event.h>
  
-struct nmi_watchdog_ctlblk {
-       unsigned int cccr_msr;
-       unsigned int perfctr_msr;  /* the MSR to reset in NMI handler */
-       unsigned int evntsel_msr;  /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
-       int (*reserve)(void);
-       void (*unreserve)(void);
-       int (*setup)(unsigned nmi_hz);
-       void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
-       void (*stop)(void);
-       unsigned perfctr;
-       unsigned evntsel;
-       u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
  /*
   * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
   * offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
  static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
  static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
  
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
  /* converts an msr to an appropriate reservation bit */
  static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
  {
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
         clear_bit(counter, evntsel_nmi_owner);
  }
  EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
-       BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-       if (atomic_read(&nmi_active) <= 0)
-               return;
-
-       on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
-       if (wd_ops)
-               wd_ops->unreserve();
-
-       BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
-       BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
-       /* are we already enabled */
-       if (atomic_read(&nmi_active) != 0)
-               return;
-
-       /* are we lapic aware */
-       if (!wd_ops)
-               return;
-       if (!wd_ops->reserve()) {
-               printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
-               return;
-       }
-
-       on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
-       touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
-       u64 counter_val;
-       unsigned int retval = hz;
-
-       /*
-        * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
-        * are writable, with higher bits sign extending from bit 31.
-        * So, we can only program the counter with 31 bit values and
-        * 32nd bit should be 1, for 33.. to be 1.
-        * Find the appropriate nmi_hz
-        */
-       counter_val = (u64)cpu_khz * 1000;
-       do_div(counter_val, retval);
-       if (counter_val > 0x7fffffffULL) {
-               u64 count = (u64)cpu_khz * 1000;
-               do_div(count, 0x7fffffffUL);
-               retval = count + 1;
-       }
-       return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
-                               const char *descr, unsigned nmi_hz)
-{
-       u64 count = (u64)cpu_khz * 1000;
-
-       do_div(count, nmi_hz);
-       if (descr)
-               pr_debug("setting %s to -0x%08Lx\n", descr, count);
-       wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
-                               const char *descr, unsigned nmi_hz)
-{
-       u64 count = (u64)cpu_khz * 1000;
-
-       do_div(count, nmi_hz);
-       if (descr)
-               pr_debug("setting %s to -0x%08Lx\n", descr, count);
-       wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE      (1 << 22)
-#define K7_EVNTSEL_INT         (1 << 20)
-#define K7_EVNTSEL_OS          (1 << 17)
-#define K7_EVNTSEL_USR         (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING   0x76
-#define K7_NMI_EVENT           K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       wrmsrl(perfctr_msr, 0UL);
-
-       evntsel = K7_EVNTSEL_INT
-               | K7_EVNTSEL_OS
-               | K7_EVNTSEL_USR
-               | K7_NMI_EVENT;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
-       /* initialize the wd struct before enabling */
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= K7_EVNTSEL_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-
-       return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
-       if (!reserve_perfctr_nmi(wd_ops->perfctr))
-               return 0;
-
-       if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
-               release_perfctr_nmi(wd_ops->perfctr);
-               return 0;
-       }
-       return 1;
-}
-
-static void single_msr_unreserve(void)
-{
-       release_evntsel_nmi(wd_ops->evntsel);
-       release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       /* start the cycle over again */
-       write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_k7_watchdog,
-       .rearm          = single_msr_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_K7_PERFCTR0,
-       .evntsel        = MSR_K7_EVNTSEL0,
-       .checkbit       = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE     (1 << 22)
-#define P6_EVNTSEL_INT         (1 << 20)
-#define P6_EVNTSEL_OS          (1 << 17)
-#define P6_EVNTSEL_USR         (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT           P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       /* KVM doesn't implement this MSR */
-       if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
-               return 0;
-
-       evntsel = P6_EVNTSEL_INT
-               | P6_EVNTSEL_OS
-               | P6_EVNTSEL_USR
-               | P6_NMI_EVENT;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-       write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
-       /* initialize the wd struct before enabling */
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= P6_EVNTSEL0_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-
-       return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       /*
-        * P6 based Pentium M need to re-unmask
-        * the apic vector but it doesn't hurt
-        * other P6 variant.
-        * ArchPerfom/Core Duo also needs this
-        */
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-
-       /* P6/ARCH_PERFMON has 32 bit counter write */
-       write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_p6_watchdog,
-       .rearm          = p6_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_P6_PERFCTR0,
-       .evntsel        = MSR_P6_EVNTSEL0,
-       .checkbit       = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL  (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N)        ((N) << 25)
-#define P4_ESCR_OS             (1 << 3)
-#define P4_ESCR_USR            (1 << 2)
-#define P4_CCCR_OVF_PMI0       (1 << 26)
-#define P4_CCCR_OVF_PMI1       (1 << 27)
-#define P4_CCCR_THRESHOLD(N)   ((N) << 20)
-#define P4_CCCR_COMPLEMENT     (1 << 19)
-#define P4_CCCR_COMPARE                (1 << 18)
-#define P4_CCCR_REQUIRED       (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE         (1 << 12)
-#define P4_CCCR_OVF            (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
-       MSR_P4_BPU_CCCR0,
-       MSR_P4_BPU_CCCR1,
-       MSR_P4_BPU_CCCR2,
-       MSR_P4_BPU_CCCR3,
-       MSR_P4_MS_CCCR0,
-       MSR_P4_MS_CCCR1,
-       MSR_P4_MS_CCCR2,
-       MSR_P4_MS_CCCR3,
-       MSR_P4_FLAME_CCCR0,
-       MSR_P4_FLAME_CCCR1,
-       MSR_P4_FLAME_CCCR2,
-       MSR_P4_FLAME_CCCR3,
-       MSR_P4_IQ_CCCR0,
-       MSR_P4_IQ_CCCR1,
-       MSR_P4_IQ_CCCR2,
-       MSR_P4_IQ_CCCR3,
-       MSR_P4_IQ_CCCR4,
-       MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
-       unsigned int perfctr_msr, evntsel_msr, cccr_msr;
-       unsigned int evntsel, cccr_val;
-       unsigned int misc_enable, dummy;
-       unsigned int ht_num;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
-       if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
-               return 0;
-
-#ifdef CONFIG_SMP
-       /* detect which hyperthread we are on */
-       if (smp_num_siblings == 2) {
-               unsigned int ebx, apicid;
-
-               ebx = cpuid_ebx(1);
-               apicid = (ebx >> 24) & 0xff;
-               ht_num = apicid & 1;
-       } else
-#endif
-               ht_num = 0;
-
-       /*
-        * performance counters are shared resources
-        * assign each hyperthread its own set
-        * (re-use the ESCR0 register, seems safe
-        * and keeps the cccr_val the same)
-        */
-       if (!ht_num) {
-               /* logical cpu 0 */
-               perfctr_msr = MSR_P4_IQ_PERFCTR0;
-               evntsel_msr = MSR_P4_CRU_ESCR0;
-               cccr_msr = MSR_P4_IQ_CCCR0;
-               cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
-               /*
-                * If we're on the kdump kernel or other situation, we may
-                * still have other performance counter registers set to
-                * interrupt and they'll keep interrupting forever because
-                * of the P4_CCCR_OVF quirk. So we need to ACK all the
-                * pending interrupts and disable all the registers here,
-                * before reenabling the NMI delivery. Refer to p4_rearm()
-                * about the P4_CCCR_OVF quirk.
-                */
-               if (reset_devices) {
-                       unsigned int low, high;
-                       int i;
-
-                       for (i = 0; i < P4_CONTROLS; i++) {
-                               rdmsr(p4_controls[i], low, high);
-                               low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
-                               wrmsr(p4_controls[i], low, high);
-                       }
-               }
-       } else {
-               /* logical cpu 1 */
-               perfctr_msr = MSR_P4_IQ_PERFCTR1;
-               evntsel_msr = MSR_P4_CRU_ESCR0;
-               cccr_msr = MSR_P4_IQ_CCCR1;
-
-               /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
-               if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
-                       cccr_val = P4_CCCR_OVF_PMI0;
-               else
-                       cccr_val = P4_CCCR_OVF_PMI1;
-               cccr_val |= P4_CCCR_ESCR_SELECT(4);
-       }
-
-       evntsel = P4_ESCR_EVENT_SELECT(0x3F)
-               | P4_ESCR_OS
-               | P4_ESCR_USR;
-
-       cccr_val |= P4_CCCR_THRESHOLD(15)
-                | P4_CCCR_COMPLEMENT
-                | P4_CCCR_COMPARE
-                | P4_CCCR_REQUIRED;
-
-       wrmsr(evntsel_msr, evntsel, 0);
-       wrmsr(cccr_msr, cccr_val, 0);
-       write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = cccr_msr;
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       cccr_val |= P4_CCCR_ENABLE;
-       wrmsr(cccr_msr, cccr_val, 0);
-       return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       wrmsr(wd->cccr_msr, 0, 0);
-       wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
-       if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
-               return 0;
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
-               goto fail1;
-#endif
-       if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
-               goto fail2;
-       /* RED-PEN why is ESCR1 not reserved here? */
-       return 1;
- fail2:
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1)
-               release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
-       release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-       return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
-       if (smp_num_siblings > 1)
-               release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
-       release_evntsel_nmi(MSR_P4_CRU_ESCR0);
-       release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
-       unsigned dummy;
-       /*
-        * P4 quirks:
-        * - An overflown perfctr will assert its interrupt
-        *   until the OVF flag in its CCCR is cleared.
-        * - LVTPC is masked on interrupt and must be
-        *   unmasked by the LVTPC handler.
-        */
-       rdmsrl(wd->cccr_msr, dummy);
-       dummy &= ~P4_CCCR_OVF;
-       wrmsrl(wd->cccr_msr, dummy);
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       /* start the cycle over again */
-       write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
-       .reserve        = p4_reserve,
-       .unreserve      = p4_unreserve,
-       .setup          = setup_p4_watchdog,
-       .rearm          = p4_rearm,
-       .stop           = stop_p4_watchdog,
-       /* RED-PEN this is wrong for the other sibling */
-       .perfctr        = MSR_P4_BPU_PERFCTR0,
-       .evntsel        = MSR_P4_BSU_ESCR0,
-       .checkbit       = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL     ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK   ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
-       unsigned int ebx;
-       union cpuid10_eax eax;
-       unsigned int unused;
-       unsigned int perfctr_msr, evntsel_msr;
-       unsigned int evntsel;
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
-       /*
-        * Check whether the Architectural PerfMon supports
-        * Unhalted Core Cycles Event or not.
-        * NOTE: Corresponding bit = 0 in ebx indicates event present.
-        */
-       cpuid(10, &(eax.full), &ebx, &unused, &unused);
-       if ((eax.split.mask_length <
-                       (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
-           (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
-               return 0;
-
-       perfctr_msr = wd_ops->perfctr;
-       evntsel_msr = wd_ops->evntsel;
-
-       wrmsrl(perfctr_msr, 0UL);
-
-       evntsel = ARCH_PERFMON_EVENTSEL_INT
-               | ARCH_PERFMON_EVENTSEL_OS
-               | ARCH_PERFMON_EVENTSEL_USR
-               | ARCH_PERFMON_NMI_EVENT_SEL
-               | ARCH_PERFMON_NMI_EVENT_UMASK;
-
-       /* setup the timer */
-       wrmsr(evntsel_msr, evntsel, 0);
-       nmi_hz = adjust_for_32bit_ctr(nmi_hz);
-       write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
-       wd->perfctr_msr = perfctr_msr;
-       wd->evntsel_msr = evntsel_msr;
-       wd->cccr_msr = 0;  /* unused */
-
-       /* ok, everything is initialized, announce that we're set */
-       cpu_nmi_set_wd_enabled();
-
-       apic_write(APIC_LVTPC, APIC_DM_NMI);
-       evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
-       wrmsr(evntsel_msr, evntsel, 0);
-       intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
-       return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
-       .reserve        = single_msr_reserve,
-       .unreserve      = single_msr_unreserve,
-       .setup          = setup_intel_arch_watchdog,
-       .rearm          = p6_rearm,
-       .stop           = single_msr_stop_watchdog,
-       .perfctr        = MSR_ARCH_PERFMON_PERFCTR1,
-       .evntsel        = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
-       switch (boot_cpu_data.x86_vendor) {
-       case X86_VENDOR_AMD:
-               if (boot_cpu_data.x86 == 6 ||
-                   (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
-                       wd_ops = &k7_wd_ops;
-               return;
-       case X86_VENDOR_INTEL:
-               /* Work around where perfctr1 doesn't have a working enable
-                * bit as described in the following errata:
-                * AE49 Core Duo and Intel Core Solo 65 nm
-                * AN49 Intel Pentium Dual-Core
-                * AF49 Dual-Core Intel Xeon Processor LV
-                */
-               if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
-                   ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
-                    boot_cpu_data.x86_mask == 4))) {
-                       intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
-                       intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
-               }
-               if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-                       wd_ops = &intel_arch_wd_ops;
-                       break;
-               }
-               switch (boot_cpu_data.x86) {
-               case 6:
-                       if (boot_cpu_data.x86_model > 13)
-                               return;
-
-                       wd_ops = &p6_wd_ops;
-                       break;
-               case 15:
-                       wd_ops = &p4_wd_ops;
-                       break;
-               default:
-                       return;
-               }
-               break;
-       }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
-       if (!wd_ops) {
-               probe_nmi_watchdog();
-               if (!wd_ops) {
-                       printk(KERN_INFO "NMI watchdog: CPU not supported\n");
-                       return -1;
-               }
-
-               if (!wd_ops->reserve()) {
-                       printk(KERN_ERR
-                               "NMI watchdog: cannot reserve perfctrs\n");
-                       return -1;
-               }
-       }
-
-       if (!(wd_ops->setup(nmi_hz))) {
-               printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
-                      raw_smp_processor_id());
-               return -1;
-       }
-
-       return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
-       if (wd_ops)
-               wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
-           wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
-               hz = adjust_for_32bit_ctr(hz);
-       return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
-       struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-       u64 ctr;
-
-       rdmsrl(wd->perfctr_msr, ctr);
-       if (ctr & wd_ops->checkbit) /* perfctr still running? */
-               return 0;
-
-       wd_ops->rearm(wd, nmi_hz);
-       return 1;
-}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c

index 6e8752c1bd5241fc9e7e63ee088f06c84d0526fb..8474c998cbd40d2f3481f87879f5a42d1f105e52 100644 (file)
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
  
  void
  show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
+               unsigned long *stack, char *log_lvl)
  {
         printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+       dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
  }
  
  void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
+               unsigned long *stack)
  {
-       show_trace_log_lvl(task, regs, stack, bp, "");
+       show_trace_log_lvl(task, regs, stack, "");
  }
  
  void show_stack(struct task_struct *task, unsigned long *sp)
  {
-       show_stack_log_lvl(task, NULL, sp, 0, "");
+       show_stack_log_lvl(task, NULL, sp, "");
  }
  
  /*
@@ -210,7 +210,7 @@ void dump_stack(void)
                 init_utsname()->release,
                 (int)strcspn(init_utsname()->version, " "),
                 init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
+       show_trace(NULL, NULL, &stack);
  }
  EXPORT_SYMBOL(dump_stack);
  
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c

index 1bc7f75a5bdaf823999f7b90271808819b6f3ad3..74cc1eda384b8d26437a10fa55e3f64cfcf4545a 100644 (file)
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,11 +17,12 @@
  #include <asm/stacktrace.h>
  
  
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+               struct pt_regs *regs, unsigned long *stack,
                 const struct stacktrace_ops *ops, void *data)
  {
         int graph = 0;
+       unsigned long bp;
  
         if (!task)
                 task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                         stack = (unsigned long *)task->thread.sp;
         }
  
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp) {
-               if (task == current) {
-                       /* Grab bp right from our regs */
-                       get_bp(bp);
-               } else {
-                       /* bp is the last reg pushed by switch_to */
-                       bp = *(unsigned long *) task->thread.sp;
-               }
-       }
-#endif
-
+       bp = stack_frame(task, regs);
         for (;;) {
                 struct thread_info *context;
  
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
  
  void
  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+                  unsigned long *sp, char *log_lvl)
  {
         unsigned long *stack;
         int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                 touch_nmi_watchdog();
         }
         printk(KERN_CONT "\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
  }
  
  
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
                 u8 *ip;
  
                 printk(KERN_EMERG "Stack:\n");
-               show_stack_log_lvl(NULL, regs, &regs->sp,
-                               0, KERN_EMERG);
+               show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
  
                 printk(KERN_EMERG "Code: ");
  
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c

index 6a340485249a965f29686ac84117458d6cb72be6..64101335de19aad09ec03d75dea8260b28051747 100644 (file)
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
   * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
   */
  
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+               struct pt_regs *regs, unsigned long *stack,
                 const struct stacktrace_ops *ops, void *data)
  {
         const unsigned cpu = get_cpu();
@@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
         unsigned used = 0;
         struct thread_info *tinfo;
         int graph = 0;
+       unsigned long bp;
  
         if (!task)
                 task = current;
@@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                         stack = (unsigned long *)task->thread.sp;
         }
  
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp) {
-               if (task == current) {
-                       /* Grab bp right from our regs */
-                       get_bp(bp);
-               } else {
-                       /* bp is the last reg pushed by switch_to */
-                       bp = *(unsigned long *) task->thread.sp;
-               }
-       }
-#endif
-
+       bp = stack_frame(task, regs);
         /*
          * Print function call entries in all stacks, starting at the
          * current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
  
  void
  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+                  unsigned long *sp, char *log_lvl)
  {
         unsigned long *irq_stack_end;
         unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
         preempt_enable();
  
         printk(KERN_CONT "\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
  }
  
  void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
  
                 printk(KERN_EMERG "Stack:\n");
                 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-                               regs->bp, KERN_EMERG);
+                                  KERN_EMERG);
  
                 printk(KERN_EMERG "Code: ");
  
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c

index 4572f25f93255f8bb4a5e5158d3c9949f912b657..cd28a350f7f933162a6fa9bd0de08c64e22495f2 100644 (file)
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
                 if (!strncmp(buf, "xen", 3))
                         early_console_register(&xenboot_console, keep);
  #endif
-#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+#ifdef CONFIG_EARLY_PRINTK_MRST
                 if (!strncmp(buf, "mrst", 4)) {
                         mrst_early_console_init();
                         early_console_register(&early_mrst_console, keep);
@@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf)
                         hsu_early_console_init();
                         early_console_register(&early_hsu_console, keep);
                 }
-
  #endif
                 buf++;
         }
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c

deleted file mode 100644 (file)

index 65df603..0000000
--- a/arch/x86/kernel/early_printk_mrst.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * early_printk_mrst.c - early consoles for Intel MID platforms
- *
- * Copyright (c) 2008-2010, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-/*
- * This file implements two early consoles named mrst and hsu.
- * mrst is based on Maxim3110 spi-uart device, it exists in both
- * Moorestown and Medfield platforms, while hsu is based on a High
- * Speed UART device which only exists in the Medfield platform
- */
-
-#include <linux/serial_reg.h>
-#include <linux/serial_mfd.h>
-#include <linux/kmsg_dump.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
-
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/mrst.h>
-
-#define MRST_SPI_TIMEOUT               0x200000
-#define MRST_REGBASE_SPI0              0xff128000
-#define MRST_REGBASE_SPI1              0xff128400
-#define MRST_CLK_SPI0_REG              0xff11d86c
-
-/* Bit fields in CTRLR0 */
-#define SPI_DFS_OFFSET                 0
-
-#define SPI_FRF_OFFSET                 4
-#define SPI_FRF_SPI                    0x0
-#define SPI_FRF_SSP                    0x1
-#define SPI_FRF_MICROWIRE              0x2
-#define SPI_FRF_RESV                   0x3
-
-#define SPI_MODE_OFFSET                        6
-#define SPI_SCPH_OFFSET                        6
-#define SPI_SCOL_OFFSET                        7
-#define SPI_TMOD_OFFSET                        8
-#define        SPI_TMOD_TR                     0x0             /* xmit & recv */
-#define SPI_TMOD_TO                    0x1             /* xmit only */
-#define SPI_TMOD_RO                    0x2             /* recv only */
-#define SPI_TMOD_EPROMREAD             0x3             /* eeprom read mode */
-
-#define SPI_SLVOE_OFFSET               10
-#define SPI_SRL_OFFSET                 11
-#define SPI_CFS_OFFSET                 12
-
-/* Bit fields in SR, 7 bits */
-#define SR_MASK                                0x7f            /* cover 7 bits */
-#define SR_BUSY                                (1 << 0)
-#define SR_TF_NOT_FULL                 (1 << 1)
-#define SR_TF_EMPT                     (1 << 2)
-#define SR_RF_NOT_EMPT                 (1 << 3)
-#define SR_RF_FULL                     (1 << 4)
-#define SR_TX_ERR                      (1 << 5)
-#define SR_DCOL                                (1 << 6)
-
-struct dw_spi_reg {
-       u32     ctrl0;
-       u32     ctrl1;
-       u32     ssienr;
-       u32     mwcr;
-       u32     ser;
-       u32     baudr;
-       u32     txfltr;
-       u32     rxfltr;
-       u32     txflr;
-       u32     rxflr;
-       u32     sr;
-       u32     imr;
-       u32     isr;
-       u32     risr;
-       u32     txoicr;
-       u32     rxoicr;
-       u32     rxuicr;
-       u32     msticr;
-       u32     icr;
-       u32     dmacr;
-       u32     dmatdlr;
-       u32     dmardlr;
-       u32     idr;
-       u32     version;
-
-       /* Currently operates as 32 bits, though only the low 16 bits matter */
-       u32     dr;
-} __packed;
-
-#define dw_readl(dw, name)             __raw_readl(&(dw)->name)
-#define dw_writel(dw, name, val)       __raw_writel((val), &(dw)->name)
-
-/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
-static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
-
-static u32 *pclk_spi0;
-/* Always contains an accessable address, start with 0 */
-static struct dw_spi_reg *pspi;
-
-static struct kmsg_dumper dw_dumper;
-static int dumper_registered;
-
-static void dw_kmsg_dump(struct kmsg_dumper *dumper,
-                       enum kmsg_dump_reason reason,
-                       const char *s1, unsigned long l1,
-                       const char *s2, unsigned long l2)
-{
-       int i;
-
-       /* When run to this, we'd better re-init the HW */
-       mrst_early_console_init();
-
-       for (i = 0; i < l1; i++)
-               early_mrst_console.write(&early_mrst_console, s1 + i, 1);
-       for (i = 0; i < l2; i++)
-               early_mrst_console.write(&early_mrst_console, s2 + i, 1);
-}
-
-/* Set the ratio rate to 115200, 8n1, IRQ disabled */
-static void max3110_write_config(void)
-{
-       u16 config;
-
-       config = 0xc001;
-       dw_writel(pspi, dr, config);
-}
-
-/* Translate char to a eligible word and send to max3110 */
-static void max3110_write_data(char c)
-{
-       u16 data;
-
-       data = 0x8000 | c;
-       dw_writel(pspi, dr, data);
-}
-
-void mrst_early_console_init(void)
-{
-       u32 ctrlr0 = 0;
-       u32 spi0_cdiv;
-       u32 freq; /* Freqency info only need be searched once */
-
-       /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
-       pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-                                                       MRST_CLK_SPI0_REG);
-       spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
-       freq = 100000000 / (spi0_cdiv + 1);
-
-       if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
-               mrst_spi_paddr = MRST_REGBASE_SPI1;
-
-       pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-                                               mrst_spi_paddr);
-
-       /* Disable SPI controller */
-       dw_writel(pspi, ssienr, 0);
-
-       /* Set control param, 8 bits, transmit only mode */
-       ctrlr0 = dw_readl(pspi, ctrl0);
-
-       ctrlr0 &= 0xfcc0;
-       ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
-                     | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
-       dw_writel(pspi, ctrl0, ctrlr0);
-
-       /*
-        * Change the spi0 clk to comply with 115200 bps, use 100000 to
-        * calculate the clk dividor to make the clock a little slower
-        * than real baud rate.
-        */
-       dw_writel(pspi, baudr, freq/100000);
-
-       /* Disable all INT for early phase */
-       dw_writel(pspi, imr, 0x0);
-
-       /* Set the cs to spi-uart */
-       dw_writel(pspi, ser, 0x2);
-
-       /* Enable the HW, the last step for HW init */
-       dw_writel(pspi, ssienr, 0x1);
-
-       /* Set the default configuration */
-       max3110_write_config();
-
-       /* Register the kmsg dumper */
-       if (!dumper_registered) {
-               dw_dumper.dump = dw_kmsg_dump;
-               kmsg_dump_register(&dw_dumper);
-               dumper_registered = 1;
-       }
-}
-
-/* Slave select should be called in the read/write function */
-static void early_mrst_spi_putc(char c)
-{
-       unsigned int timeout;
-       u32 sr;
-
-       timeout = MRST_SPI_TIMEOUT;
-       /* Early putc needs to make sure the TX FIFO is not full */
-       while (--timeout) {
-               sr = dw_readl(pspi, sr);
-               if (!(sr & SR_TF_NOT_FULL))
-                       cpu_relax();
-               else
-                       break;
-       }
-
-       if (!timeout)
-               pr_warning("MRST earlycon: timed out\n");
-       else
-               max3110_write_data(c);
-}
-
-/* Early SPI only uses polling mode */
-static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
-{
-       int i;
-
-       for (i = 0; i < n && *str; i++) {
-               if (*str == '\n')
-                       early_mrst_spi_putc('\r');
-               early_mrst_spi_putc(*str);
-               str++;
-       }
-}
-
-struct console early_mrst_console = {
-       .name =         "earlymrst",
-       .write =        early_mrst_spi_write,
-       .flags =        CON_PRINTBUFFER,
-       .index =        -1,
-};
-
-/*
- * Following is the early console based on Medfield HSU (High
- * Speed UART) device.
- */
-#define HSU_PORT2_PADDR                0xffa28180
-
-static void __iomem *phsu;
-
-void hsu_early_console_init(void)
-{
-       u8 lcr;
-
-       phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-                                                       HSU_PORT2_PADDR);
-
-       /* Disable FIFO */
-       writeb(0x0, phsu + UART_FCR);
-
-       /* Set to default 115200 bps, 8n1 */
-       lcr = readb(phsu + UART_LCR);
-       writeb((0x80 | lcr), phsu + UART_LCR);
-       writeb(0x18, phsu + UART_DLL);
-       writeb(lcr,  phsu + UART_LCR);
-       writel(0x3600, phsu + UART_MUL*4);
-
-       writeb(0x8, phsu + UART_MCR);
-       writeb(0x7, phsu + UART_FCR);
-       writeb(0x3, phsu + UART_LCR);
-
-       /* Clear IRQ status */
-       readb(phsu + UART_LSR);
-       readb(phsu + UART_RX);
-       readb(phsu + UART_IIR);
-       readb(phsu + UART_MSR);
-
-       /* Enable FIFO */
-       writeb(0x7, phsu + UART_FCR);
-}
-
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
-static void early_hsu_putc(char ch)
-{
-       unsigned int timeout = 10000; /* 10ms */
-       u8 status;
-
-       while (--timeout) {
-               status = readb(phsu + UART_LSR);
-               if (status & BOTH_EMPTY)
-                       break;
-               udelay(1);
-       }
-
-       /* Only write the char when there was no timeout */
-       if (timeout)
-               writeb(ch, phsu + UART_TX);
-}
-
-static void early_hsu_write(struct console *con, const char *str, unsigned n)
-{
-       int i;
-
-       for (i = 0; i < n && *str; i++) {
-               if (*str == '\n')
-                       early_hsu_putc('\r');
-               early_hsu_putc(*str);
-               str++;
-       }
-}
-
-struct console early_hsu_console = {
-       .name =         "earlyhsu",
-       .write =        early_hsu_write,
-       .flags =        CON_PRINTBUFFER,
-       .index =        -1,
-};
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c

index 3afb33f14d2d2c86a3c961d87aaae531d2631ac8..298448656b6079d074232518cb16e50895b4a5b8 100644 (file)
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
  #include <linux/sched.h>
  #include <linux/init.h>
  #include <linux/list.h>
+#include <linux/module.h>
  
  #include <trace/syscall.h>
  
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
  int ftrace_arch_code_modify_prepare(void)
  {
         set_kernel_text_rw();
+       set_all_modules_text_rw();
         modifying_code = 1;
         return 0;
  }
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
  int ftrace_arch_code_modify_post_process(void)
  {
         modifying_code = 0;
+       set_all_modules_text_ro();
         set_kernel_text_ro();
         return 0;
  }
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c

index 763310165fa0d1b0e4bf1891632a729e289980e8..7f138b3c3c52cf2d6790d8308fdc9a6dae8b41b7 100644 (file)
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -61,6 +61,9 @@ void __init i386_start_kernel(void)
         case X86_SUBARCH_MRST:
                 x86_mrst_early_setup();
                 break;
+       case X86_SUBARCH_CE4100:
+               x86_ce4100_early_setup();
+               break;
         default:
                 i386_default_early_setup();
                 break;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S

index c0dbd9ac24f0d5cf7e87f8f0439275656b877f73..5707fc8a7a4bcc8527178cb52ca38afad747ee5b 100644 (file)
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -316,6 +316,10 @@ ENTRY(startup_32_smp)
         subl $0x80000001, %eax
         cmpl $(0x8000ffff-0x80000001), %eax
         ja 6f
+
+       /* Clear bogus XD_DISABLE bits */
+       call verify_cpu
+
         mov $0x80000001, %eax
         cpuid
         /* Execute Disable bit supported? */
@@ -611,6 +615,8 @@ ignore_int:
  #endif
         iret
  
+#include "verify_cpu.S"
+
         __REFDATA
  .align 4
  ENTRY(initial_code)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c

index 1cbd54c0df99189548a3a03f40fbb75a1703475a..5940282bd2f94ed886226bc717c189e593adab50 100644 (file)
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
  {
         struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
  
+       /* This is possible if op is under delayed unoptimizing */
+       if (kprobe_disabled(&op->kp))
+               return;
+
         preempt_disable();
         if (kprobe_running()) {
                 kprobes_inc_nmissed_count(&op->kp);
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
         return 0;
  }
  
-/* Replace a breakpoint (int3) with a relative jump.  */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+       u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+                                           u8 *insn_buf,
+                                           struct optimized_kprobe *op)
  {
-       unsigned char jmp_code[RELATIVEJUMP_SIZE];
         s32 rel = (s32)((long)op->optinsn.insn -
                         ((long)op->kp.addr + RELATIVEJUMP_SIZE));
  
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
         memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
                RELATIVE_ADDR_SIZE);
  
-       jmp_code[0] = RELATIVEJUMP_OPCODE;
-       *(s32 *)(&jmp_code[1]) = rel;
+       insn_buf[0] = RELATIVEJUMP_OPCODE;
+       *(s32 *)(&insn_buf[1]) = rel;
+
+       tprm->addr = op->kp.addr;
+       tprm->opcode = insn_buf;
+       tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must hold kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+       struct optimized_kprobe *op, *tmp;
+       int c = 0;
+
+       list_for_each_entry_safe(op, tmp, oplist, list) {
+               WARN_ON(kprobe_disabled(&op->kp));
+               /* Setup param */
+               setup_optimize_kprobe(&jump_poke_params[c],
+                                     jump_poke_bufs[c].buf, op);
+               list_del_init(&op->list);
+               if (++c >= MAX_OPTIMIZE_PROBES)
+                       break;
+       }
  
         /*
          * text_poke_smp doesn't support NMI/MCE code modifying.
          * However, since kprobes itself also doesn't support NMI/MCE
          * code probing, it's not a problem.
          */
-       text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
-       return 0;
+       text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+                                             u8 *insn_buf,
+                                             struct optimized_kprobe *op)
+{
+       /* Set int3 to first byte for kprobes */
+       insn_buf[0] = BREAKPOINT_INSTRUCTION;
+       memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+       tprm->addr = op->kp.addr;
+       tprm->opcode = insn_buf;
+       tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Put int3 + the saved bytes back for up to MAX_OPTIMIZE_PROBES ops on oplist,
+ * moving each handled op to done_list. Caller must hold kprobe_mutex.
+ */
+void __kprobes arch_unoptimize_kprobes(struct list_head *oplist,
+                                      struct list_head *done_list)
+{
+       struct optimized_kprobe *op, *tmp;
+       int c = 0;
+
+       list_for_each_entry_safe(op, tmp, oplist, list) {
+               /* Fill the c-th poke request, then hand op to done_list */
+               setup_unoptimize_kprobe(&jump_poke_params[c],
+                                       jump_poke_bufs[c].buf, op);
+               list_move(&op->list, done_list);
+               if (++c >= MAX_OPTIMIZE_PROBES)
+                       break;
+       }
+
+       /*
+        * text_poke_smp doesn't support NMI/MCE code modifying.
+        * However, since kprobes itself also doesn't support NMI/MCE
+        * code probing, it's not a problem.
+        */
+       text_poke_smp_batch(jump_poke_params, c);
+}
  
  /* Replace a relative jump with a breakpoint (int3).  */
@@ -1453,11 +1526,35 @@ static int  __kprobes setup_detour_execution(struct kprobe *p,
         }
         return 0;
  }
+
+static int __kprobes init_poke_params(void)
+{
+       /* Allocate code buffer and parameter array */
+       jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+                                MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+       if (!jump_poke_bufs)
+               return -ENOMEM;
+
+       jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+                                  MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+       if (!jump_poke_params) {
+               kfree(jump_poke_bufs);
+               jump_poke_bufs = NULL;
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+#else  /* !CONFIG_OPTPROBES */
+static int __kprobes init_poke_params(void)
+{
+       return 0;
+}
  #endif
  
  int __init arch_init_kprobes(void)
  {
-       return 0;
+       return init_poke_params();
  }
  
  int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c

index ce0cb4721c9ac9eec8869e64c1fcd1a1ef0fd379..0fe6d1a66c38cf0aaea3383ac000eefbb5d34fca 100644 (file)
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
         return 0;
  }
  
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
-       memcpy(to, from, n);
-       return 0;
-}
-
  static void *
  get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
  {
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
         u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
         void *mc;
  
-       if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
-               return NULL;
+       get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
  
         if (section_hdr[0] != UCODE_UCODE_TYPE) {
                 pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
                 return NULL;
         }
  
-       mc = vmalloc(UCODE_MAX_SIZE);
-       if (mc) {
-               memset(mc, 0, UCODE_MAX_SIZE);
-               if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
-                                  total_size)) {
-                       vfree(mc);
-                       mc = NULL;
-               } else
-                       *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
-       }
+       mc = vzalloc(UCODE_MAX_SIZE);
+       if (!mc)
+               return NULL;
+
+       get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
+       *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+
         return mc;
  }
  
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
         unsigned int *buf_pos = (unsigned int *)container_hdr;
         unsigned long size;
  
-       if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
-               return 0;
+       get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
  
         size = buf_pos[2];
  
@@ -219,10 +208,7 @@ static int install_equiv_cpu_table(const u8 *buf)
         }
  
         buf += UCODE_CONTAINER_HEADER_SIZE;
-       if (get_ucode_data(equiv_cpu_table, buf, size)) {
-               vfree(equiv_cpu_table);
-               return 0;
-       }
+       get_ucode_data(equiv_cpu_table, buf, size);
  
         return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
  }
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c

index ba0f0ca9f280bb0473470fdb71a3fa04faec00f0..c01ffa5b9b87e509da797e359577bbb84463ce5f 100644 (file)
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -143,7 +143,7 @@ static void flush_gart(void)
  
         spin_lock_irqsave(&iommu_bitmap_lock, flags);
         if (need_flush) {
-               k8_flush_garts();
+               amd_flush_garts();
                 need_flush = false;
         }
         spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
  {
         int i;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return;
  
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
  
                 enable_gart_translation(dev, __pa(agp_gatt_table));
         }
  
         /* Flush the GART-TLB to remove stale entries */
-       k8_flush_garts();
+       amd_flush_garts();
  }
  
  /*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
         if (!fix_up_north_bridges)
                 return;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return;
  
         pr_info("PCI-DMA: Restoring GART aperture settings\n");
  
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
  
                 /*
                  * Don't enable translations just yet.  That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
   * Private Northbridge GATT initialization in case we cannot use the
   * AGP driver for some reason.
   */
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
  {
         unsigned aper_size, gatt_size, new_aper_size;
         unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
  
         aper_size = aper_base = info->aper_size = 0;
         dev = NULL;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               dev = node_to_amd_nb(i)->misc;
                 new_aper_base = read_aperture(dev, &new_aper_size);
                 if (!new_aper_base)
                         goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
         if (!no_agp)
                 return;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return;
  
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                 u32 ctl;
  
-               dev = k8_northbridges.nb_misc[i];
+               dev = node_to_amd_nb(i)->misc;
                 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
  
                 ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
         unsigned long scratch;
         long i;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return 0;
  
  #ifndef CONFIG_AGP_AMD64
         no_agp = 1;
  #else
         /* Makefile puts PCI initialization via subsys_initcall first. */
-       /* Add other K8 AGP bridge drivers here */
+       /* Add other AMD AGP bridge drivers here */
         no_agp = no_agp ||
                 (agp_amd64_init() < 0) ||
                 (agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
         if (no_iommu ||
             (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
             !gart_iommu_aperture ||
-           (no_agp && init_k8_gatt(&info) < 0)) {
+           (no_agp && init_amd_gatt(&info) < 0)) {
                 if (max_pfn > MAX_DMA32_PFN) {
                         pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
                         pr_warning("falling back to iommu=soft.\n");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c

index 57d1868a86aadc060bc2260b34139809a98ffab5..c852041bfc3d5b70e792dad0cfd42fdd467c3f38 100644 (file)
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -91,8 +91,7 @@ void exit_thread(void)
  void show_regs(struct pt_regs *regs)
  {
         show_registers(regs);
-       show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
-                  regs->bp);
+       show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
  }
  
  void show_regs_common(void)
@@ -374,6 +373,7 @@ void default_idle(void)
  {
         if (hlt_use_halt()) {
                 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+               trace_cpu_idle(1, smp_processor_id());
                 current_thread_info()->status &= ~TS_POLLING;
                 /*
                  * TS_POLLING-cleared state must be visible before we
@@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
  {
         trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+       trace_cpu_idle((ax>>4)+1, smp_processor_id());
         if (!need_resched()) {
                 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                         clflush((void *)&current_thread_info()->flags);
@@ -460,6 +461,7 @@ static void mwait_idle(void)
  {
         if (!need_resched()) {
                 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+               trace_cpu_idle(1, smp_processor_id());
                 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
                         clflush((void *)&current_thread_info()->flags);
  
@@ -481,10 +483,12 @@ static void mwait_idle(void)
  static void poll_idle(void)
  {
         trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+       trace_cpu_idle(0, smp_processor_id());
         local_irq_enable();
         while (!need_resched())
                 cpu_relax();
-       trace_power_end(0);
+       trace_power_end(smp_processor_id());
+       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
  }
  
  /*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c

index 96586c3cbbbf88dd6479ed250b24ea1112154a22..4b9befa0e347f6f402238d28fea4f9969563a6dd 100644 (file)
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -113,8 +113,8 @@ void cpu_idle(void)
                         stop_critical_timings();
                         pm_idle();
                         start_critical_timings();
-
                         trace_power_end(smp_processor_id());
+                       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
                 }
                 tick_nohz_restart_sched_tick();
                 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c

index b3d7a3a04f389d9626837a1e776c5106e821f7dc..4c818a73839685c3b1083d2170f02e6758a3bed7 100644 (file)
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -142,6 +142,8 @@ void cpu_idle(void)
                         start_critical_timings();
  
                         trace_power_end(smp_processor_id());
+                       trace_cpu_idle(PWR_EVENT_EXIT,
+                                      smp_processor_id());
  
                         /* In many cases the interrupt that ended idle
                            has already called exit_idle. But some idle
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c

index fda313ebbb03dfc98d5d0f953a1b99846e25dc9a..c8e41e90f59ceb9da7be768fcfe197a83602742e 100644 (file)
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
         outb(1, 0x92);
  }
  
+static void ce4100_reset(struct pci_dev *dev)
+{
+       int i;
+
+       for (i = 0; i < 10; i++) {
+               outb(0x2, 0xcf9);
+               udelay(50);
+       }
+}
+
  struct device_fixup {
         unsigned int vendor;
         unsigned int device;
         void (*reboot_fixup)(struct pci_dev *);
  };
  
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100     0x0708
+
  static const struct device_fixup fixups_table[] = {
  { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
  { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
  { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
  { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
  };
  
  /*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c

index a0f52af256a037eaca0fce91e00caad47018d65d..d3cfe26c0252ab24aaae1481c8c297ac096e6c6d 100644 (file)
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -705,7 +705,7 @@ static u64 __init get_max_mapped(void)
  void __init setup_arch(char **cmdline_p)
  {
         int acpi = 0;
-       int k8 = 0;
+       int amd = 0;
         unsigned long flags;
  
  #ifdef CONFIG_X86_32
@@ -991,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
         acpi = acpi_numa_init();
  #endif
  
-#ifdef CONFIG_K8_NUMA
+#ifdef CONFIG_AMD_NUMA
         if (!acpi)
-               k8 = !k8_numa_init(0, max_pfn);
+               amd = !amd_numa_init(0, max_pfn);
  #endif
  
-       initmem_init(0, max_pfn, acpi, k8);
+       initmem_init(0, max_pfn, acpi, amd);
         memblock_find_dma_reserve();
         dma32_reserve_bootmem();
  
@@ -1045,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
  #endif
  
         init_apic_mappings();
-       ioapic_init_mappings();
-
-       /* need to wait for io_apic is mapped */
-       probe_nr_irqs_gsi();
+       ioapic_and_gsi_init();
  
         kvm_guest_init();
  
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 083e99d1b7df2aba236563467f47ebb21a09943d..ee886fe10ef4eb8515ae1d2431c0ce8ad3e8dd89 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
          */
         smp_store_cpu_info(cpuid);
  
+       /*
+        * This must be done before setting cpu_online_mask
+        * or calling notify_cpu_starting.
+        */
+       set_cpu_sibling_map(raw_smp_processor_id());
+       wmb();
+
         notify_cpu_starting(cpuid);
  
         /*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
          */
         check_tsc_sync_target();
  
-       if (nmi_watchdog == NMI_IO_APIC) {
-               legacy_pic->mask(0);
-               enable_NMI_through_LVT0();
-               legacy_pic->unmask(0);
-       }
-
-       /* This must be done before setting cpu_online_mask */
-       set_cpu_sibling_map(raw_smp_processor_id());
-       wmb();
-
         /*
          * We need to hold call_lock, so there is no inconsistency
          * between the time smp_call_function() determines number of
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
                 printk(KERN_INFO "SMP mode deactivated.\n");
                 smpboot_clear_io_apic();
  
-               localise_nmi_watchdog();
-
                 connect_bsp_APIC();
                 setup_local_APIC();
                 end_local_APIC_setup();
@@ -1166,6 +1161,20 @@ out:
         preempt_enable();
  }
  
+void arch_disable_nonboot_cpus_begin(void)
+{
+       /*
+        * Skip the SMP alternatives switch while disable_nonboot_cpus() runs:
+        * in the suspend path we will be back in SMP mode shortly anyway.
+        */
+       skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+       skip_smp_alternatives = false;
+}
+
  void arch_enable_nonboot_cpus_begin(void)
  {
         set_mtrr_aps_delayed_init();
@@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
  #ifdef CONFIG_X86_IO_APIC
         setup_ioapic_dest();
  #endif
-       check_nmi_watchdog();
         mtrr_aps_init();
  }
  
@@ -1341,8 +1349,6 @@ int native_cpu_disable(void)
         if (cpu == 0)
                 return -EBUSY;
  
-       if (nmi_watchdog == NMI_LOCAL_APIC)
-               stop_apic_nmi_watchdog(NULL);
         clear_local_APIC();
  
         cpu_disable_common();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c

index b53c525368a75cf07489b0327de138bfab5b16d5..938c8e10a19abeae0e2c814e182a0920ff0328f9 100644 (file)
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
   */
  void save_stack_trace(struct stack_trace *trace)
  {
-       dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+       dump_trace(current, NULL, NULL, &save_stack_ops, trace);
         if (trace->nr_entries < trace->max_entries)
                 trace->entries[trace->nr_entries++] = ULONG_MAX;
  }
  EXPORT_SYMBOL_GPL(save_stack_trace);
  
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
  {
-       dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+       dump_trace(current, regs, NULL, &save_stack_ops, trace);
         if (trace->nr_entries < trace->max_entries)
                 trace->entries[trace->nr_entries++] = ULONG_MAX;
  }
  
  void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
  {
-       dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+       dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
         if (trace->nr_entries < trace->max_entries)
                 trace->entries[trace->nr_entries++] = ULONG_MAX;
  }
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c

index fb5cc5e14cfafb6ed62c4de2929b3bfa0a2e91e9..25a28a245937989d2de9abfb94d96835a1b01615 100644 (file)
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,10 +22,6 @@
  #include <asm/hpet.h>
  #include <asm/time.h>
  
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
  #ifdef CONFIG_X86_64
  volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
  #endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
         /* Keep nmi watchdog up to date */
         inc_irq_stat(irq0_irqs);
  
-       /* Optimized out for !IO_APIC and x86_64 */
-       if (timer_ack) {
-               /*
-                * Subtle, when I/O APICs are used we have to ack timer IRQ
-                * manually to deassert NMI lines for the watchdog if run
-                * on an 82489DX-based system.
-                */
-               raw_spin_lock(&i8259A_lock);
-               outb(0x0c, PIC_MASTER_OCW3);
-               /* Ack the IRQ; AEOI will end it automatically. */
-               inb(PIC_MASTER_POLL);
-               raw_spin_unlock(&i8259A_lock);
-       }
-
         global_clock_event->event_handler(global_clock_event);
  
         /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S

index 3af2dff58b213262d403a8f9c1c55f84920b1d8d..075d130efcf9019eb3f4745677e281533335fa11 100644 (file)
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
  no_longmode:
         hlt
         jmp no_longmode
-#include "verify_cpu_64.S"
+#include "verify_cpu.S"
  
         # Careful these need to be in the same 64K segment as the above;
  tidt:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index cb838ca42c9664c2ecf9530d170d27cdbc679f7a..c76aaca5694dd88c6fee8af964e8c62ccd3ad272 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
  
  static int ignore_nmis;
  
+int unknown_nmi_panic;
+
  static inline void conditional_sti(struct pt_regs *regs)
  {
         if (regs->flags & X86_EFLAGS_IF)
@@ -300,6 +302,13 @@ gp_in_kernel:
         die("general protection fault", regs, error_code);
  }
  
+static int __init setup_unknown_nmi_panic(char *str)
+{
+       unknown_nmi_panic = 1;
+       return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
  static notrace __kprobes void
  mem_parity_error(unsigned char reason, struct pt_regs *regs)
  {
@@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
         reason = (reason & 0xf) | 8;
         outb(reason, 0x61);
  
-       i = 2000;
-       while (--i)
-               udelay(1000);
+       i = 20000;
+       while (--i) {
+               touch_nmi_watchdog();
+               udelay(100);
+       }
  
         reason &= ~8;
         outb(reason, 0x61);
@@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
                         reason, smp_processor_id());
  
         printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-       if (panic_on_unrecovered_nmi)
+       if (unknown_nmi_panic || panic_on_unrecovered_nmi)
                 panic("NMI: Not continuing");
  
         printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
@@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
                 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
                                                         == NOTIFY_STOP)
                         return;
-
-#ifndef CONFIG_LOCKUP_DETECTOR
-               /*
-                * Ok, so this is none of the documented NMI sources,
-                * so it must be the NMI watchdog.
-                */
-               if (nmi_watchdog_tick(regs, reason))
-                       return;
-               if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
-                       unknown_nmi_error(reason, regs);
-#else
-               unknown_nmi_error(reason, regs);
  #endif
+               unknown_nmi_error(reason, regs);
  
                 return;
         }
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
  
  void stop_nmi(void)
  {
-       acpi_nmi_disable();
         ignore_nmis++;
  }
  
  void restart_nmi(void)
  {
         ignore_nmis--;
-       acpi_nmi_enable();
  }
  
  /* May run on IST stack. */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c

index 0c40d8b72416ba2ef7e86bfd812b7bf6f1db2f8f..356a0d455cf997cb1bd586d3a13fd8a7c16d4d3c 100644 (file)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
  
         if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                 return 0;
+
+       if (tsc_clocksource_reliable)
+               return 0;
         /*
          * Intel systems are normally all synchronized.
          * Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
         if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
                 /* assume multi socket systems are not synchronized: */
                 if (num_possible_cpus() > 1)
-                       tsc_unstable = 1;
+                       return 1;
         }
  
-       return tsc_unstable;
+       return 0;
+}
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work: ignored.
+ *
+ * This function uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomalies (too many SMIs, etc),
+ * or the refined calibration is off by more than 1% of the fast
+ * early calibration, we throw out the new calibration and use
+ * the early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+       static u64 tsc_start = -1, ref_start;
+       static int hpet;
+       u64 tsc_stop, ref_stop, delta;
+       unsigned long freq;
+
+       /* Don't bother refining TSC on unstable systems */
+       if (check_tsc_unstable())
+               goto out;
+
+       /*
+        * Since the work is started early in boot, we may be
+        * delayed the first time we expire. So set the workqueue
+        * again once we know timers are working.
+        */
+       if (tsc_start == -1) {
+               /*
+                * Only set hpet once, to avoid mixing hardware
+                * if the hpet becomes enabled later.
+                */
+               hpet = is_hpet_enabled();
+               schedule_delayed_work(&tsc_irqwork, HZ);
+               tsc_start = tsc_read_refs(&ref_start, hpet);
+               return;
+       }
+
+       tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+       /* hpet or pmtimer available ? */
+       if (!hpet && !ref_start && !ref_stop)
+               goto out;
+
+       /* Check, whether the sampling was disturbed by an SMI */
+       if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+               goto out;
+
+       delta = tsc_stop - tsc_start;
+       delta *= 1000000LL;
+       if (hpet)
+               freq = calc_hpet_ref(delta, ref_start, ref_stop);
+       else
+               freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+       /* Make sure we're within 1% */
+       if (abs(tsc_khz - freq) > tsc_khz/100)
+               goto out;
+
+       tsc_khz = freq;
+       printk(KERN_INFO "Refined TSC clocksource calibration: "
+               "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+                                       (unsigned long)tsc_khz % 1000);
+
+out:
+       clocksource_register_khz(&clocksource_tsc, tsc_khz);
 }
  
-static void __init init_tsc_clocksource(void)
+
+static int __init init_tsc_clocksource(void)
  {
+       if (!cpu_has_tsc || tsc_disabled > 0)
+               return 0;
+
         if (tsc_clocksource_reliable)
                 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
         /* lower the rating if we already know its unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
                 clocksource_tsc.rating = 0;
                 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
         }
-       clocksource_register_khz(&clocksource_tsc, tsc_khz);
+       schedule_delayed_work(&tsc_irqwork, 0);
+       return 0;
  }
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
  
  void __init tsc_init(void)
  {
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
                 mark_tsc_unstable("TSCs unsynchronized");
  
         check_system_tsc_reliable();
-       init_tsc_clocksource();
  }
  
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S

new file mode 100644 (file)

index 0000000..0edefc1
--- /dev/null
+++ b/arch/x86/kernel/verify_cpu.S
@@ -0,0 +1,139 @@
+/*
+ *
+ *     verify_cpu.S - Code for cpu long mode and SSE verification. This
+ *     code has been borrowed from boot/setup.S and was introduced by
+ *     Andi Kleen.
+ *
+ *     Copyright (c) 2007  Andi Kleen (ak@suse.de)
+ *     Copyright (c) 2007  Eric Biederman (ebiederm@xmission.com)
+ *     Copyright (c) 2007  Vivek Goyal (vgoyal@in.ibm.com)
+ *     Copyright (c) 2010  Kees Cook (kees.cook@canonical.com)
+ *
+ *     This source code is licensed under the GNU General Public License,
+ *     Version 2.  See the file COPYING for more details.
+ *
+ *     This is common code for verifying whether the CPU supports
+ *     long mode and SSE. It is not called directly; instead, this
+ *     file is included at various places and compiled in that context.
+ *     This file is expected to run in 32bit code.  Currently:
+ *
+ *     arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ *     arch/x86/kernel/trampoline_64.S: secondary processor verification
+ *     arch/x86/kernel/head_32.S: processor startup
+ *
+ *     verify_cpu, returns the status of longmode and SSE in register %eax.
+ *             0: Success    1: Failure
+ *
+ *     On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ *     The caller needs to check for the error code and take the action
+ *     appropriately. Either display a message or halt.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+
+verify_cpu:
+       pushfl                          # Save caller passed flags
+       pushl   $0                      # Kill any dangerous flags
+       popfl
+
+       pushfl                          # standard way to check for cpuid
+       popl    %eax
+       movl    %eax,%ebx
+       xorl    $0x200000,%eax
+       pushl   %eax
+       popfl
+       pushfl
+       popl    %eax
+       cmpl    %eax,%ebx
+       jz      verify_cpu_no_longmode  # cpu has no cpuid
+
+       movl    $0x0,%eax               # See if cpuid 1 is implemented
+       cpuid
+       cmpl    $0x1,%eax
+       jb      verify_cpu_no_longmode  # no cpuid 1
+
+       xor     %di,%di
+       cmpl    $0x68747541,%ebx        # AuthenticAMD
+       jnz     verify_cpu_noamd
+       cmpl    $0x69746e65,%edx
+       jnz     verify_cpu_noamd
+       cmpl    $0x444d4163,%ecx
+       jnz     verify_cpu_noamd
+       mov     $1,%di                  # cpu is from AMD
+       jmp     verify_cpu_check
+
+verify_cpu_noamd:
+       cmpl    $0x756e6547,%ebx        # GenuineIntel?
+       jnz     verify_cpu_check
+       cmpl    $0x49656e69,%edx
+       jnz     verify_cpu_check
+       cmpl    $0x6c65746e,%ecx
+       jnz     verify_cpu_check
+
+       # only access MSR_IA32_MISC_ENABLE when:
+       # family > 6 || (family == 6 && model >= 0xd)
+       movl    $0x1, %eax              # check CPU family and model
+       cpuid
+       movl    %eax, %ecx
+
+       andl    $0x0ff00f00, %eax       # mask family and extended family
+       shrl    $8, %eax
+       cmpl    $6, %eax
+       ja      verify_cpu_clear_xd     # family > 6, ok
+       jb      verify_cpu_check        # family < 6, skip
+
+       andl    $0x000f00f0, %ecx       # mask model and extended model
+       shrl    $4, %ecx
+       cmpl    $0xd, %ecx
+       jb      verify_cpu_check        # family == 6, model < 0xd, skip
+
+verify_cpu_clear_xd:
+       movl    $MSR_IA32_MISC_ENABLE, %ecx
+       rdmsr
+       btrl    $2, %edx                # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+       jnc     verify_cpu_check        # only write MSR if bit was changed
+       wrmsr
+
+verify_cpu_check:
+       movl    $0x1,%eax               # Does the cpu have what it takes
+       cpuid
+       andl    $REQUIRED_MASK0,%edx
+       xorl    $REQUIRED_MASK0,%edx
+       jnz     verify_cpu_no_longmode
+
+       movl    $0x80000000,%eax        # See if extended cpuid is implemented
+       cpuid
+       cmpl    $0x80000001,%eax
+       jb      verify_cpu_no_longmode  # no extended cpuid
+
+       movl    $0x80000001,%eax        # Does the cpu have what it takes
+       cpuid
+       andl    $REQUIRED_MASK1,%edx
+       xorl    $REQUIRED_MASK1,%edx
+       jnz     verify_cpu_no_longmode
+
+verify_cpu_sse_test:
+       movl    $1,%eax
+       cpuid
+       andl    $SSE_MASK,%edx
+       cmpl    $SSE_MASK,%edx
+       je      verify_cpu_sse_ok
+       test    %di,%di
+       jz      verify_cpu_no_longmode  # only try to force SSE on AMD
+       movl    $MSR_K7_HWCR,%ecx
+       rdmsr
+       btr     $15,%eax                # enable SSE
+       wrmsr
+       xor     %di,%di                 # don't loop
+       jmp     verify_cpu_sse_test     # try again
+
+verify_cpu_no_longmode:
+       popfl                           # Restore caller passed flags
+       movl $1,%eax
+       ret
+verify_cpu_sse_ok:
+       popfl                           # Restore caller passed flags
+       xorl %eax, %eax
+       ret
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S

deleted file mode 100644 (file)

index 56a8c2a..0000000
--- a/arch/x86/kernel/verify_cpu_64.S
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- *
- *     verify_cpu.S - Code for cpu long mode and SSE verification. This
- *     code has been borrowed from boot/setup.S and was introduced by
- *     Andi Kleen.
- *
- *     Copyright (c) 2007  Andi Kleen (ak@suse.de)
- *     Copyright (c) 2007  Eric Biederman (ebiederm@xmission.com)
- *     Copyright (c) 2007  Vivek Goyal (vgoyal@in.ibm.com)
- *
- *     This source code is licensed under the GNU General Public License,
- *     Version 2.  See the file COPYING for more details.
- *
- *     This is a common code for verification whether CPU supports
- *     long mode and SSE or not. It is not called directly instead this
- *     file is included at various places and compiled in that context.
- *     Following are the current usage.
- *
- *     This file is included by both 16bit and 32bit code.
- *
- *     arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- *     arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- *     arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- *     arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- *     verify_cpu, returns the status of cpu check in register %eax.
- *             0: Success    1: Failure
- *
- *     The caller needs to check for the error code and take the action
- *     appropriately. Either display a message or halt.
- */
-
-#include <asm/cpufeature.h>
-#include <asm/msr-index.h>
-
-verify_cpu:
-       pushfl                          # Save caller passed flags
-       pushl   $0                      # Kill any dangerous flags
-       popfl
-
-       pushfl                          # standard way to check for cpuid
-       popl    %eax
-       movl    %eax,%ebx
-       xorl    $0x200000,%eax
-       pushl   %eax
-       popfl
-       pushfl
-       popl    %eax
-       cmpl    %eax,%ebx
-       jz      verify_cpu_no_longmode  # cpu has no cpuid
-
-       movl    $0x0,%eax               # See if cpuid 1 is implemented
-       cpuid
-       cmpl    $0x1,%eax
-       jb      verify_cpu_no_longmode  # no cpuid 1
-
-       xor     %di,%di
-       cmpl    $0x68747541,%ebx        # AuthenticAMD
-       jnz     verify_cpu_noamd
-       cmpl    $0x69746e65,%edx
-       jnz     verify_cpu_noamd
-       cmpl    $0x444d4163,%ecx
-       jnz     verify_cpu_noamd
-       mov     $1,%di                  # cpu is from AMD
-
-verify_cpu_noamd:
-       movl    $0x1,%eax               # Does the cpu have what it takes
-       cpuid
-       andl    $REQUIRED_MASK0,%edx
-       xorl    $REQUIRED_MASK0,%edx
-       jnz     verify_cpu_no_longmode
-
-       movl    $0x80000000,%eax        # See if extended cpuid is implemented
-       cpuid
-       cmpl    $0x80000001,%eax
-       jb      verify_cpu_no_longmode  # no extended cpuid
-
-       movl    $0x80000001,%eax        # Does the cpu have what it takes
-       cpuid
-       andl    $REQUIRED_MASK1,%edx
-       xorl    $REQUIRED_MASK1,%edx
-       jnz     verify_cpu_no_longmode
-
-verify_cpu_sse_test:
-       movl    $1,%eax
-       cpuid
-       andl    $SSE_MASK,%edx
-       cmpl    $SSE_MASK,%edx
-       je      verify_cpu_sse_ok
-       test    %di,%di
-       jz      verify_cpu_no_longmode  # only try to force SSE on AMD
-       movl    $MSR_K7_HWCR,%ecx
-       rdmsr
-       btr     $15,%eax                # enable SSE
-       wrmsr
-       xor     %di,%di                 # don't loop
-       jmp     verify_cpu_sse_test     # try again
-
-verify_cpu_no_longmode:
-       popfl                           # Restore caller passed flags
-       movl $1,%eax
-       ret
-verify_cpu_sse_ok:
-       popfl                           # Restore caller passed flags
-       xorl %eax, %eax
-       ret
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S

index e03530aebfd0332635f901afa9f19f00317fa097..bf4700755184e32d4b4e549bd19f4014caa46468 100644 (file)
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
  
  PHDRS {
         text PT_LOAD FLAGS(5);          /* R_E */
-       data PT_LOAD FLAGS(7);          /* RWE */
+       data PT_LOAD FLAGS(6);          /* RW_ */
  #ifdef CONFIG_X86_64
         user PT_LOAD FLAGS(5);          /* R_E */
  #ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
  
         EXCEPTION_TABLE(16) :text = 0x9090
  
+#if defined(CONFIG_DEBUG_RODATA)
+       /* .text should occupy whole number of pages */
+       . = ALIGN(PAGE_SIZE);
+#endif
         X64_ALIGN_DEBUG_RODATA_BEGIN
         RO_DATA(PAGE_SIZE)
         X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
                 __bss_start = .;
                 *(.bss..page_aligned)
                 *(.bss)
-               . = ALIGN(4);
+               . = ALIGN(PAGE_SIZE);
                 __bss_stop = .;
         }
  
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c

index f628234fbeca06683876543f07476f10111812b7..3cece05e4ac4622853c8eeb20e3bd04a02f851c7 100644 (file)
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
         s->pics[1].elcr_mask = 0xde;
         s->pics[0].pics_state = s;
         s->pics[1].pics_state = s;
+       s->pics[0].isr_ack = 0xff;
+       s->pics[1].isr_ack = 0xff;
  
         /*
          * Initialize PIO device
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index fb8b376bf28cb3e04a6bb903900f32838ab02a14..fbb04aee8301efab741f80f7c590a45d7e3b298f 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2394,7 +2394,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                         ASSERT(!VALID_PAGE(root));
                         spin_lock(&vcpu->kvm->mmu_lock);
                         kvm_mmu_free_some_pages(vcpu);
-                       sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+                       sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+                                             i << 30,
                                               PT32_ROOT_LEVEL, 1, ACC_ALL,
                                               NULL);
                         root = __pa(sp->spt);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile

index 55543397a8a795f295ef6e3731e6b9373c953958..09df2f9a3d69ce36a20ec86c82bc9b719d44a1ae 100644 (file)
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y                   := kmmio.o pf_in.o mmio-mod.o
  obj-$(CONFIG_MMIOTRACE_TEST)   += testmmiotrace.o
  
  obj-$(CONFIG_NUMA)             += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA)          += k8topology_64.o
+obj-$(CONFIG_AMD_NUMA)         += amdtopology_64.o
  obj-$(CONFIG_ACPI_NUMA)                += srat_$(BITS).o
  
  obj-$(CONFIG_HAVE_MEMBLOCK)            += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c

new file mode 100644 (file)

index 0000000..51fae9c
--- /dev/null
+++ b/arch/x86/mm/amdtopology_64.c
@@ -0,0 +1,237 @@
+/*
+ * AMD NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * This version reads it directly from the AMD northbridge.
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/memblock.h>
+
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <linux/acpi.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/amd_nb.h>
+
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
+static __init int find_northbridge(void)
+{
+       int num;
+
+       for (num = 0; num < 32; num++) {
+               u32 header;
+
+               header = read_pci_config(0, num, 0, 0x00);
+               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+                       continue;
+
+               header = read_pci_config(0, num, 1, 0x00);
+               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+                       header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+                       continue;
+               return num;
+       }
+
+       return -1;
+}
+
+static __init void early_get_boot_cpu_id(void)
+{
+       /*
+        * We need the APIC ID of the BSP so that amd_scan_nodes() can
+        * use it when filling in apicid_to_node.
+        */
+#ifdef CONFIG_X86_MPPARSE
+       /*
+        * Get the boot-time SMP configuration:
+        */
+       if (smp_found_config)
+               early_get_smp_config();
+#endif
+       early_init_lapic_mapping();
+}
+
+int __init amd_get_nodes(struct bootnode *physnodes)
+{
+       int i;
+       int ret = 0;
+
+       for_each_node_mask(i, nodes_parsed) {
+               physnodes[ret].start = nodes[i].start;
+               physnodes[ret].end = nodes[i].end;
+               ret++;
+       }
+       return ret;
+}
+
+int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned long start = PFN_PHYS(start_pfn);
+       unsigned long end = PFN_PHYS(end_pfn);
+       unsigned numnodes;
+       unsigned long prevbase;
+       int i, nb, found = 0;
+       u32 nodeid, reg;
+
+       if (!early_pci_allowed())
+               return -1;
+
+       nb = find_northbridge();
+       if (nb < 0)
+               return nb;
+
+       pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
+
+       reg = read_pci_config(0, nb, 0, 0x60);
+       numnodes = ((reg >> 4) & 0xF) + 1;
+       if (numnodes <= 1)
+               return -1;
+
+       pr_info("Number of physical nodes %d\n", numnodes);
+
+       prevbase = 0;
+       for (i = 0; i < 8; i++) {
+               unsigned long base, limit;
+
+               base = read_pci_config(0, nb, 1, 0x40 + i*8);
+               limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+               nodeid = limit & 7;
+               if ((base & 3) == 0) {
+                       if (i < numnodes)
+                               pr_info("Skipping disabled node %d\n", i);
+                       continue;
+               }
+               if (nodeid >= numnodes) {
+                       pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+                               base, limit);
+                       continue;
+               }
+
+               if (!limit) {
+                       pr_info("Skipping node entry %d (base %lx)\n",
+                               i, base);
+                       continue;
+               }
+               if ((base >> 8) & 3 || (limit >> 8) & 3) {
+                       pr_err("Node %d using interleaving mode %lx/%lx\n",
+                              nodeid, (base >> 8) & 3, (limit >> 8) & 3);
+                       return -1;
+               }
+               if (node_isset(nodeid, nodes_parsed)) {
+                       pr_info("Node %d already present, skipping\n",
+                               nodeid);
+                       continue;
+               }
+
+               limit >>= 16;
+               limit <<= 24;
+               limit |= (1<<24)-1;
+               limit++;
+
+               if (limit > end)
+                       limit = end;
+               if (limit <= base)
+                       continue;
+
+               base >>= 16;
+               base <<= 24;
+
+               if (base < start)
+                       base = start;
+               if (limit > end)
+                       limit = end;
+               if (limit == base) {
+                       pr_err("Empty node %d\n", nodeid);
+                       continue;
+               }
+               if (limit < base) {
+                       pr_err("Node %d bogus settings %lx-%lx.\n",
+                              nodeid, base, limit);
+                       continue;
+               }
+
+               /* Could sort here, but punt for now. Should not happen anyroads. */
+               if (prevbase > base) {
+                       pr_err("Node map not sorted %lx,%lx\n",
+                              prevbase, base);
+                       return -1;
+               }
+
+               pr_info("Node %d MemBase %016lx Limit %016lx\n",
+                       nodeid, base, limit);
+
+               found++;
+
+               nodes[nodeid].start = base;
+               nodes[nodeid].end = limit;
+
+               prevbase = base;
+
+               node_set(nodeid, nodes_parsed);
+       }
+
+       if (!found)
+               return -1;
+       return 0;
+}
+
+int __init amd_scan_nodes(void)
+{
+       unsigned int bits;
+       unsigned int cores;
+       unsigned int apicid_base;
+       int i;
+
+       BUG_ON(nodes_empty(nodes_parsed));
+       node_possible_map = nodes_parsed;
+       memnode_shift = compute_hash_shift(nodes, 8, NULL);
+       if (memnode_shift < 0) {
+               pr_err("No NUMA node hash function found. Contact maintainer\n");
+               return -1;
+       }
+       pr_info("Using node hash shift of %d\n", memnode_shift);
+
+       /* use the coreid bits from early_identify_cpu */
+       bits = boot_cpu_data.x86_coreid_bits;
+       cores = (1<<bits);
+       apicid_base = 0;
+       /* get the APIC ID of the BSP early for systems with apicid lifting */
+       early_get_boot_cpu_id();
+       if (boot_cpu_physical_apicid > 0) {
+               pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+               apicid_base = boot_cpu_physical_apicid;
+       }
+
+       for_each_node_mask(i, node_possible_map) {
+               int j;
+
+               memblock_x86_register_active_regions(i,
+                               nodes[i].start >> PAGE_SHIFT,
+                               nodes[i].end >> PAGE_SHIFT);
+               for (j = apicid_base; j < cores + apicid_base; j++)
+                       apicid_to_node[(i << bits) + j] = i;
+               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+       }
+
+       numa_init_array();
+       return 0;
+}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c

index c0e28a13de7df55c1ee1b173b61fd2d18c50e49c..947f42abe820eed9e47388ff3fcfd6fc937bb96a 100644 (file)
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
         /*
          * We just marked the kernel text read only above, now that
          * we are going to free part of that, we need to make that
-        * writeable first.
+        * writeable and non-executable first.
          */
+       set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
         set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
  
         printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c

index 0e969f9f401b72bddf403325e2a3a862199ee07b..f89b5bb4e93f82926f339054aa37f5bd4e829216 100644 (file)
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -226,7 +226,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
  
  static inline int is_kernel_text(unsigned long addr)
  {
-       if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+       if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
                 return 1;
         return 0;
  }
@@ -912,6 +912,23 @@ void set_kernel_text_ro(void)
         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
  }
  
+static void mark_nxdata_nx(void)
+{
+       /*
+        * When this is called, init has already been executed and released,
+        * so everything past _etext should be NX.
+        */
+       unsigned long start = PFN_ALIGN(_etext);
+       /*
+        * Limit as in is_kernel_text(), moved up to the next HPAGE boundary:
+        */
+       unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+       if (__supported_pte_mask & _PAGE_NX)
+               printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+       set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
  void mark_rodata_ro(void)
  {
         unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +963,7 @@ void mark_rodata_ro(void)
         printk(KERN_INFO "Testing CPA: write protecting again\n");
         set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
  #endif
+       mark_nxdata_nx();
  }
  #endif
  
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c

deleted file mode 100644 (file)

index 804a3b6..0000000
--- a/arch/x86/mm/k8topology_64.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * AMD K8 NUMA support.
- * Discover the memory map and associated nodes.
- *
- * This version reads it directly from the K8 northbridge.
- *
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/memblock.h>
-
-#include <asm/io.h>
-#include <linux/pci_ids.h>
-#include <linux/acpi.h>
-#include <asm/types.h>
-#include <asm/mmzone.h>
-#include <asm/proto.h>
-#include <asm/e820.h>
-#include <asm/pci-direct.h>
-#include <asm/numa.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/amd_nb.h>
-
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
-
-static __init int find_northbridge(void)
-{
-       int num;
-
-       for (num = 0; num < 32; num++) {
-               u32 header;
-
-               header = read_pci_config(0, num, 0, 0x00);
-               if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
-                       header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
-                       header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
-                       continue;
-
-               header = read_pci_config(0, num, 1, 0x00);
-               if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
-                       header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
-                       header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
-                       continue;
-               return num;
-       }
-
-       return -1;
-}
-
-static __init void early_get_boot_cpu_id(void)
-{
-       /*
-        * need to get the APIC ID of the BSP so can use that to
-        * create apicid_to_node in k8_scan_nodes()
-        */
-#ifdef CONFIG_X86_MPPARSE
-       /*
-        * get boot-time SMP configuration:
-        */
-       if (smp_found_config)
-               early_get_smp_config();
-#endif
-       early_init_lapic_mapping();
-}
-
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
-       int i;
-       int ret = 0;
-
-       for_each_node_mask(i, nodes_parsed) {
-               physnodes[ret].start = nodes[i].start;
-               physnodes[ret].end = nodes[i].end;
-               ret++;
-       }
-       return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
-{
-       unsigned long start = PFN_PHYS(start_pfn);
-       unsigned long end = PFN_PHYS(end_pfn);
-       unsigned numnodes;
-       unsigned long prevbase;
-       int i, nb, found = 0;
-       u32 nodeid, reg;
-
-       if (!early_pci_allowed())
-               return -1;
-
-       nb = find_northbridge();
-       if (nb < 0)
-               return nb;
-
-       pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
-
-       reg = read_pci_config(0, nb, 0, 0x60);
-       numnodes = ((reg >> 4) & 0xF) + 1;
-       if (numnodes <= 1)
-               return -1;
-
-       pr_info("Number of physical nodes %d\n", numnodes);
-
-       prevbase = 0;
-       for (i = 0; i < 8; i++) {
-               unsigned long base, limit;
-
-               base = read_pci_config(0, nb, 1, 0x40 + i*8);
-               limit = read_pci_config(0, nb, 1, 0x44 + i*8);
-
-               nodeid = limit & 7;
-               if ((base & 3) == 0) {
-                       if (i < numnodes)
-                               pr_info("Skipping disabled node %d\n", i);
-                       continue;
-               }
-               if (nodeid >= numnodes) {
-                       pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
-                               base, limit);
-                       continue;
-               }
-
-               if (!limit) {
-                       pr_info("Skipping node entry %d (base %lx)\n",
-                               i, base);
-                       continue;
-               }
-               if ((base >> 8) & 3 || (limit >> 8) & 3) {
-                       pr_err("Node %d using interleaving mode %lx/%lx\n",
-                              nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-                       return -1;
-               }
-               if (node_isset(nodeid, nodes_parsed)) {
-                       pr_info("Node %d already present, skipping\n",
-                               nodeid);
-                       continue;
-               }
-
-               limit >>= 16;
-               limit <<= 24;
-               limit |= (1<<24)-1;
-               limit++;
-
-               if (limit > end)
-                       limit = end;
-               if (limit <= base)
-                       continue;
-
-               base >>= 16;
-               base <<= 24;
-
-               if (base < start)
-                       base = start;
-               if (limit > end)
-                       limit = end;
-               if (limit == base) {
-                       pr_err("Empty node %d\n", nodeid);
-                       continue;
-               }
-               if (limit < base) {
-                       pr_err("Node %d bogus settings %lx-%lx.\n",
-                              nodeid, base, limit);
-                       continue;
-               }
-
-               /* Could sort here, but pun for now. Should not happen anyroads. */
-               if (prevbase > base) {
-                       pr_err("Node map not sorted %lx,%lx\n",
-                              prevbase, base);
-                       return -1;
-               }
-
-               pr_info("Node %d MemBase %016lx Limit %016lx\n",
-                       nodeid, base, limit);
-
-               found++;
-
-               nodes[nodeid].start = base;
-               nodes[nodeid].end = limit;
-
-               prevbase = base;
-
-               node_set(nodeid, nodes_parsed);
-       }
-
-       if (!found)
-               return -1;
-       return 0;
-}
-
-int __init k8_scan_nodes(void)
-{
-       unsigned int bits;
-       unsigned int cores;
-       unsigned int apicid_base;
-       int i;
-
-       BUG_ON(nodes_empty(nodes_parsed));
-       node_possible_map = nodes_parsed;
-       memnode_shift = compute_hash_shift(nodes, 8, NULL);
-       if (memnode_shift < 0) {
-               pr_err("No NUMA node hash function found. Contact maintainer\n");
-               return -1;
-       }
-       pr_info("Using node hash shift of %d\n", memnode_shift);
-
-       /* use the coreid bits from early_identify_cpu */
-       bits = boot_cpu_data.x86_coreid_bits;
-       cores = (1<<bits);
-       apicid_base = 0;
-       /* get the APIC ID of the BSP early for systems with apicid lifting */
-       early_get_boot_cpu_id();
-       if (boot_cpu_physical_apicid > 0) {
-               pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
-               apicid_base = boot_cpu_physical_apicid;
-       }
-
-       for_each_node_mask(i, node_possible_map) {
-               int j;
-
-               memblock_x86_register_active_regions(i,
-                               nodes[i].start >> PAGE_SHIFT,
-                               nodes[i].end >> PAGE_SHIFT);
-               for (j = apicid_base; j < cores + apicid_base; j++)
-                       apicid_to_node[(i << bits) + j] = i;
-               setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-       }
-
-       numa_init_array();
-       return 0;
-}
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c

index af3b6c8a436f7b7ec49a2366738a28faa58cdfc5..704a37cedddb59404a3c1fc773e44853b2089939 100644 (file)
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
         e->trace.entries = e->trace_entries;
         e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
         e->trace.skip = 0;
-       save_stack_trace_bp(&e->trace, regs->bp);
+       save_stack_trace_regs(&e->trace, regs);
  
         /* Round address down to nearest 16 bytes */
         shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c

index 7ffc9b727efdc95ee6748acd1a1b646c8e164ffb..7762a517d69d9233a7a6e419a5eaa9ae6c94ee53 100644 (file)
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -264,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata;
  static char *cmdline __initdata;
  
  static int __init setup_physnodes(unsigned long start, unsigned long end,
-                                       int acpi, int k8)
+                                       int acpi, int amd)
  {
         int nr_nodes = 0;
         int ret = 0;
@@ -274,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
         if (acpi)
                 nr_nodes = acpi_get_nodes(physnodes);
  #endif
-#ifdef CONFIG_K8_NUMA
-       if (k8)
-               nr_nodes = k8_get_nodes(physnodes);
+#ifdef CONFIG_AMD_NUMA
+       if (amd)
+               nr_nodes = amd_get_nodes(physnodes);
  #endif
         /*
          * Basic sanity checking on the physical node map: there may be errors
-        * if the SRAT or K8 incorrectly reported the topology or the mem=
+        * if the SRAT or AMD code incorrectly reported the topology or the mem=
          * kernel parameter is used.
          */
         for (i = 0; i < nr_nodes; i++) {
@@ -549,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
   * numa=fake command-line option.
   */
  static int __init numa_emulation(unsigned long start_pfn,
-                       unsigned long last_pfn, int acpi, int k8)
+                       unsigned long last_pfn, int acpi, int amd)
  {
         u64 addr = start_pfn << PAGE_SHIFT;
         u64 max_addr = last_pfn << PAGE_SHIFT;
@@ -557,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn,
         int num_nodes;
         int i;
  
-       num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
+       num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
         /*
          * If the numa=fake command-line contains a 'M' or 'G', it represents
          * the fixed node size.  Otherwise, if it is just a single number N,
@@ -602,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn,
  #endif /* CONFIG_NUMA_EMU */
  
  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
-                               int acpi, int k8)
+                               int acpi, int amd)
  {
         int i;
  
@@ -610,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
         nodes_clear(node_online_map);
  
  #ifdef CONFIG_NUMA_EMU
-       if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
+       if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
                 return;
         nodes_clear(node_possible_map);
         nodes_clear(node_online_map);
@@ -624,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
         nodes_clear(node_online_map);
  #endif
  
-#ifdef CONFIG_K8_NUMA
-       if (!numa_off && k8 && !k8_scan_nodes())
+#ifdef CONFIG_AMD_NUMA
+       if (!numa_off && amd && !amd_scan_nodes())
                 return;
         nodes_clear(node_possible_map);
         nodes_clear(node_online_map);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c

index 532e7933d606fdbdde77aacdb2de746f249d2bf2..8b830ca14ac46c08facc1a848ddcb3c42c0d56cf 100644 (file)
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
  #include <linux/pfn.h>
  #include <linux/percpu.h>
  #include <linux/gfp.h>
+#include <linux/pci.h>
  
  #include <asm/e820.h>
  #include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                                    unsigned long pfn)
  {
         pgprot_t forbidden = __pgprot(0);
+       pgprot_t required = __pgprot(0);
  
         /*
          * The BIOS area between 640k and 1Mb needs to be executable for
          * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
          */
-       if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+       if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
                 pgprot_val(forbidden) |= _PAGE_NX;
+#endif
  
         /*
          * The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
         if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
                    __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
                 pgprot_val(forbidden) |= _PAGE_RW;
+       /*
+        * .data and .bss should always be writable.
+        */
+       if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+           within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+               pgprot_val(required) |= _PAGE_RW;
  
  #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
         /*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
  #endif
  
         prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+       prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
  
         return prot;
  }
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
  {
         unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
         pte_t new_pte, old_pte, *tmp;
-       pgprot_t old_prot, new_prot;
+       pgprot_t old_prot, new_prot, req_prot;
         int i, do_split = 1;
         unsigned int level;
  
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
          * We are safe now. Check whether the new pgprot is the same:
          */
         old_pte = *kpte;
-       old_prot = new_prot = pte_pgprot(old_pte);
+       old_prot = new_prot = req_prot = pte_pgprot(old_pte);
  
-       pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
-       pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+       pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+       pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
  
         /*
          * old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
         pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
         cpa->pfn = pfn;
  
-       new_prot = static_protections(new_prot, address, pfn);
+       new_prot = static_protections(req_prot, address, pfn);
  
         /*
          * We need to check the full range, whether
          * static_protection() requires a different pgprot for one of
          * the pages in the range we try to preserve:
          */
-       addr = address + PAGE_SIZE;
-       pfn++;
-       for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
-               pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+       addr = address & pmask;
+       pfn = pte_pfn(old_pte);
+       for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+               pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
  
                 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
                         goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
          * that we limited the number of possible pages already to
          * the number of pages in the large page.
          */
-       if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+       if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
                 /*
                  * The address is aligned and the number of pages
                  * covers the full page.
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c

index a3250aa34086fce7d376e9e1e464fa2e996dbb6d..410531d3c292d20cde9b40a487711930eab51eb2 100644 (file)
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
  {
         if (!cpu_has_nx) {
                 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
-                      "missing in CPU or disabled in BIOS!\n");
+                      "missing in CPU!\n");
         } else {
  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
                 if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c

index a17dffd136c143898e91187cbd39d005b05779b6..f16434568a51da26ea8524ae11fa84f0c7405717 100644 (file)
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
         /* mark this node as "seen" in node bitmap */
         BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
  
+       /* don't need to check apic_id here, because it is always 8 bits */
         apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
  
         printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c

index a35cb9d8b0606bc8f7123cd15f0017972a5e8dda..171a0aacb99a0874373619f4fd51ed955e2ddb9e 100644 (file)
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
         }
  
         apic_id = pa->apic_id;
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
         apicid_to_node[apic_id] = node;
         node_set(node, cpu_nodes_parsed);
         acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
                 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
         else
                 apic_id = pa->apic_id;
+
+       if (apic_id >= MAX_LOCAL_APIC) {
+               printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+               return;
+       }
+
         apicid_to_node[apic_id] = node;
         node_set(node, cpu_nodes_parsed);
         acpi_numa = 1;
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c

index 2d49d4e19a3619c0be2c7d17a892b8aea582048f..72cbec14d783867cb5f5fa8547eb6fcee2fe28b9 100644 (file)
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
         if (!user_mode_vm(regs)) {
                 unsigned long stack = kernel_stack_pointer(regs);
                 if (depth)
-                       dump_trace(NULL, regs, (unsigned long *)stack, 0,
+                       dump_trace(NULL, regs, (unsigned long *)stack,
                                    &backtrace_ops, &depth);
                 return;
         }
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c

index 4e8baad36d37739e32b71da0be2932924bacfe69..358c8b9c96a79c725766e1627544486eb312a0bc 100644 (file)
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -732,6 +732,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
                 case 0x14:
                         cpu_type = "x86-64/family14h";
                         break;
+               case 0x15:
+                       cpu_type = "x86-64/family15h";
+                       break;
                 default:
                         return -ENODEV;
                 }
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c

index e3ecb71b5790228073d5eb089f4a4fc9303ab09a..0636dd93cef8d64a718124ddfa7c5a02edd2174f 100644 (file)
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -58,9 +58,6 @@ static void timer_stop(void)
  
  int __init op_nmi_timer_init(struct oprofile_operations *ops)
  {
-       if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
-               return -ENODEV;
-
         ops->start = timer_start;
         ops->stop = timer_stop;
         ops->cpu_type = "timer";
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c

index a011bcc0f94331d82c8abfa7d4afdbbd0c59eff5..c3b8e24f2b16f4f6441c286268a61ac6a320b7c3 100644 (file)
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,11 +29,12 @@
  #include "op_x86_model.h"
  #include "op_counter.h"
  
-#define NUM_COUNTERS 4
+#define NUM_COUNTERS           4
+#define NUM_COUNTERS_F15H      6
  #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
+#define NUM_VIRT_COUNTERS      32
  #else
-#define NUM_VIRT_COUNTERS NUM_COUNTERS
+#define NUM_VIRT_COUNTERS      0
  #endif
  
  #define OP_EVENT_MASK                  0x0FFF
@@ -41,7 +42,8 @@
  
  #define MSR_AMD_EVENTSEL_RESERVED      ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
  
-static unsigned long reset_value[NUM_VIRT_COUNTERS];
+static int num_counters;
+static unsigned long reset_value[OP_MAX_COUNTER];
  
  #define IBS_FETCH_SIZE                 6
  #define IBS_OP_SIZE                    12
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
         int i;
  
         /* enable active counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 int virt = op_x86_phys_to_virt(i);
                 if (!reset_value[virt])
                         continue;
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
  {
         int i;
  
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 if (!msrs->counters[i].addr)
                         continue;
                 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
  {
         int i;
  
-       for (i = 0; i < NUM_COUNTERS; i++) {
+       for (i = 0; i < num_counters; i++) {
                 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
                         goto fail;
                 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
                         goto fail;
                 }
                 /* both registers must be reserved */
-               msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-               msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+               if (num_counters == NUM_COUNTERS_F15H) {
+                       msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
+                       msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
+               } else {
+                       msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+                       msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+               }
                 continue;
         fail:
                 if (!counter_config[i].enabled)
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
         int i;
  
         /* setup reset_value */
-       for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+       for (i = 0; i < OP_MAX_COUNTER; ++i) {
                 if (counter_config[i].enabled
                     && msrs->counters[op_x86_virt_to_phys(i)].addr)
                         reset_value[i] = counter_config[i].count;
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
         }
  
         /* clear all counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 if (!msrs->controls[i].addr)
                         continue;
                 rdmsrl(msrs->controls[i].addr, val);
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
         }
  
         /* enable active counters */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 int virt = op_x86_phys_to_virt(i);
                 if (!reset_value[virt])
                         continue;
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
         u64 val;
         int i;
  
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 int virt = op_x86_phys_to_virt(i);
                 if (!reset_value[virt])
                         continue;
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
         u64 val;
         int i;
  
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 if (!reset_value[op_x86_phys_to_virt(i)])
                         continue;
                 rdmsrl(msrs->controls[i].addr, val);
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
          * Subtle: stop on all counters to avoid race with setting our
          * pm callback
          */
-       for (i = 0; i < NUM_COUNTERS; ++i) {
+       for (i = 0; i < num_counters; ++i) {
                 if (!reset_value[op_x86_phys_to_virt(i)])
                         continue;
                 rdmsrl(msrs->controls[i].addr, val);
@@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
                 ret = setup_ibs_ctl(i);
                 if (ret)
                         return ret;
+               pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
                 return 0;
         }
  
@@ -630,21 +638,29 @@ static int __init_ibs_nmi(void)
         return 0;
  }
  
-/* initialize the APIC for the IBS interrupts if available */
+/*
+ * check and reserve APIC extended interrupt LVT offset for IBS if
+ * available
+ *
+ * init_ibs() implicitly performs cpu-local operations, so pin this
+ * thread to its current CPU
+ */
+
  static void init_ibs(void)
  {
-       ibs_caps = get_ibs_caps();
+       preempt_disable();
  
+       ibs_caps = get_ibs_caps();
         if (!ibs_caps)
-               return;
+               goto out;
  
-       if (__init_ibs_nmi()) {
+       if (__init_ibs_nmi() < 0)
                 ibs_caps = 0;
-               return;
-       }
+       else
+               printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
  
-       printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
-              (unsigned)ibs_caps);
+out:
+       preempt_enable();
  }
  
  static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
         return 0;
  }
  
+struct op_x86_model_spec op_amd_spec;
+
  static int op_amd_init(struct oprofile_operations *ops)
  {
         init_ibs();
         create_arch_files = ops->create_files;
         ops->create_files = setup_ibs_files;
+
+       if (boot_cpu_data.x86 == 0x15) {
+               num_counters = NUM_COUNTERS_F15H;
+       } else {
+               num_counters = NUM_COUNTERS;
+       }
+
+       op_amd_spec.num_counters = num_counters;
+       op_amd_spec.num_controls = num_counters;
+       op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
+
         return 0;
  }
  
  struct op_x86_model_spec op_amd_spec = {
-       .num_counters           = NUM_COUNTERS,
-       .num_controls           = NUM_COUNTERS,
-       .num_virt_counters      = NUM_VIRT_COUNTERS,
+       /* num_counters/num_controls filled in at runtime */
         .reserved               = MSR_AMD_EVENTSEL_RESERVED,
         .event_mask             = OP_EVENT_MASK,
         .init                   = op_amd_init,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c

index 182558dd5515add420a27dfa58304d04b6a6f71a..9fadec074142b11afcb39e73627dc4c4fd8e14dd 100644 (file)
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -11,7 +11,7 @@
  #include <linux/oprofile.h>
  #include <linux/smp.h>
  #include <linux/ptrace.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
  #include <asm/msr.h>
  #include <asm/fixmap.h>
  #include <asm/apic.h>
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile

index effd96e33f16690c3dd318de9886bee75e5fd699..6b8759f7634e661de3983dbc7e6accb26e939a8c 100644 (file)
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_PCI_OLPC)          += olpc.o
  obj-$(CONFIG_PCI_XEN)          += xen.o
  
  obj-y                          += fixup.o
+obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
  obj-$(CONFIG_ACPI)             += acpi.o
  obj-y                          += legacy.o irq.o
  
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c

new file mode 100644 (file)

index 0000000..85b68ef
--- /dev/null
+++ b/arch/x86/pci/ce4100.c
@@ -0,0 +1,315 @@
+/*
+ *  GPL LICENSE SUMMARY
+ *
+ *  Copyright(c) 2010 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of version 2 of the GNU General Public License as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *  The full GNU General Public License is included in this distribution
+ *  in the file called LICENSE.GPL.
+ *
+ *  Contact Information:
+ *    Intel Corporation
+ *    2200 Mission College Blvd.
+ *    Santa Clara, CA  97052
+ *
+ * This provides access methods for PCI registers that mis-behave on
+ * the CE4100. Each register can be assigned a private init, read and
+ * write routine. The exception to this is the bridge device.  The
+ * bridge device is the only device on bus zero (0) that requires any
+ * fixup so it is a special case ATM
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+
+#include <asm/pci_x86.h>
+
+struct sim_reg {
+       u32 value;
+       u32 mask;
+};
+
+struct sim_dev_reg {
+       int dev_func;
+       int reg;
+       void (*init)(struct sim_dev_reg *reg);
+       void (*read)(struct sim_dev_reg *reg, u32 *value);
+       void (*write)(struct sim_dev_reg *reg, u32 value);
+       struct sim_reg sim_reg;
+};
+
+struct sim_reg_op {
+       void (*init)(struct sim_dev_reg *reg);
+       void (*read)(struct sim_dev_reg *reg, u32 *value); /* out-param */
+       void (*write)(struct sim_dev_reg *reg, u32 value);
+};
+
+#define MB (1024 * 1024)
+#define KB (1024)
+#define SIZE_TO_MASK(size) (~((size) - 1))
+
+#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
+{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
+       {0, SIZE_TO_MASK(size)} },
+
+static void reg_init(struct sim_dev_reg *reg)
+{
+       pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
+                             &reg->sim_reg.value);
+}
+
+static void reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pci_config_lock, flags);
+       *value = reg->sim_reg.value;
+       raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void reg_write(struct sim_dev_reg *reg, u32 value)
+{
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&pci_config_lock, flags);
+       reg->sim_reg.value = (value & reg->sim_reg.mask) |
+               (reg->sim_reg.value & ~reg->sim_reg.mask);
+       raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void sata_reg_init(struct sim_dev_reg *reg)
+{
+       pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
+                             &reg->sim_reg.value);
+       reg->sim_reg.value += 0x400;
+}
+
+static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+       reg_read(reg, value);
+       if (*value != reg->sim_reg.mask)
+               *value |= 0x100;
+}
+
+static void sata_revid_init(struct sim_dev_reg *reg)
+{
+       reg->sim_reg.value = 0x01060100;
+       reg->sim_reg.mask = 0;  /* no bit of the simulated value is writable */
+}
+
+static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
+{
+       reg_read(reg, value);
+}
+
+static struct sim_dev_reg bus1_fixups[] = {
+       DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
+       DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
+       DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+       DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x8,  0, sata_revid_init, sata_revid_read, 0)
+       DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
+       DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
+       DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
+       DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+       DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
+};
+
+static void __init init_sim_regs(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+               if (bus1_fixups[i].init)
+                       bus1_fixups[i].init(&bus1_fixups[i]);
+       }
+}
+
+static inline void extract_bytes(u32 *value, int reg, int len)
+{
+       uint32_t mask;
+
+       *value >>= ((reg & 3) * 8);
+       mask = 0xFFFFFFFF >> ((4 - len) * 8);
+       *value &= mask;
+}
+
+static int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
+{
+       u32 av_bridge_base, av_bridge_limit;
+       int retval = 0;
+
+       switch (reg) {
+       /* Make BARs appear to not request any memory. */
+       case PCI_BASE_ADDRESS_0:
+       case PCI_BASE_ADDRESS_0 + 1:
+       case PCI_BASE_ADDRESS_0 + 2:
+       case PCI_BASE_ADDRESS_0 + 3:
+               *value = 0;
+               break;
+
+               /* Subordinate bus number is hardwired to zero and read-only,
+                * so simulate it; narrower reads see the low bytes only.
+                */
+       case PCI_PRIMARY_BUS:
+               *value = 0x00010100;
+               extract_bytes(value, reg, len);
+               break;
+
+       case PCI_SUBORDINATE_BUS:
+               *value = 1;
+               break;
+
+       case PCI_MEMORY_BASE:
+       case PCI_MEMORY_LIMIT:
+               /* Get the A/V bridge base address. */
+               pci_direct_conf1.read(0, 0, devfn,
+                               PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
+
+               av_bridge_limit = av_bridge_base + (512*MB - 1);
+               av_bridge_limit >>= 16;
+               av_bridge_limit &= 0xFFF0;
+
+               av_bridge_base >>= 16;
+               av_bridge_base &= 0xFFF0;
+
+               if (reg == PCI_MEMORY_LIMIT)
+                       *value = av_bridge_limit;
+               else if (len == 2)
+                       *value = av_bridge_base;
+               else
+                       *value = (av_bridge_limit << 16) | av_bridge_base;
+               break;
+               /* Make prefetchable memory limit smaller than prefetchable
+                * memory base, so not claim prefetchable memory space.
+                */
+       case PCI_PREF_MEMORY_BASE:
+               *value = 0xFFF0;
+               break;
+       case PCI_PREF_MEMORY_LIMIT:
+               *value = 0x0;
+               break;
+               /* Make IO limit smaller than IO base, so not claim IO space. */
+       case PCI_IO_BASE:
+               *value = 0xF0;
+               break;
+       case PCI_IO_LIMIT:
+               *value = 0;
+               break;
+       default:
+               retval = 1;
+       }
+       return retval;
+}
+
+static int ce4100_conf_read(unsigned int seg, unsigned int bus,
+                           unsigned int devfn, int reg, int len, u32 *value)
+{
+       int i, retval = 1;
+
+       if (bus == 1) {
+               for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+                       if (bus1_fixups[i].dev_func == devfn &&
+                           bus1_fixups[i].reg == (reg & ~3) &&
+                           bus1_fixups[i].read) {
+                               bus1_fixups[i].read(&(bus1_fixups[i]),
+                                                   value);
+                               extract_bytes(value, reg, len);
+                               return 0;
+                       }
+               }
+       }
+
+       if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
+           !bridge_read(devfn, reg, len, value))
+               return 0;
+
+       return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
+}
+
+static int ce4100_conf_write(unsigned int seg, unsigned int bus,
+                            unsigned int devfn, int reg, int len, u32 value)
+{
+       int i;
+
+       if (bus == 1) {
+               for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+                       if (bus1_fixups[i].dev_func == devfn &&
+                           bus1_fixups[i].reg == (reg & ~3) &&
+                           bus1_fixups[i].write) {
+                               bus1_fixups[i].write(&(bus1_fixups[i]),
+                                                    value);
+                               return 0;
+                       }
+               }
+       }
+
+       /* Discard writes to A/V bridge BAR. */
+       if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
+           ((reg & ~3) == PCI_BASE_ADDRESS_0))
+               return 0;
+
+       return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
+}
+
+struct pci_raw_ops ce4100_pci_conf = {
+       .read = ce4100_conf_read,
+       .write = ce4100_conf_write,
+};
+
+static int __init ce4100_pci_init(void)
+{
+       init_sim_regs();
+       raw_pci_ops = &ce4100_pci_conf;
+       return 0;
+}
+subsys_initcall(ce4100_pci_init);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c

index 2492d165096a2a696cf8332534966cfce2c848a7..a5f7d0d63de0def1481382f785d0fc34d553f0f6 100644 (file)
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
  #include <linux/uaccess.h>
  #include <asm/pci_x86.h>
  #include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
  
  /* BIOS32 signature: "_32_" */
  #define BIOS32_SIGNATURE       (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
  #define PCIBIOS_HW_TYPE1_SPEC          0x10
  #define PCIBIOS_HW_TYPE2_SPEC          0x20
  
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we leave an rw and x hole when pcibios is used. This shouldn't
+ * happen on modern systems with mmconfig, and if you don't want it
+ * you can disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+       pcibios_enabled = 1;
+       set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+       if (__supported_pte_mask & _PAGE_NX)
+               printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
  /*
   * This is the standard structure used to identify the entry point
   * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
                         DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
                                         bios32_entry);
                         bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+                       set_bios_x();
                         if (check_pcibios())
                                 return &pci_bios_access;
                 }
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile

index 7bf70b812fa2a436ec8ce1bea270b05af2b2f487..021eee91c0562503dbb68c5bea490674d9db8618 100644 (file)
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,5 +1,7 @@
  # Platform specific code goes here
+obj-y  += ce4100/
  obj-y  += efi/
+obj-y  += iris/
  obj-y  += mrst/
  obj-y  += olpc/
  obj-y  += scx200/
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile

new file mode 100644 (file)

index 0000000..91fc929
--- /dev/null
+++ b/arch/x86/platform/ce4100/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_INTEL_CE)     += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c

new file mode 100644 (file)

index 0000000..d2c0d51
--- /dev/null
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -0,0 +1,132 @@
+/*
+ * Intel CE4100  platform specific setup code
+ *
+ * (C) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
+
+#include <asm/setup.h>
+#include <asm/io.h>
+
+static int ce4100_i8042_detect(void)
+{
+       return 0;
+}
+
+static void __init sdv_find_smp_config(void)
+{
+}
+
+#ifdef CONFIG_SERIAL_8250
+
+
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+       offset = offset << p->regshift;
+       return readl(p->membase + offset);
+}
+
+/*
+ * The UART Tx interrupts are not set under some conditions and therefore serial
+ * transmission hangs. This is a silicon issue and has not been root caused. The
+ * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT
+ * bit of LSR register in interrupt handler to see whether at least one of these
+ * two bits is set, if so then process the transmit request. If this workaround
+ * is not applied, then the serial transmission may hang. This workaround is for
+ * errata number 9 in Errata - B step.
+*/
+
+static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
+{
+       unsigned int ret, ier, lsr;
+
+       if (offset == UART_IIR) {
+               offset = offset << p->regshift;
+               ret = readl(p->membase + offset);
+               if (ret & UART_IIR_NO_INT) {
+                       /* see if the TX interrupt should have really set */
+                       ier = mem_serial_in(p, UART_IER);
+                       /* see if the UART's XMIT interrupt is enabled */
+                       if (ier & UART_IER_THRI) {
+                               lsr = mem_serial_in(p, UART_LSR);
+                               /* now check to see if the UART should be
+                                  generating an interrupt (but isn't) */
+                               if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
+                                       ret &= ~UART_IIR_NO_INT;
+                       }
+               }
+       } else
+               ret =  mem_serial_in(p, offset);
+       return ret;
+}
+
+static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
+{
+       offset = offset << p->regshift;
+       writel(value, p->membase + offset);
+}
+
+static void ce4100_serial_fixup(int port, struct uart_port *up,
+       unsigned short *capabilites)
+{
+#ifdef CONFIG_EARLY_PRINTK
+       /*
+        * Override the legacy port configuration that comes from
+        * asm/serial.h. Using the ioport driver then switching to the
+        * PCI memory-mapped driver hangs the IOAPIC
+        */
+       if (up->iotype !=  UPIO_MEM32) {
+               up->uartclk  = 14745600;
+               up->mapbase = 0xdffe0200;
+               set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
+                               up->mapbase & PAGE_MASK);
+               up->membase =
+                       (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
+               up->membase += up->mapbase & ~PAGE_MASK;
+               up->iotype   = UPIO_MEM32;
+               up->regshift = 2;
+       }
+#endif
+       up->iobase = 0;
+       up->serial_in = ce4100_mem_serial_in;
+       up->serial_out = ce4100_mem_serial_out;
+
+       *capabilites |= (1 << 12);
+}
+
+static __init void sdv_serial_fixup(void)
+{
+       serial8250_set_isa_configurator(ce4100_serial_fixup);
+}
+
+#else
+static inline void sdv_serial_fixup(void) { } /* no 8250 driver: no-op */
+#endif
+
+static void __init sdv_arch_setup(void)
+{
+       sdv_serial_fixup();
+}
+
+/*
+ * CE4100 specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_ce4100_early_setup(void)
+{
+       x86_init.oem.arch_setup = sdv_arch_setup;
+       x86_platform.i8042_detect = ce4100_i8042_detect;
+       x86_init.resources.probe_roms = x86_init_noop;
+       x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+       x86_init.mpparse.find_smp_config = sdv_find_smp_config;
+}
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile

new file mode 100644 (file)

index 0000000..db92198
--- /dev/null
+++ b/arch/x86/platform/iris/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_X86_32_IRIS)              += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c

new file mode 100644 (file)

index 0000000..1ba7f5e
--- /dev/null
+++ b/arch/x86/platform/iris/iris.c
@@ -0,0 +1,91 @@
+/*
+ * Eurobraille/Iris power off support.
+ *
+ * Eurobraille's Iris machine is a PC with no APM or ACPI support.
+ * It is shutdown by a special I/O sequence which this module provides.
+ *
+ *  Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <asm/io.h>
+
+#define IRIS_GIO_BASE          0x340
+#define IRIS_GIO_INPUT         IRIS_GIO_BASE
+#define IRIS_GIO_OUTPUT                (IRIS_GIO_BASE + 1)
+#define IRIS_GIO_PULSE         0x80 /* First byte to send */
+#define IRIS_GIO_REST          0x00 /* Second byte to send */
+#define IRIS_GIO_NODEV         0xff /* Likely not an Iris */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
+MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
+MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
+
+static bool force;     /* matches the bool type given to module_param() */
+
+module_param(force, bool, 0);
+MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
+
+static void (*old_pm_power_off)(void);
+
+static void iris_power_off(void)
+{
+       outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
+       msleep(850);
+       outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
+}
+
+/*
+ * Before installing the power_off handler, try to make sure the OS is
+ * running on an Iris.  Since Iris does not support DMI, this is done
+ * by reading its input port and seeing whether the read value is
+ * meaningful.
+ */
+static int iris_init(void)
+{
+       unsigned char status;
+       if (force != 1) {
+               printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
+               return -ENODEV;
+       }
+       status = inb(IRIS_GIO_INPUT);
+       if (status == IRIS_GIO_NODEV) {
+               printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
+               return -ENODEV;
+       }
+       old_pm_power_off = pm_power_off;
+       pm_power_off = &iris_power_off;
+       printk(KERN_INFO "Iris power_off handler installed.\n");
+
+       return 0;
+}
+
+static void iris_exit(void)
+{
+       pm_power_off = old_pm_power_off;
+       printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+}
+
+module_init(iris_init);
+module_exit(iris_exit);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile

index efbbc552fa953a5bc5e70e8108ec909cb3061252..f61ccdd4934141444f1c8f27d2b3a783f06c37d0 100644 (file)
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1 +1,3 @@
  obj-$(CONFIG_X86_MRST)         += mrst.o
+obj-$(CONFIG_X86_MRST)         += vrtc.o
+obj-$(CONFIG_EARLY_PRINTK_MRST)        += early_printk_mrst.o
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c

new file mode 100644 (file)

index 0000000..65df603
--- /dev/null
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -0,0 +1,319 @@
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT               0x200000
+#define MRST_REGBASE_SPI0              0xff128000
+#define MRST_REGBASE_SPI1              0xff128400
+#define MRST_CLK_SPI0_REG              0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET                 0
+
+#define SPI_FRF_OFFSET                 4
+#define SPI_FRF_SPI                    0x0
+#define SPI_FRF_SSP                    0x1
+#define SPI_FRF_MICROWIRE              0x2
+#define SPI_FRF_RESV                   0x3
+
+#define SPI_MODE_OFFSET                        6
+#define SPI_SCPH_OFFSET                        6
+#define SPI_SCOL_OFFSET                        7
+#define SPI_TMOD_OFFSET                        8
+#define        SPI_TMOD_TR                     0x0             /* xmit & recv */
+#define SPI_TMOD_TO                    0x1             /* xmit only */
+#define SPI_TMOD_RO                    0x2             /* recv only */
+#define SPI_TMOD_EPROMREAD             0x3             /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET               10
+#define SPI_SRL_OFFSET                 11
+#define SPI_CFS_OFFSET                 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK                                0x7f            /* cover 7 bits */
+#define SR_BUSY                                (1 << 0)
+#define SR_TF_NOT_FULL                 (1 << 1)
+#define SR_TF_EMPT                     (1 << 2)
+#define SR_RF_NOT_EMPT                 (1 << 3)
+#define SR_RF_FULL                     (1 << 4)
+#define SR_TX_ERR                      (1 << 5)
+#define SR_DCOL                                (1 << 6)
+
+struct dw_spi_reg {
+       u32     ctrl0;
+       u32     ctrl1;
+       u32     ssienr;
+       u32     mwcr;
+       u32     ser;
+       u32     baudr;
+       u32     txfltr;
+       u32     rxfltr;
+       u32     txflr;
+       u32     rxflr;
+       u32     sr;
+       u32     imr;
+       u32     isr;
+       u32     risr;
+       u32     txoicr;
+       u32     rxoicr;
+       u32     rxuicr;
+       u32     msticr;
+       u32     icr;
+       u32     dmacr;
+       u32     dmatdlr;
+       u32     dmardlr;
+       u32     idr;
+       u32     version;
+
+       /* Currently operates as 32 bits, though only the low 16 bits matter */
+       u32     dr;
+} __packed;
+
+#define dw_readl(dw, name)             __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val)       __raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessible address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+                       enum kmsg_dump_reason reason,
+                       const char *s1, unsigned long l1,
+                       const char *s2, unsigned long l2)
+{
+       int i;
+
+       /* When run to this, we'd better re-init the HW */
+       mrst_early_console_init();
+
+       for (i = 0; i < l1; i++)
+               early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+       for (i = 0; i < l2; i++)
+               early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+       u16 config;
+
+       config = 0xc001;
+       dw_writel(pspi, dr, config);
+}
+
+/* Wrap a char (zero-extended) in a 16-bit max3110 word and send it */
+static void max3110_write_data(char c)
+{
+       u16 data;
+
+       data = 0x8000 | (unsigned char)c;       /* avoid sign extension */
+       dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+       u32 ctrlr0 = 0;
+       u32 spi0_cdiv;
+       u32 freq; /* Freqency info only need be searched once */
+
+       /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+       pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                                       MRST_CLK_SPI0_REG);
+       spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+       freq = 100000000 / (spi0_cdiv + 1);
+
+       if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+               mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+       pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                               mrst_spi_paddr);
+
+       /* Disable SPI controller */
+       dw_writel(pspi, ssienr, 0);
+
+       /* Set control param, 8 bits, transmit only mode */
+       ctrlr0 = dw_readl(pspi, ctrl0);
+
+       ctrlr0 &= 0xfcc0;
+       ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+                     | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+       dw_writel(pspi, ctrl0, ctrlr0);
+
+       /*
+        * Change the spi0 clk to comply with 115200 bps, use 100000 to
+        * calculate the clk divider to make the clock a little slower
+        * than real baud rate.
+        */
+       dw_writel(pspi, baudr, freq/100000);
+
+       /* Disable all INT for early phase */
+       dw_writel(pspi, imr, 0x0);
+
+       /* Set the cs to spi-uart */
+       dw_writel(pspi, ser, 0x2);
+
+       /* Enable the HW, the last step for HW init */
+       dw_writel(pspi, ssienr, 0x1);
+
+       /* Set the default configuration */
+       max3110_write_config();
+
+       /* Register the kmsg dumper */
+       if (!dumper_registered) {
+               dw_dumper.dump = dw_kmsg_dump;
+               kmsg_dump_register(&dw_dumper);
+               dumper_registered = 1;
+       }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+       unsigned int timeout;
+       u32 sr;
+
+       timeout = MRST_SPI_TIMEOUT;
+       /* Early putc needs to make sure the TX FIFO is not full */
+       while (--timeout) {
+               sr = dw_readl(pspi, sr);
+               if (!(sr & SR_TF_NOT_FULL))
+                       cpu_relax();
+               else
+                       break;
+       }
+
+       if (!timeout)
+               pr_warning("MRST earlycon: timed out\n");
+       else
+               max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+       int i;
+
+       for (i = 0; i < n && *str; i++) {
+               if (*str == '\n')
+                       early_mrst_spi_putc('\r');
+               early_mrst_spi_putc(*str);
+               str++;
+       }
+}
+
+struct console early_mrst_console = {
+       .name =         "earlymrst",
+       .write =        early_mrst_spi_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR                0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+       u8 lcr;
+
+       phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+                                                       HSU_PORT2_PADDR);
+
+       /* Disable FIFO */
+       writeb(0x0, phsu + UART_FCR);
+
+       /* Set to default 115200 bps, 8n1 */
+       lcr = readb(phsu + UART_LCR);
+       writeb((0x80 | lcr), phsu + UART_LCR);
+       writeb(0x18, phsu + UART_DLL);
+       writeb(lcr,  phsu + UART_LCR);
+       writel(0x3600, phsu + UART_MUL*4);
+
+       writeb(0x8, phsu + UART_MCR);
+       writeb(0x7, phsu + UART_FCR);
+       writeb(0x3, phsu + UART_LCR);
+
+       /* Clear IRQ status */
+       readb(phsu + UART_LSR);
+       readb(phsu + UART_RX);
+       readb(phsu + UART_IIR);
+       readb(phsu + UART_MSR);
+
+       /* Enable FIFO */
+       writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+       unsigned int timeout = 10000; /* 10ms */
+       u8 status;
+
+       while (--timeout) {
+               status = readb(phsu + UART_LSR);
+               if (status & BOTH_EMPTY)
+                       break;
+               udelay(1);
+       }
+
+       /* Only write the char when there was no timeout */
+       if (timeout)
+               writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+       int i;
+
+       for (i = 0; i < n && *str; i++) {
+               if (*str == '\n')
+                       early_hsu_putc('\r');
+               early_hsu_putc(*str);
+               str++;
+       }
+}
+
+struct console early_hsu_console = {
+       .name =         "earlyhsu",
+       .write =        early_hsu_write,
+       .flags =        CON_PRINTBUFFER,
+       .index =        -1,
+};
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c

index 79ae68154e871fe208ff5fcfd809daf3b03c4ab5..fee0b4914e07ad494f82629ccc10dc0a1980879c 100644 (file)
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -9,9 +9,19 @@
   * as published by the Free Software Foundation; version 2
   * of the License.
   */
+
+#define pr_fmt(fmt) "mrst: " fmt
+
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/sfi.h>
+#include <linux/intel_pmic_gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pca953x.h>
+#include <linux/gpio_keys.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
  #include <linux/irq.h>
  #include <linux/module.h>
  
@@ -23,7 +33,9 @@
  #include <asm/mrst.h>
  #include <asm/io.h>
  #include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
  #include <asm/apb_timer.h>
+#include <asm/reboot.h>
  
  /*
   * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
@@ -102,10 +114,10 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
                 memcpy(sfi_mtimer_array, pentry, totallen);
         }
  
-       printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
+       pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
         pentry = sfi_mtimer_array;
         for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
-               printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
+               pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
                         " irq = %d\n", totallen, (u32)pentry->phys_addr,
                         pentry->freq_hz, pentry->irq);
                         if (!pentry->irq)
@@ -176,14 +188,14 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
                 memcpy(sfi_mrtc_array, pentry, totallen);
         }
  
-       printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
+       pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
         pentry = sfi_mrtc_array;
         for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
-               printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
+               pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
                         totallen, (u32)pentry->phys_addr, pentry->irq);
                 mp_irq.type = MP_IOAPIC;
                 mp_irq.irqtype = mp_INT;
-               mp_irq.irqflag = 0;
+               mp_irq.irqflag = 0xf;   /* level trigger and active low */
                 mp_irq.srcbus = 0;
                 mp_irq.srcbusirq = pentry->irq; /* IRQ */
                 mp_irq.dstapic = MP_APIC_ALL;
@@ -209,6 +221,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
  
  void __init mrst_time_init(void)
  {
+       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
         switch (mrst_timer_options) {
         case MRST_TIMER_APBT_ONLY:
                 break;
@@ -224,16 +237,10 @@ void __init mrst_time_init(void)
                 return;
         }
         /* we need at least one APB timer */
-       sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
         pre_init_apic_IRQ0();
         apbt_time_init();
  }
  
-void __init mrst_rtc_init(void)
-{
-       sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-}
-
  void __cpuinit mrst_arch_setup(void)
  {
         if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
@@ -256,6 +263,17 @@ static int mrst_i8042_detect(void)
         return 0;
  }
  
+/* Reboot and power off are handled by the SCU on a MID device */
+/*
+ * Installed as pm_power_off by x86_mrst_early_setup(); sends the SCU IPC
+ * simple command with arguments (0xf1, 1).
+ * NOTE(review): the meaning of these values is defined by the SCU
+ * firmware interface, not by this file -- confirm against its spec.
+ */
+static void mrst_power_off(void)
+{
+       intel_scu_ipc_simple_command(0xf1, 1);
+}
+
+/*
+ * Installed as machine_ops.emergency_restart by x86_mrst_early_setup();
+ * same SCU IPC command as mrst_power_off() but with second argument 0.
+ */
+static void mrst_reboot(void)
+{
+       intel_scu_ipc_simple_command(0xf1, 0);
+}
+
  /*
   * Moorestown specific x86_init function overrides and early setup
   * calls.
@@ -281,6 +299,10 @@ void __init x86_mrst_early_setup(void)
  
         legacy_pic = &null_legacy_pic;
  
+       /* Moorestown specific power_off/restart method */
+       pm_power_off = mrst_power_off;
+       machine_ops.emergency_restart  = mrst_reboot;
+
         /* Avoid searching for BIOS MP tables */
         x86_init.mpparse.find_smp_config = x86_init_noop;
         x86_init.mpparse.get_smp_config = x86_init_uint_noop;
@@ -309,3 +331,505 @@ static inline int __init setup_x86_mrst_timer(char *arg)
         return 0;
  }
  __setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * Parsing GPIO table first, since the DEVS table will need this table
+ * to map the pin name to the actual pin.
+ */
+static struct sfi_gpio_table_entry *gpio_table;
+static int gpio_num_entry;
+
+/*
+ * SFI "GPIO" table handler: keep a private copy of the table so that
+ * get_gpio_by_name() can translate pin names into pin numbers later.
+ *
+ * Only the first table is recorded; if a copy already exists the call
+ * returns 0 without touching it.  Returns 0 on success, -1 if the copy
+ * cannot be allocated.
+ */
+static int __init sfi_parse_gpio(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_gpio_table_entry *pentry;
+       int num, i;
+
+       if (gpio_table)
+               return 0;
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
+       pentry = (struct sfi_gpio_table_entry *)sb->pentry;
+
+       /* kmemdup() replaces the open-coded (and cast) kmalloc() + memcpy() */
+       gpio_table = kmemdup(pentry, num * sizeof(*pentry), GFP_KERNEL);
+       if (!gpio_table)
+               return -1;
+       gpio_num_entry = num;
+
+       pr_debug("GPIO pin info:\n");
+       for (i = 0; i < num; i++, pentry++)
+               pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
+               " pin = %d\n", i,
+                       pentry->controller_name,
+                       pentry->pin_name,
+                       pentry->pin_no);
+       return 0;
+}
+
+/*
+ * Look up @name in the cached SFI GPIO table.
+ *
+ * Returns the pin number of the first entry whose pin_name matches @name
+ * within SFI_NAME_LEN characters, or -1 when no table has been cached or
+ * nothing matches.
+ */
+static int get_gpio_by_name(const char *name)
+{
+       int idx;
+
+       if (!gpio_table)
+               return -1;
+
+       for (idx = 0; idx < gpio_num_entry; idx++) {
+               struct sfi_gpio_table_entry *entry = &gpio_table[idx];
+
+               if (strncmp(name, entry->pin_name, SFI_NAME_LEN) == 0)
+                       return entry->pin_no;
+       }
+       return -1;
+}
+
+/*
+ * Describes one device that the firmware may export through the SFI
+ * "DEVS" table.  A table entry is matched against these descriptors by
+ * name and type in order to find the routine that builds its platform
+ * data.
+ */
+struct devs_id {
+       /* device name, compared over at most SFI_NAME_LEN characters */
+       char name[SFI_NAME_LEN + 1];
+       /* one of the SFI_DEV_TYPE_* bus types */
+       u8 type;
+       /* non-zero: queue the device for intel_scu_devices_create() */
+       u8 delay;
+       /* builds the platform data; @info is the bus-specific device info */
+       void *(*get_platform_data)(void *info);
+};
+
+/* the offset for the mapping of global gpio pin to irq */
+#define MRST_IRQ_OFFSET 0x100
+
+/*
+ * Build the platform data for the PMIC GPIO block.  The GPIO base comes
+ * from the "pmic_gpio_base" entry of the SFI GPIO table and defaults to
+ * 64 when that entry is absent.  @info is not used.  Returns a pointer
+ * to a single static structure.
+ */
+static void __init *pmic_gpio_platform_data(void *info)
+{
+       static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
+       int base = get_gpio_by_name("pmic_gpio_base");
+
+       if (base == -1)
+               base = 64;
+
+       pmic_gpio_pdata.gpiointr = 0xffffeff8;
+       pmic_gpio_pdata.gpio_base = base;
+       pmic_gpio_pdata.irq_base = base + MRST_IRQ_OFFSET;
+
+       return &pmic_gpio_pdata;
+}
+
+/*
+ * The max3111 needs no platform data; this hook only patches the IRQ of
+ * the spi_board_info passed in @info when the "max3111_int" GPIO is
+ * listed in the SFI GPIO table.  Always returns NULL.
+ */
+static void __init *max3111_platform_data(void *info)
+{
+       struct spi_board_info *spi_info = info;
+       int gpio = get_gpio_by_name("max3111_int");
+
+       if (gpio != -1)
+               spi_info->irq = gpio + MRST_IRQ_OFFSET;
+
+       return NULL;
+}
+
+/* we have multiple max7315 on the board ... */
+#define MAX7315_NUM 2
+/*
+ * Build platform data for one max7315 GPIO expander.
+ *
+ * Each call consumes the next of MAX7315_NUM static slots.  The first
+ * instance uses the GPIO table names "max7315_base"/"max7315_int", later
+ * ones "max7315_<n>_base"/"max7315_<n>_int".  The i2c type in @info is
+ * rewritten to "max7315" so every instance binds to the same driver.
+ *
+ * Returns the filled slot, or NULL when all slots are used or the base
+ * pin is not listed (the slot is still consumed in the latter case).
+ */
+static void __init *max7315_platform_data(void *info)
+{
+       static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
+       static int nr;
+       struct i2c_board_info *i2c_info = info;
+       struct pca953x_platform_data *max7315;
+       char base_pin_name[SFI_NAME_LEN + 1];
+       char intr_pin_name[SFI_NAME_LEN + 1];
+       int gpio_base, intr;
+
+       if (nr == MAX7315_NUM) {
+               pr_err("too many max7315s, we only support %d\n",
+                               MAX7315_NUM);
+               return NULL;
+       }
+       max7315 = &max7315_pdata[nr];
+
+       strcpy(i2c_info->type, "max7315");
+
+       /* Only the first instance uses the un-numbered pin names */
+       nr++;
+       if (nr == 1) {
+               strcpy(base_pin_name, "max7315_base");
+               strcpy(intr_pin_name, "max7315_int");
+       } else {
+               sprintf(base_pin_name, "max7315_%d_base", nr);
+               sprintf(intr_pin_name, "max7315_%d_int", nr);
+       }
+
+       gpio_base = get_gpio_by_name(base_pin_name);
+       intr = get_gpio_by_name(intr_pin_name);
+
+       if (gpio_base == -1)
+               return NULL;
+
+       max7315->gpio_base = gpio_base;
+       if (intr == -1) {
+               i2c_info->irq = -1;
+               max7315->irq_base = -1;
+       } else {
+               i2c_info->irq = intr + MRST_IRQ_OFFSET;
+               max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
+       }
+       return max7315;
+}
+
+/*
+ * emc1403: the "thermal_int" GPIO becomes the i2c IRQ in @info and the
+ * "thermal_alert" GPIO, offset by MRST_IRQ_OFFSET, is handed over as
+ * platform data.  Returns NULL (and changes nothing) unless both pins
+ * are listed in the SFI GPIO table.
+ */
+static void __init *emc1403_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr, intr2nd;
+
+       intr = get_gpio_by_name("thermal_int");
+       intr2nd = get_gpio_by_name("thermal_alert");
+
+       if (intr != -1 && intr2nd != -1) {
+               i2c_info->irq = intr + MRST_IRQ_OFFSET;
+               intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+               return &intr2nd_pdata;
+       }
+       return NULL;
+}
+
+/*
+ * lis331dl: the "accel_int" GPIO becomes the i2c IRQ in @info and the
+ * "accel_2" GPIO, offset by MRST_IRQ_OFFSET, is handed over as platform
+ * data.  Returns NULL (and changes nothing) unless both pins are listed
+ * in the SFI GPIO table.
+ */
+static void __init *lis331dl_platform_data(void *info)
+{
+       static short intr2nd_pdata;
+       struct i2c_board_info *i2c_info = info;
+       int intr, intr2nd;
+
+       intr = get_gpio_by_name("accel_int");
+       intr2nd = get_gpio_by_name("accel_2");
+
+       if (intr != -1 && intr2nd != -1) {
+               i2c_info->irq = intr + MRST_IRQ_OFFSET;
+               intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+               return &intr2nd_pdata;
+       }
+       return NULL;
+}
+
+/* get_platform_data hook for devices that need no platform data */
+static void __init *no_platform_data(void *info)
+{
+       return NULL;
+}
+
+/*
+ * Known SFI devices: { name, bus type, delay, get_platform_data }.
+ * The empty entry terminates the table; the sfi_handle_*_dev() lookups
+ * stop on it and read its zero 'delay' when nothing matched.
+ */
+static const struct devs_id __initconst device_ids[] = {
+       {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
+       {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
+       {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+       {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+       {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
+       {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
+       {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+       {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+       {},
+};
+
+#define MAX_IPCDEVS    24
+static struct platform_device *ipc_devs[MAX_IPCDEVS];
+static int ipc_next_dev;
+
+#define MAX_SCU_SPI    24
+static struct spi_board_info *spi_devs[MAX_SCU_SPI];
+static int spi_next_dev;
+
+#define MAX_SCU_I2C    24
+static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
+static int i2c_bus[MAX_SCU_I2C];
+static int i2c_next_dev;
+
+/*
+ * Queue @pdev so that intel_scu_devices_create() adds it later.  The
+ * device is dropped with an error message when all MAX_IPCDEVS slots
+ * are taken.
+ */
+static void __init intel_scu_device_register(struct platform_device *pdev)
+{
+       if (ipc_next_dev == MAX_IPCDEVS) {
+               /* terminate the message so it is not merged with the next one */
+               pr_err("too many SCU IPC devices\n");
+               return;
+       }
+       ipc_devs[ipc_next_dev++] = pdev;
+}
+
+/*
+ * Queue a copy of @sdev so that intel_scu_devices_create() registers it
+ * later.  A heap copy is taken, so the caller's structure need not
+ * outlive this call.  The device is dropped with an error message when
+ * the queue is full or the copy cannot be allocated.
+ */
+static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
+{
+       struct spi_board_info *new_dev;
+
+       if (spi_next_dev == MAX_SCU_SPI) {
+               pr_err("too many SCU SPI devices\n");
+               return;
+       }
+
+       /* kmemdup() replaces the open-coded kzalloc() + memcpy() */
+       new_dev = kmemdup(sdev, sizeof(*sdev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed spi dev %s\n",
+                       sdev->modalias);
+               return;
+       }
+
+       spi_devs[spi_next_dev++] = new_dev;
+}
+
+/*
+ * Queue a copy of @idev, together with its bus number @bus, so that
+ * intel_scu_devices_create() instantiates it later.  A heap copy is
+ * taken, so the caller's structure need not outlive this call.  The
+ * device is dropped with an error message when the queue is full or the
+ * copy cannot be allocated.
+ */
+static void __init intel_scu_i2c_device_register(int bus,
+                                               struct i2c_board_info *idev)
+{
+       struct i2c_board_info *new_dev;
+
+       if (i2c_next_dev == MAX_SCU_I2C) {
+               pr_err("too many SCU I2C devices\n");
+               return;
+       }
+
+       /* kmemdup() replaces the open-coded kzalloc() + memcpy() */
+       new_dev = kmemdup(idev, sizeof(*idev), GFP_KERNEL);
+       if (!new_dev) {
+               pr_err("failed to alloc mem for delayed i2c dev %s\n",
+                       idev->type);
+               return;
+       }
+
+       i2c_bus[i2c_next_dev] = bus;
+       i2c_devs[i2c_next_dev++] = new_dev;
+}
+
+/*
+ * Called by IPC driver.
+ *
+ * Registers every device queued by the intel_scu_*_device_register()
+ * helpers: platform devices are added, SPI devices are registered as
+ * board info, and I2C devices are instantiated directly when their
+ * adapter already exists or registered as board info otherwise.
+ */
+void intel_scu_devices_create(void)
+{
+       int i;
+
+       for (i = 0; i < ipc_next_dev; i++)
+               platform_device_add(ipc_devs[i]);
+
+       for (i = 0; i < spi_next_dev; i++)
+               spi_register_board_info(spi_devs[i], 1);
+
+       for (i = 0; i < i2c_next_dev; i++) {
+               struct i2c_adapter *adapter;
+               struct i2c_client *client;
+
+               adapter = i2c_get_adapter(i2c_bus[i]);
+               if (adapter) {
+                       client = i2c_new_device(adapter, i2c_devs[i]);
+                       if (!client)
+                               pr_err("can't create i2c device %s\n",
+                                       i2c_devs[i]->type);
+                       /* drop the reference taken by i2c_get_adapter() */
+                       i2c_put_adapter(adapter);
+               } else
+                       i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
+       }
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_create);
+
+/*
+ * Called by IPC driver.
+ *
+ * Removes the queued platform devices again, in the order they were
+ * queued.  Queued SPI and I2C devices are not touched here.
+ */
+void intel_scu_devices_destroy(void)
+{
+       int idx = 0;
+
+       while (idx < ipc_next_dev) {
+               platform_device_del(ipc_devs[idx]);
+               idx++;
+       }
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
+
+/*
+ * Attach a single IRQ resource with number @irq to @pdev.
+ *
+ * One static __initdata resource is reused as scratch space for every
+ * call, which is why the function must not run concurrently.
+ * NOTE(review): this relies on platform_device_add_resources() copying
+ * the resource rather than keeping the pointer -- confirm.
+ */
+static void __init install_irq_resource(struct platform_device *pdev, int irq)
+{
+       /* Single threaded */
+       static struct resource __initdata res = {
+               .name = "IRQ",
+               .flags = IORESOURCE_IRQ,
+       };
+       res.start = irq;
+       platform_device_add_resources(pdev, &res, 1);
+}
+
+/*
+ * Attach platform data to an IPC device found in the DEVS table and
+ * queue it for intel_scu_devices_create().  The platform data stays
+ * NULL when device_ids has no IPC entry with a matching name.
+ */
+static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
+{
+       const struct devs_id *id;
+       void *pdata = NULL;
+
+       for (id = device_ids; id->name[0]; id++) {
+               if (id->type != SFI_DEV_TYPE_IPC)
+                       continue;
+               if (strncmp(id->name, pdev->name, SFI_NAME_LEN))
+                       continue;
+               pdata = id->get_platform_data(pdev);
+               break;
+       }
+
+       pdev->dev.platform_data = pdata;
+       intel_scu_device_register(pdev);
+}
+
+/*
+ * Attach platform data to an SPI device found in the DEVS table, then
+ * either queue it for intel_scu_devices_create() (matching entry has
+ * 'delay' set) or register it as board info right away.
+ */
+static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
+{
+       const struct devs_id *id;
+       void *pdata = NULL;
+
+       for (id = device_ids; id->name[0]; id++) {
+               if (id->type != SFI_DEV_TYPE_SPI)
+                       continue;
+               if (strncmp(id->name, spi_info->modalias, SFI_NAME_LEN))
+                       continue;
+               pdata = id->get_platform_data(spi_info);
+               break;
+       }
+       spi_info->platform_data = pdata;
+
+       /* With no match, id is the table terminator, whose delay is 0 */
+       if (id->delay) {
+               intel_scu_spi_device_register(spi_info);
+               return;
+       }
+       spi_register_board_info(spi_info, 1);
+}
+
+/*
+ * Attach platform data to an I2C device found in the DEVS table, then
+ * either queue it for intel_scu_devices_create() (matching entry has
+ * 'delay' set) or register it as board info on @bus right away.
+ */
+static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
+{
+       const struct devs_id *id;
+       void *pdata = NULL;
+
+       for (id = device_ids; id->name[0]; id++) {
+               if (id->type != SFI_DEV_TYPE_I2C)
+                       continue;
+               if (strncmp(id->name, i2c_info->type, SFI_NAME_LEN))
+                       continue;
+               pdata = id->get_platform_data(i2c_info);
+               break;
+       }
+       i2c_info->platform_data = pdata;
+
+       /* With no match, id is the table terminator, whose delay is 0 */
+       if (id->delay) {
+               intel_scu_i2c_device_register(bus, i2c_info);
+               return;
+       }
+       i2c_register_board_info(bus, i2c_info, 1);
+}
+
+
+/*
+ * SFI "DEVS" table handler.
+ *
+ * For every table entry: program the IOAPIC routing when the entry has a
+ * native IRQ (anything other than 0xff), then build the bus-specific
+ * device description and pass it to the matching sfi_handle_*_dev()
+ * helper.  UART, HSI and unknown device types are ignored.
+ * Always returns 0.
+ */
+static int __init sfi_parse_devs(struct sfi_table_header *table)
+{
+       struct sfi_table_simple *sb;
+       struct sfi_device_table_entry *pentry;
+       struct spi_board_info spi_info;
+       struct i2c_board_info i2c_info;
+       struct platform_device *pdev;
+       int num, i, bus;
+       int ioapic;
+       struct io_apic_irq_attr irq_attr;
+
+       sb = (struct sfi_table_simple *)table;
+       num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
+       pentry = (struct sfi_device_table_entry *)sb->pentry;
+
+       for (i = 0; i < num; i++, pentry++) {
+               if (pentry->irq != (u8)0xff) { /* native RTE case */
+                       /* these SPI2 devices are not exposed to system as PCI
+                        * devices, but they have separate RTE entry in IOAPIC
+                        * so we have to enable them one by one here
+                        */
+                       /* NOTE(review): result of mp_find_ioapic() is not checked */
+                       ioapic = mp_find_ioapic(pentry->irq);
+                       irq_attr.ioapic = ioapic;
+                       irq_attr.ioapic_pin = pentry->irq;
+                       irq_attr.trigger = 1;
+                       irq_attr.polarity = 1;
+                       io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
+               }
+               switch (pentry->type) {
+               case SFI_DEV_TYPE_IPC:
+                       /* ID as IRQ is a hack that will go away */
+                       /*
+                        * NOTE(review): pentry->name is used as a plain C
+                        * string here and in the pr_err() below, while the
+                        * other uses bound it to 16 characters -- confirm
+                        * the table guarantees NUL termination.
+                        */
+                       pdev = platform_device_alloc(pentry->name, pentry->irq);
+                       if (pdev == NULL) {
+                               pr_err("out of memory for SFI platform device '%s'.\n",
+                                                       pentry->name);
+                               continue;
+                       }
+                       install_irq_resource(pdev, pentry->irq);
+                       pr_debug("info[%2d]: IPC bus, name = %16.16s, "
+                               "irq = 0x%2x\n", i, pentry->name, pentry->irq);
+                       sfi_handle_ipc_dev(pdev);
+                       break;
+               case SFI_DEV_TYPE_SPI:
+                       /* spi_info is stack scratch space, reset per entry */
+                       memset(&spi_info, 0, sizeof(spi_info));
+                       strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
+                       spi_info.irq = pentry->irq;
+                       spi_info.bus_num = pentry->host_num;
+                       spi_info.chip_select = pentry->addr;
+                       spi_info.max_speed_hz = pentry->max_freq;
+                       pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
+                               "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
+                               spi_info.bus_num,
+                               spi_info.modalias,
+                               spi_info.irq,
+                               spi_info.max_speed_hz,
+                               spi_info.chip_select);
+                       sfi_handle_spi_dev(&spi_info);
+                       break;
+               case SFI_DEV_TYPE_I2C:
+                       /* i2c_info is stack scratch space, reset per entry */
+                       memset(&i2c_info, 0, sizeof(i2c_info));
+                       bus = pentry->host_num;
+                       strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
+                       i2c_info.irq = pentry->irq;
+                       i2c_info.addr = pentry->addr;
+                       pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
+                               "irq = 0x%2x, addr = 0x%x\n", i, bus,
+                               i2c_info.type,
+                               i2c_info.irq,
+                               i2c_info.addr);
+                       sfi_handle_i2c_dev(bus, &i2c_info);
+                       break;
+               case SFI_DEV_TYPE_UART:
+               case SFI_DEV_TYPE_HSI:
+               default:
+                       ;
+               }
+       }
+       return 0;
+}
+
+/*
+ * arch_initcall: parse the SFI GPIO table first and the DEVS table
+ * second, because the DEVS handlers look pins up by name in the cached
+ * GPIO table.  The results of sfi_table_parse() are ignored; always
+ * returns 0.
+ */
+static int __init mrst_platform_init(void)
+{
+       sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
+       sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
+       return 0;
+}
+arch_initcall(mrst_platform_init);
+
+/*
+ * we will search these buttons in SFI GPIO table (by name)
+ * and register them dynamically. Please add all possible
+ * buttons here, we will shrink them if no GPIO found.
+ *
+ * The -1 in the second column is the gpio number placeholder and the
+ * string is the 'desc' used for the lookup; both are rewritten/read by
+ * pb_keys_init().
+ * NOTE(review): the remaining columns follow the member order of
+ * struct gpio_keys_button, which is not visible here -- confirm.
+ */
+static struct gpio_keys_button gpio_button[] = {
+       {KEY_POWER,             -1, 1, "power_btn",     EV_KEY, 0, 3000},
+       {KEY_PROG1,             -1, 1, "prog_btn1",     EV_KEY, 0, 20},
+       {KEY_PROG2,             -1, 1, "prog_btn2",     EV_KEY, 0, 20},
+       {SW_LID,                -1, 1, "lid_switch",    EV_SW,  0, 20},
+       {KEY_VOLUMEUP,          -1, 1, "vol_up",        EV_KEY, 0, 20},
+       {KEY_VOLUMEDOWN,        -1, 1, "vol_down",      EV_KEY, 0, 20},
+       {KEY_CAMERA,            -1, 1, "camera_full",   EV_KEY, 0, 20},
+       {KEY_CAMERA_FOCUS,      -1, 1, "camera_half",   EV_KEY, 0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw1",        EV_SW,  0, 20},
+       {SW_KEYPAD_SLIDE,       -1, 1, "MagSw2",        EV_SW,  0, 20},
+};
+
+/* Platform data for the "gpio-keys" device registered by pb_keys_init() */
+static struct gpio_keys_platform_data mrst_gpio_keys = {
+       .buttons        = gpio_button,
+       .rep            = 1,
+       .nbuttons       = -1, /* will fill it after search */
+};
+
+/* The button device itself; only registered if at least one GPIO is found */
+static struct platform_device pb_device = {
+       .name           = "gpio-keys",
+       .id             = -1,
+       .dev            = {
+               .platform_data  = &mrst_gpio_keys,
+       },
+};
+
+/*
+ * Resolve the GPIO of every candidate button by name, compact the
+ * buttons that were found to the front of gpio_button[], and register
+ * the "gpio-keys" device when at least one button exists.
+ *
+ * Returns the result of platform_device_register(), or 0 when no button
+ * was found and nothing was registered.
+ */
+static int __init pb_keys_init(void)
+{
+       struct gpio_keys_button *src;
+       struct gpio_keys_button *dst = gpio_button;
+       int found;
+
+       for (src = gpio_button;
+            src < gpio_button + ARRAY_SIZE(gpio_button); src++) {
+               src->gpio = get_gpio_by_name(src->desc);
+               if (src->gpio == -1)
+                       continue;
+
+               /* keep found buttons contiguous at the front of the array */
+               if (dst != src)
+                       *dst = *src;
+               dst++;
+       }
+
+       found = dst - gpio_button;
+       if (!found)
+               return 0;
+
+       mrst_gpio_keys.nbuttons = found;
+       return platform_device_register(&pb_device);
+}
+late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c

new file mode 100644 (file)

index 0000000..32cd7ed
--- /dev/null
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -0,0 +1,165 @@
+/*
+ * vrtc.c: Driver for virtual RTC device on Intel MID platform
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * VRTC is emulated by system controller firmware, the real HW
+ * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
+ * in a memory mapped IO space that is visible to the host IA
+ * processor.
+ *
+ * This driver is based on RTC CMOS driver.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+#include <asm/time.h>
+#include <asm/fixmap.h>
+
+static unsigned char __iomem *vrtc_virt_base;
+
+/*
+ * Read vRTC register @reg.  Registers are spaced four bytes apart in the
+ * mapped window.  Returns 0xff when @reg is above 0xd or the vRTC has
+ * not been mapped yet.
+ */
+unsigned char vrtc_cmos_read(unsigned char reg)
+{
+       unsigned char val = 0xff;
+
+       /* vRTC's registers range from 0x0 to 0xD */
+       if (vrtc_virt_base && reg <= 0xd) {
+               lock_cmos_prefix(reg);
+               val = __raw_readb(vrtc_virt_base + (reg << 2));
+               lock_cmos_suffix(reg);
+       }
+       return val;
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_read);
+
+/*
+ * Write @val to vRTC register @reg.  The write is silently skipped when
+ * @reg is above 0xd or the vRTC has not been mapped yet.
+ */
+void vrtc_cmos_write(unsigned char val, unsigned char reg)
+{
+       if (vrtc_virt_base && reg <= 0xd) {
+               lock_cmos_prefix(reg);
+               __raw_writeb(val, vrtc_virt_base + (reg << 2));
+               lock_cmos_suffix(reg);
+       }
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_write);
+
+/*
+ * Wallclock read hook (installed as x86_platform.get_wallclock by
+ * mrst_rtc_init()): read the date and time registers of the vRTC, log
+ * them, and return the value computed by mktime().
+ */
+unsigned long vrtc_get_time(void)
+{
+       u8 sec, min, hour, mday, mon;
+       u32 year;
+
+       /*
+        * Spin for as long as RTC_UIP is set in RTC_FREQ_SELECT.
+        * NOTE(review): vrtc_cmos_read() returns 0xff while the vRTC is
+        * unmapped; if that has RTC_UIP set this loop would never end --
+        * confirm the hook cannot run before mrst_rtc_init() has mapped it.
+        */
+       while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
+               cpu_relax();
+
+       sec = vrtc_cmos_read(RTC_SECONDS);
+       min = vrtc_cmos_read(RTC_MINUTES);
+       hour = vrtc_cmos_read(RTC_HOURS);
+       mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+       mon = vrtc_cmos_read(RTC_MONTH);
+       year = vrtc_cmos_read(RTC_YEAR);
+
+       /* vRTC YEAR reg contains the offset to 1960 */
+       year += 1960;
+
+       printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
+               "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
+
+       return mktime(year, mon, mday, hour, min, sec);
+}
+
+/* Only care about the minutes and seconds */
+/*
+ * Wallclock write hook (installed as x86_platform.set_wallclock by
+ * mrst_rtc_init()): store the seconds and minutes of @nowtime in the
+ * vRTC.  Hours and date registers are left untouched.  Always returns 0.
+ */
+int vrtc_set_mmss(unsigned long nowtime)
+{
+       int real_sec, real_min;
+       int vrtc_min;
+
+       vrtc_min = vrtc_cmos_read(RTC_MINUTES);
+
+       real_sec = nowtime % 60;
+       /* total minutes; reduced modulo 60 only after the correction below */
+       real_min = nowtime / 60;
+       /*
+        * When the distance to the vRTC minutes, rounded to the nearest
+        * half hour, is an odd number of half hours, shift by 30 minutes.
+        * NOTE(review): presumably this keeps an RTC that runs on a
+        * half-hour offset from being pulled to the full hour -- confirm.
+        */
+       if (((abs(real_min - vrtc_min) + 15)/30) & 1)
+               real_min += 30;
+       real_min %= 60;
+
+       vrtc_cmos_write(real_sec, RTC_SECONDS);
+       vrtc_cmos_write(real_min, RTC_MINUTES);
+       return 0;
+}
+
+/*
+ * Parse the SFI MRTC table and, if it lists at least one RTC, map the
+ * first one through the FIX_LNW_VRTC fixmap slot and install the vRTC
+ * wallclock hooks.  Does nothing further when no RTC entry was found.
+ */
+void __init mrst_rtc_init(void)
+{
+       unsigned long rtc_paddr;
+       void __iomem *virt_base;
+
+       sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+       if (!sfi_mrtc_num)
+               return;
+
+       rtc_paddr = sfi_mrtc_array[0].phys_addr;
+
+       /* vRTC's register address may not be page aligned */
+       set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
+
+       virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
+       /* add the offset of the registers within their page */
+       virt_base += rtc_paddr & ~PAGE_MASK;
+       vrtc_virt_base = virt_base;
+
+       x86_platform.get_wallclock = vrtc_get_time;
+       x86_platform.set_wallclock = vrtc_set_mmss;
+}
+
+/*
+ * The Moorestown platform has a memory mapped virtual RTC device that emulates
+ * the programming interface of the RTC.
+ */
+
+/* start/end of both resources are filled in by mrst_device_create() */
+static struct resource vrtc_resources[] = {
+       [0] = {
+               .flags  = IORESOURCE_MEM,
+       },
+       [1] = {
+               .flags  = IORESOURCE_IRQ,
+       }
+};
+
+/* Platform device registered under the name "rtc_mrst" */
+static struct platform_device vrtc_device = {
+       .name           = "rtc_mrst",
+       .id             = -1,
+       .resource       = vrtc_resources,
+       .num_resources  = ARRAY_SIZE(vrtc_resources),
+};
+
+/*
+ * Register the RTC device if appropriate.
+ *
+ * Fills the memory and IRQ resources of vrtc_device from the first SFI
+ * MRTC entry and registers the device.  Returns -ENODEV when not running
+ * on Moorestown or when no MRTC entry was found, otherwise the result of
+ * platform_device_register().
+ */
+static int __init mrst_device_create(void)
+{
+       /* No Moorestown, no device */
+       if (!mrst_identify_cpu())
+               return -ENODEV;
+       /* No RTC entry in the SFI table, no device */
+       if (!sfi_mrtc_num)
+               return -ENODEV;
+
+       /* iomem resource; .end is inclusive, hence the "- 1" */
+       vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
+       vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
+                               MRST_VRTC_MAP_SZ - 1;
+       /* irq resource */
+       vrtc_resources[1].start = sfi_mrtc_array[0].irq;
+       vrtc_resources[1].end = sfi_mrtc_array[0].irq;
+
+       return platform_device_register(&vrtc_device);
+}
+
+module_init(mrst_device_create);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c

index dd4c281ffe5720c3ff15f1eceaa09759e17df7d1..ca54875ac795117079b7a9521bfd0bf42bf9f980 100644 (file)
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address)
  /* All CPUs enumerated by SFI must be present and enabled */
  static void __cpuinit mp_sfi_register_lapic(u8 id)
  {
-       if (MAX_APICS - id <= 0) {
+       if (MAX_LOCAL_APIC - id <= 0) {
                 pr_warning("Processor #%d invalid (max %d)\n",
-                       id, MAX_APICS);
+                       id, MAX_LOCAL_APIC);
                 return;
         }
  
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c

index ba9caa808a9c1b42c6a616968c57e96769039314..df58e9cad96ae9441a4f86f22900a6e0bf05aa64 100644 (file)
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,7 +1341,7 @@ uv_activation_descriptor_init(int node, int pnode)
  
         /*
          * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-        * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
+        * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
          */
         bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
                                 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
  /*
   * initialize the bau_control structure for each cpu
   */
-static void __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs)
  {
         int i;
         int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
         struct bau_control *smaster = NULL;
         struct socket_desc {
                 short num_cpus;
-               short cpu_number[16];
+               short cpu_number[MAX_CPUS_PER_SOCKET];
         };
         struct uvhub_desc {
                 unsigned short socket_mask;
@@ -1540,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
                 sdp = &bdp->socket[socket];
                 sdp->cpu_number[sdp->num_cpus] = cpu;
                 sdp->num_cpus++;
+               if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
+                       printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+                       return 1;
+               }
         }
         for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
                 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1570,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
                                 bcp->uvhub_master = hmaster;
                                 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
                                                 blade_processor_id;
+                               if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+                                       printk(KERN_EMERG
+                                               "%d cpus per uvhub invalid\n",
+                                               bcp->uvhub_cpu);
+                                       return 1;
+                               }
                         }
  nextsocket:
                         socket++;
@@ -1595,6 +1605,7 @@ nextsocket:
                 bcp->congested_reps = congested_reps;
                 bcp->congested_period = congested_period;
         }
+       return 0;
  }
  
  /*
@@ -1625,7 +1636,10 @@ static int __init uv_bau_init(void)
         spin_lock_init(&disable_lock);
         congested_cycles = microsec_2_cycles(congested_response_us);
  
-       uv_init_per_cpu(nuvhubs);
+       if (uv_init_per_cpu(nuvhubs)) {
+               nobau = 1;
+               return 0;
+       }
  
         uv_partition_base_pnode = 0x7fffffff;
         for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c

index 3371bd053b89f29e14e5f55d88929d2427f123b2..63203767174683b3db1ab410be9263aabe280ab0 100644 (file)
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
         ver = m->apicver;
         if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
                 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
-                       m->apicid, MAX_APICS);
+                       m->apicid, MAX_LOCAL_APIC);
                 return;
         }
  
diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c

index 660a2728908d6e15f8abbe5b2eae43b4a91df503..0cac7ec0d2ece0764806fae7a11994da0520ab23 100644 (file)
--- a/drivers/acpi/acpica/nsinit.c
+++ b/drivers/acpi/acpica/nsinit.c
@@ -577,9 +577,7 @@ acpi_ns_init_one_device(acpi_handle obj_handle,
          * as possible (without an NMI being received in the middle of
          * this) - so disable NMIs and initialize the device:
          */
-       acpi_nmi_disable();
         status = acpi_ns_evaluate(info);
-       acpi_nmi_enable();
  
         if (ACPI_SUCCESS(status)) {
                 walk_info->num_INI++;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c

index 5718566e00f9b27573db228ba74559570ba48f89..d9926afec110997b618b70062d50450847d9d1ff 100644 (file)
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_type id,
  int __init acpi_numa_init(void)
  {
         int ret = 0;
+       int nr_cpu_entries = nr_cpu_ids;
+
+#ifdef CONFIG_X86
+       /*
+        * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
+        * SRAT cpu entries could have different order with that in MADT.
+        * So go over all cpu entries in SRAT to get apicid to node mapping.
+        */
+       nr_cpu_entries = MAX_LOCAL_APIC;
+#endif
  
         /* SRAT: Static Resource Affinity Table */
         if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-                                    acpi_parse_x2apic_affinity, nr_cpu_ids);
+                                    acpi_parse_x2apic_affinity, nr_cpu_entries);
                 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-                                    acpi_parse_processor_affinity, nr_cpu_ids);
+                                    acpi_parse_processor_affinity, nr_cpu_entries);
                 ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
                                             acpi_parse_memory_affinity,
                                             NR_NODE_MEMBLKS);
diff --git a/drivers/atm/atmtcp.c b/drivers/atm/atmtcp.c

index 2b464b631f22697ac5f177d371b94d5646683d8f..0b0625054a87a5697c8e4745bfd49eb2d1bf45c0 100644 (file)
--- a/drivers/atm/atmtcp.c
+++ b/drivers/atm/atmtcp.c
@@ -392,7 +392,10 @@ static int atmtcp_attach(struct atm_vcc *vcc,int itf)
                         atm_dev_put(dev);
                         return -EMEDIUMTYPE;
                 }
-               if (PRIV(dev)->vcc) return -EBUSY;
+               if (PRIV(dev)->vcc) {
+                       atm_dev_put(dev);
+                       return -EBUSY;
+               }
         }
         else {
                 int error;
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c

index 42396df555567660d597ac524631e9020edfbea3..9252e85706ef2ce54728a0c8bc366c80c793033c 100644 (file)
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -38,7 +38,7 @@ static int agp_bridges_found;
  
  static void amd64_tlbflush(struct agp_memory *temp)
  {
-       k8_flush_garts();
+       amd_flush_garts();
  }
  
  static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
         u32 temp;
         struct aper_size_info_32 *values;
  
-       dev = k8_northbridges.nb_misc[0];
+       dev = node_to_amd_nb(0)->misc;
         if (dev==NULL)
                 return 0;
  
@@ -181,16 +181,15 @@ static int amd_8151_configure(void)
         unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
         int i;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return 0;
  
         /* Configure AGP regs in each x86-64 host bridge. */
-       for (i = 0; i < k8_northbridges.num; i++) {
+       for (i = 0; i < amd_nb_num(); i++) {
                 agp_bridge->gart_bus_addr =
-                               amd64_configure(k8_northbridges.nb_misc[i],
-                                               gatt_bus);
+                       amd64_configure(node_to_amd_nb(i)->misc, gatt_bus);
         }
-       k8_flush_garts();
+       amd_flush_garts();
         return 0;
  }
  
@@ -200,11 +199,11 @@ static void amd64_cleanup(void)
         u32 tmp;
         int i;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return;
  
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
                 /* disable gart translation */
                 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
                 tmp &= ~GARTEN;
@@ -331,15 +330,15 @@ static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
  {
         int i;
  
-       if (cache_k8_northbridges() < 0)
+       if (amd_cache_northbridges() < 0)
                 return -ENODEV;
  
-       if (!k8_northbridges.gart_supported)
+       if (!amd_nb_has_feature(AMD_NB_GART))
                 return -ENODEV;
  
         i = 0;
-       for (i = 0; i < k8_northbridges.num; i++) {
-               struct pci_dev *dev = k8_northbridges.nb_misc[i];
+       for (i = 0; i < amd_nb_num(); i++) {
+               struct pci_dev *dev = node_to_amd_nb(i)->misc;
                 if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
                         dev_err(&dev->dev, "no usable aperture found\n");
  #ifdef __x86_64__
@@ -416,7 +415,7 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
         }
  
         /* shadow x86-64 registers into ULi registers */
-       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+       pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
                                &httfea);
  
         /* if x86-64 aperture base is beyond 4G, exit here */
@@ -484,7 +483,7 @@ static int nforce3_agp_init(struct pci_dev *pdev)
         pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
  
         /* shadow x86-64 registers into NVIDIA registers */
-       pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+       pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
                                &apbase);
  
         /* if x86-64 aperture base is beyond 4G, exit here */
@@ -778,7 +777,7 @@ int __init agp_amd64_init(void)
                 }
  
                 /* First check that we have at least one AMD64 NB */
-               if (!pci_dev_present(k8_nb_ids))
+               if (!pci_dev_present(amd_nb_misc_ids))
                         return -ENODEV;
  
                 /* Look for any AGP bridge */
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c

index c63a438237444a0e35fa05bdef9d0d232839c463..1109f6848a43940b8e8ed738f891f6560f17f7a8 100644 (file)
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -355,6 +355,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state)
                 dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
                         (unsigned long)freqs->cpu);
                 trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+               trace_cpu_frequency(freqs->new, freqs->cpu);
                 srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
                                 CPUFREQ_POSTCHANGE, freqs);
                 if (likely(policy) && likely(policy->cpu == freqs->cpu))
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c

index a507108433785f8432b3d15504c3a036b461c1a0..08d5f05378d9efb1df0fe055240e8e8ed9e7a90b 100644 (file)
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -107,6 +107,7 @@ static void cpuidle_idle_call(void)
         if (cpuidle_curr_governor->reflect)
                 cpuidle_curr_governor->reflect(dev);
         trace_power_end(smp_processor_id());
+       trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
  }
  
  /**
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c

index 411d5bf50fc43cab437dff34d3d9f25dd9928fe0..a25f5f61e0e00becc22b5b5b4d8ce1e4ce6e31b8 100644 (file)
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -449,7 +449,7 @@ mv_xor_slot_cleanup(struct mv_xor_chan *mv_chan)
  static void mv_xor_tasklet(unsigned long data)
  {
         struct mv_xor_chan *chan = (struct mv_xor_chan *) data;
-       __mv_xor_slot_cleanup(chan);
+       mv_xor_slot_cleanup(chan);
  }
  
  static struct mv_xor_desc_slot *
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c

index eca9ba193e94a914aa64740f0b4262b8f7c99b81..df211181fca41627cb0bdea2a089cd9d26f7faf4 100644 (file)
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2917,7 +2917,7 @@ static int __init amd64_edac_init(void)
  
         opstate_init();
  
-       if (cache_k8_northbridges() < 0)
+       if (amd_cache_northbridges() < 0)
                 goto err_ret;
  
         msrs = msrs_alloc();
@@ -2934,7 +2934,7 @@ static int __init amd64_edac_init(void)
          * to finish initialization of the MC instances.
          */
         err = -ENODEV;
-       for (nb = 0; nb < k8_northbridges.num; nb++) {
+       for (nb = 0; nb < amd_nb_num(); nb++) {
                 if (!pvt_lookup[nb])
                         continue;
  
diff --git a/drivers/gpu/drm/i915/dvo_ch7017.c b/drivers/gpu/drm/i915/dvo_ch7017.c

index af70337567ce35a0167ffb193d8b1ffae769fedd..d3e8c540f778d01ef97437c3643acbd690716546 100644 (file)
--- a/drivers/gpu/drm/i915/dvo_ch7017.c
+++ b/drivers/gpu/drm/i915/dvo_ch7017.c
@@ -242,7 +242,7 @@ fail:
  
  static enum drm_connector_status ch7017_detect(struct intel_dvo_device *dvo)
  {
-       return connector_status_unknown;
+       return connector_status_connected;
  }
  
  static enum drm_mode_status ch7017_mode_valid(struct intel_dvo_device *dvo,
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c

index e6800819bca846f6a3fd102ef37e2632858247f8..cb900dc83d950e29f99144e0d39c38102c55ce5b 100644 (file)
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -34,6 +34,7 @@
  #include "i915_drm.h"
  #include "i915_drv.h"
  #include "i915_trace.h"
+#include "../../../platform/x86/intel_ips.h"
  #include <linux/pci.h>
  #include <linux/vgaarb.h>
  #include <linux/acpi.h>
@@ -1870,6 +1871,26 @@ out_unlock:
  }
  EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable);
  
+/**
+ * Tells the intel_ips driver that the i915 driver is now loaded, if
+ * IPS got loaded first.
+ *
+ * This awkward dance is so that neither module has to depend on the
+ * other in order for IPS to do the appropriate communication of
+ * GPU turbo limits to i915.
+ */
+static void
+ips_ping_for_i915_load(void)
+{
+       void (*link)(void);
+
+       link = symbol_get(ips_link_to_i915_driver);
+       if (link) {
+               link();
+               symbol_put(ips_link_to_i915_driver);
+       }
+}
+
  /**
   * i915_driver_load - setup chip and create an initial config
   * @dev: DRM device
@@ -2075,6 +2096,8 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
         dev_priv->mchdev_lock = &mchdev_lock;
         spin_unlock(&mchdev_lock);
  
+       ips_ping_for_i915_load();
+
         return 0;
  
  out_workqueue_free:
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h

index 878fc766a12cc05f6b30c90d53d65a52c6023727..cb8f434292793451225eefdbea071aefe7408e79 100644 (file)
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2471,6 +2471,9 @@
  # define MARIUNIT_CLOCK_GATE_DISABLE           (1 << 18)
  # define SVSMUNIT_CLOCK_GATE_DISABLE           (1 << 1)
  
+#define PCH_3DCGDIS1           0x46024
+# define VFMUNIT_CLOCK_GATE_DISABLE            (1 << 11)
+
  #define FDI_PLL_FREQ_CTL        0x46030
  #define  FDI_PLL_FREQ_CHANGE_REQUEST    (1<<24)
  #define  FDI_PLL_FREQ_LOCK_LIMIT_MASK   0xfff00
@@ -2588,6 +2591,13 @@
  #define ILK_DISPLAY_CHICKEN2   0x42004
  #define  ILK_DPARB_GATE        (1<<22)
  #define  ILK_VSDPFD_FULL       (1<<21)
+#define ILK_DISPLAY_CHICKEN_FUSES      0x42014
+#define  ILK_INTERNAL_GRAPHICS_DISABLE (1<<31)
+#define  ILK_INTERNAL_DISPLAY_DISABLE  (1<<30)
+#define  ILK_DISPLAY_DEBUG_DISABLE     (1<<29)
+#define  ILK_HDCP_DISABLE              (1<<25)
+#define  ILK_eDP_A_DISABLE             (1<<24)
+#define  ILK_DESKTOP                   (1<<23)
  #define ILK_DSPCLK_GATE                0x42020
  #define  ILK_DPARB_CLK_GATE    (1<<5)
  /* According to spec this bit 7/8/9 of 0x42020 should be set to enable FBC */
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c

index 2b2078695d2acb9c105a56b8256a87a625f428e1..b0b1200ed6500055b05bc1e51a932df26b6600ef 100644 (file)
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -270,7 +270,7 @@ parse_general_features(struct drm_i915_private *dev_priv,
                                         general->ssc_freq ? 66 : 48;
                         else if (IS_GEN5(dev) || IS_GEN6(dev))
                                 dev_priv->lvds_ssc_freq =
-                                       general->ssc_freq ? 120 : 100;
+                                       general->ssc_freq ? 100 : 120;
                         else
                                 dev_priv->lvds_ssc_freq =
                                         general->ssc_freq ? 100 : 96;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c

index d9b7092439ef59ab19acc9546a039ff60f0bb43c..fca523288acad035b9f3d130fe32e9c3c7719989 100644 (file)
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -5379,6 +5379,23 @@ static int intel_encoder_clones(struct drm_device *dev, int type_mask)
         return index_mask;
  }
  
+static bool has_edp_a(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       if (!IS_MOBILE(dev))
+               return false;
+
+       if ((I915_READ(DP_A) & DP_DETECTED) == 0)
+               return false;
+
+       if (IS_GEN5(dev) &&
+           (I915_READ(ILK_DISPLAY_CHICKEN_FUSES) & ILK_eDP_A_DISABLE))
+               return false;
+
+       return true;
+}
+
  static void intel_setup_outputs(struct drm_device *dev)
  {
         struct drm_i915_private *dev_priv = dev->dev_private;
@@ -5396,7 +5413,7 @@ static void intel_setup_outputs(struct drm_device *dev)
         if (HAS_PCH_SPLIT(dev)) {
                 dpd_is_edp = intel_dpd_is_edp(dev);
  
-               if (IS_MOBILE(dev) && (I915_READ(DP_A) & DP_DETECTED))
+               if (has_edp_a(dev))
                         intel_dp_init(dev, DP_A);
  
                 if (dpd_is_edp && (I915_READ(PCH_DP_D) & DP_DETECTED))
@@ -5825,6 +5842,8 @@ void intel_init_clock_gating(struct drm_device *dev)
                         I915_WRITE(PCH_3DCGDIS0,
                                    MARIUNIT_CLOCK_GATE_DISABLE |
                                    SVSMUNIT_CLOCK_GATE_DISABLE);
+                       I915_WRITE(PCH_3DCGDIS1,
+                                  VFMUNIT_CLOCK_GATE_DISABLE);
                 }
  
                 I915_WRITE(PCH_DSPCLK_GATE_D, dspclk_gate);
diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c

index 27e63abf2a7317d73560bc2919eece5e42208149..6bc42fa2a6ecc152d796611a3e66311b98b3de9f 100644 (file)
--- a/drivers/gpu/drm/i915/intel_sdvo.c
+++ b/drivers/gpu/drm/i915/intel_sdvo.c
@@ -2040,13 +2040,14 @@ intel_sdvo_dvi_init(struct intel_sdvo *intel_sdvo, int device)
                                            SDVO_COLORIMETRY_RGB256);
                 connector->connector_type = DRM_MODE_CONNECTOR_HDMIA;
  
-               intel_sdvo_add_hdmi_properties(intel_sdvo_connector);
                 intel_sdvo->is_hdmi = true;
         }
         intel_sdvo->base.clone_mask = ((1 << INTEL_SDVO_NON_TV_CLONE_BIT) |
                                        (1 << INTEL_ANALOG_CLONE_BIT));
  
         intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo);
+       if (intel_sdvo->is_hdmi)
+               intel_sdvo_add_hdmi_properties(intel_sdvo_connector);
  
         return true;
  }
diff --git a/drivers/hwmon/s3c-hwmon.c b/drivers/hwmon/s3c-hwmon.c

index 05248f2d7581cd02dc191185b083bd30977ec5ba..92b42db43bcfd9e5706c41bfb9049a26fd967b93 100644 (file)
--- a/drivers/hwmon/s3c-hwmon.c
+++ b/drivers/hwmon/s3c-hwmon.c
@@ -234,7 +234,6 @@ static int s3c_hwmon_create_attr(struct device *dev,
         attr->index = channel;
         attr->dev_attr.attr.name  = attrs->in_name;
         attr->dev_attr.attr.mode  = S_IRUGO;
-       attr->dev_attr.attr.owner = THIS_MODULE;
         attr->dev_attr.show = s3c_hwmon_ch_show;
  
         ret =  device_create_file(dev, &attr->dev_attr);
@@ -252,7 +251,6 @@ static int s3c_hwmon_create_attr(struct device *dev,
                 attr->index = channel;
                 attr->dev_attr.attr.name  = attrs->label_name;
                 attr->dev_attr.attr.mode  = S_IRUGO;
-               attr->dev_attr.attr.owner = THIS_MODULE;
                 attr->dev_attr.show = s3c_hwmon_label_show;
  
                 ret = device_create_file(dev, &attr->dev_attr);
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c

index c131d58bcb50d818e05ee0464581bceedfdbf20f..56ac09d6c9308157d6acda8c99bdbdeef91aa771 100644 (file)
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -220,9 +220,8 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
         kt_before = ktime_get_real();
  
         stop_critical_timings();
-#ifndef MODULE
         trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
-#endif
+       trace_cpu_idle((eax >> 4) + 1, cpu);
         if (!need_resched()) {
  
                 __monitor((void *)&current_thread_info()->flags, 0, 0);
diff --git a/drivers/isdn/gigaset/capi.c b/drivers/isdn/gigaset/capi.c

index bcc174e4f3b146db6772cea89c79ac9eb14ce80a..658e75f18d052b5a721ff2e6882109c945318d5b 100644 (file)
--- a/drivers/isdn/gigaset/capi.c
+++ b/drivers/isdn/gigaset/capi.c
@@ -1900,6 +1900,7 @@ static void do_disconnect_req(struct gigaset_capi_ctr *iif,
                 if (b3skb == NULL) {
                         dev_err(cs->dev, "%s: out of memory\n", __func__);
                         send_conf(iif, ap, skb, CAPI_MSGOSRESOURCEERR);
+                       kfree(b3cmsg);
                         return;
                 }
                 capi_cmsg2message(b3cmsg,
diff --git a/drivers/media/video/cx25840/cx25840-core.c b/drivers/media/video/cx25840/cx25840-core.c

index dfb198d0415bc200b0152c1be105d722261653c5..f16461844c5c0063e281da22b8cd3e27fe6d28f4 100644 (file)
--- a/drivers/media/video/cx25840/cx25840-core.c
+++ b/drivers/media/video/cx25840/cx25840-core.c
@@ -1989,8 +1989,23 @@ static int cx25840_probe(struct i2c_client *client,
         v4l2_ctrl_new_std(&state->hdl, &cx25840_ctrl_ops,
                         V4L2_CID_HUE, -128, 127, 1, 0);
         if (!is_cx2583x(state)) {
-               default_volume = 228 - cx25840_read(client, 0x8d4);
-               default_volume = ((default_volume / 2) + 23) << 9;
+               default_volume = cx25840_read(client, 0x8d4);
+               /*
+                * Enforce the legacy PVR-350/MSP3400 to PVR-150/CX25843 volume
+                * scale mapping limits to avoid -ERANGE errors when
+                * initializing the volume control
+                */
+               if (default_volume > 228) {
+                       /* Bottom out at -96 dB, v4l2 vol range 0x2e00-0x2fff */
+                       default_volume = 228;
+                       cx25840_write(client, 0x8d4, 228);
+               }
+               else if (default_volume < 20) {
+                       /* Top out at + 8 dB, v4l2 vol range 0xfe00-0xffff */
+                       default_volume = 20;
+                       cx25840_write(client, 0x8d4, 20);
+               }
+               default_volume = (((228 - default_volume) >> 1) + 23) << 9;
  
                 state->volume = v4l2_ctrl_new_std(&state->hdl,
                         &cx25840_audio_ctrl_ops, V4L2_CID_AUDIO_VOLUME,
diff --git a/drivers/media/video/cx88/cx88-alsa.c b/drivers/media/video/cx88/cx88-alsa.c

index 4aaa47c0eabf54ea8e008a616b0dac531b423ce7..54b7fcd469a8a9b2db267e81ac163d0e579ce31b 100644 (file)
--- a/drivers/media/video/cx88/cx88-alsa.c
+++ b/drivers/media/video/cx88/cx88-alsa.c
@@ -40,7 +40,6 @@
  #include <sound/control.h>
  #include <sound/initval.h>
  #include <sound/tlv.h>
-#include <media/wm8775.h>
  
  #include "cx88.h"
  #include "cx88-reg.h"
@@ -587,47 +586,26 @@ static int snd_cx88_volume_put(struct snd_kcontrol *kcontrol,
         int left, right, v, b;
         int changed = 0;
         u32 old;
-       struct v4l2_control client_ctl;
-
-       /* Pass volume & balance onto any WM8775 */
-       if (value->value.integer.value[0] >= value->value.integer.value[1]) {
-               v = value->value.integer.value[0] << 10;
-               b = value->value.integer.value[0] ?
-                       (0x8000 * value->value.integer.value[1]) / value->value.integer.value[0] :
-                       0x8000;
-       } else {
-               v = value->value.integer.value[1] << 10;
-               b = value->value.integer.value[1] ?
-               0xffff - (0x8000 * value->value.integer.value[0]) / value->value.integer.value[1] :
-               0x8000;
-       }
-       client_ctl.value = v;
-       client_ctl.id = V4L2_CID_AUDIO_VOLUME;
-       call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
-       client_ctl.value = b;
-       client_ctl.id = V4L2_CID_AUDIO_BALANCE;
-       call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
  
         left = value->value.integer.value[0] & 0x3f;
         right = value->value.integer.value[1] & 0x3f;
         b = right - left;
         if (b < 0) {
-               v = 0x3f - left;
-               b = (-b) | 0x40;
+           v = 0x3f - left;
+           b = (-b) | 0x40;
         } else {
-               v = 0x3f - right;
+           v = 0x3f - right;
         }
         /* Do we really know this will always be called with IRQs on? */
         spin_lock_irq(&chip->reg_lock);
         old = cx_read(AUD_VOL_CTL);
         if (v != (old & 0x3f)) {
-               cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, (old & ~0x3f) | v);
-               changed = 1;
+           cx_write(AUD_VOL_CTL, (old & ~0x3f) | v);
+           changed = 1;
         }
-       if ((cx_read(AUD_BAL_CTL) & 0x7f) != b) {
-               cx_write(AUD_BAL_CTL, b);
-               changed = 1;
+       if (cx_read(AUD_BAL_CTL) != b) {
+           cx_write(AUD_BAL_CTL, b);
+           changed = 1;
         }
         spin_unlock_irq(&chip->reg_lock);
  
@@ -640,7 +618,7 @@ static const struct snd_kcontrol_new snd_cx88_volume = {
         .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
         .access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
                   SNDRV_CTL_ELEM_ACCESS_TLV_READ,
-       .name = "Analog-TV Volume",
+       .name = "Playback Volume",
         .info = snd_cx88_volume_info,
         .get = snd_cx88_volume_get,
         .put = snd_cx88_volume_put,
@@ -671,14 +649,7 @@ static int snd_cx88_switch_put(struct snd_kcontrol *kcontrol,
         vol = cx_read(AUD_VOL_CTL);
         if (value->value.integer.value[0] != !(vol & bit)) {
                 vol ^= bit;
-               cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, vol);
-               /* Pass mute onto any WM8775 */
-               if ((1<<6) == bit) {
-                       struct v4l2_control client_ctl;
-                       client_ctl.value = 0 != (vol & bit);
-                       client_ctl.id = V4L2_CID_AUDIO_MUTE;
-                       call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-               }
+               cx_write(AUD_VOL_CTL, vol);
                 ret = 1;
         }
         spin_unlock_irq(&chip->reg_lock);
@@ -687,7 +658,7 @@ static int snd_cx88_switch_put(struct snd_kcontrol *kcontrol,
  
  static const struct snd_kcontrol_new snd_cx88_dac_switch = {
         .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
-       .name = "Audio-Out Switch",
+       .name = "Playback Switch",
         .info = snd_ctl_boolean_mono_info,
         .get = snd_cx88_switch_get,
         .put = snd_cx88_switch_put,
@@ -696,49 +667,13 @@ static const struct snd_kcontrol_new snd_cx88_dac_switch = {
  
  static const struct snd_kcontrol_new snd_cx88_source_switch = {
         .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
-       .name = "Analog-TV Switch",
+       .name = "Capture Switch",
         .info = snd_ctl_boolean_mono_info,
         .get = snd_cx88_switch_get,
         .put = snd_cx88_switch_put,
         .private_value = (1<<6),
  };
  
-static int snd_cx88_alc_get(struct snd_kcontrol *kcontrol,
-                              struct snd_ctl_elem_value *value)
-{
-       snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
-       struct cx88_core *core = chip->core;
-       struct v4l2_control client_ctl;
-
-       client_ctl.id = V4L2_CID_AUDIO_LOUDNESS;
-       call_hw(core, WM8775_GID, core, g_ctrl, &client_ctl);
-       value->value.integer.value[0] = client_ctl.value ? 1 : 0;
-
-       return 0;
-}
-
-static int snd_cx88_alc_put(struct snd_kcontrol *kcontrol,
-                                      struct snd_ctl_elem_value *value)
-{
-       snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
-       struct cx88_core *core = chip->core;
-       struct v4l2_control client_ctl;
-
-       client_ctl.value = 0 != value->value.integer.value[0];
-       client_ctl.id = V4L2_CID_AUDIO_LOUDNESS;
-       call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
-       return 0;
-}
-
-static struct snd_kcontrol_new snd_cx88_alc_switch = {
-       .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
-       .name = "Line-In ALC Switch",
-       .info = snd_ctl_boolean_mono_info,
-       .get = snd_cx88_alc_get,
-       .put = snd_cx88_alc_put,
-};
-
  /****************************************************************************
                         Basic Flow for Sound Devices
   ****************************************************************************/
@@ -860,7 +795,6 @@ static int __devinit cx88_audio_initdev(struct pci_dev *pci,
  {
         struct snd_card  *card;
         snd_cx88_card_t  *chip;
-       struct v4l2_subdev *sd;
         int              err;
  
         if (devno >= SNDRV_CARDS)
@@ -896,15 +830,6 @@ static int __devinit cx88_audio_initdev(struct pci_dev *pci,
         if (err < 0)
                 goto error;
  
-       /* If there's a wm8775 then add a Line-In ALC switch */
-       list_for_each_entry(sd, &chip->core->v4l2_dev.subdevs, list) {
-               if (WM8775_GID == sd->grp_id) {
-                       snd_ctl_add(card, snd_ctl_new1(&snd_cx88_alc_switch,
-                                                      chip));
-                       break;
-               }
-       }
-
         strcpy (card->driver, "CX88x");
         sprintf(card->shortname, "Conexant CX%x", pci->device);
         sprintf(card->longname, "%s at %#llx",
diff --git a/drivers/media/video/cx88/cx88-cards.c b/drivers/media/video/cx88/cx88-cards.c

index 9b9e169cce90862ee92f7d80ab7851cdb3cf81cb..0ccc2afd72668e7d2b8384312b01cde4ebb0e86b 100644 (file)
--- a/drivers/media/video/cx88/cx88-cards.c
+++ b/drivers/media/video/cx88/cx88-cards.c
@@ -1007,15 +1007,22 @@ static const struct cx88_board cx88_boards[] = {
                 .radio_type     = UNSET,
                 .tuner_addr     = ADDR_UNSET,
                 .radio_addr     = ADDR_UNSET,
+               .audio_chip = V4L2_IDENT_WM8775,
                 .input          = {{
                         .type   = CX88_VMUX_DVB,
                         .vmux   = 0,
+                       /* 2: Line-In */
+                       .audioroute = 2,
                 },{
                         .type   = CX88_VMUX_COMPOSITE1,
                         .vmux   = 1,
+                       /* 2: Line-In */
+                       .audioroute = 2,
                 },{
                         .type   = CX88_VMUX_SVIDEO,
                         .vmux   = 2,
+                       /* 2: Line-In */
+                       .audioroute = 2,
                 }},
                 .mpeg           = CX88_MPEG_DVB,
         },
diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c

index 62cea9549404bf29d3d58e004af015dcb132f779..d9249e5a04c9088041a67d9a1b997e26dae64761 100644 (file)
--- a/drivers/media/video/cx88/cx88-video.c
+++ b/drivers/media/video/cx88/cx88-video.c
@@ -40,7 +40,6 @@
  #include "cx88.h"
  #include <media/v4l2-common.h>
  #include <media/v4l2-ioctl.h>
-#include <media/wm8775.h>
  
  MODULE_DESCRIPTION("v4l2 driver module for cx2388x based TV cards");
  MODULE_AUTHOR("Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]");
@@ -977,7 +976,6 @@ int cx88_set_control(struct cx88_core *core, struct v4l2_control *ctl)
         const struct cx88_ctrl *c = NULL;
         u32 value,mask;
         int i;
-       struct v4l2_control client_ctl;
  
         for (i = 0; i < CX8800_CTLS; i++) {
                 if (cx8800_ctls[i].v.id == ctl->id) {
@@ -991,27 +989,6 @@ int cx88_set_control(struct cx88_core *core, struct v4l2_control *ctl)
                 ctl->value = c->v.minimum;
         if (ctl->value > c->v.maximum)
                 ctl->value = c->v.maximum;
-
-       /* Pass changes onto any WM8775 */
-       client_ctl.id = ctl->id;
-       switch (ctl->id) {
-       case V4L2_CID_AUDIO_MUTE:
-               client_ctl.value = ctl->value;
-               break;
-       case V4L2_CID_AUDIO_VOLUME:
-               client_ctl.value = (ctl->value) ?
-                       (0x90 + ctl->value) << 8 : 0;
-               break;
-       case V4L2_CID_AUDIO_BALANCE:
-               client_ctl.value = ctl->value << 9;
-               break;
-       default:
-               client_ctl.id = 0;
-               break;
-       }
-       if (client_ctl.id)
-               call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
         mask=c->mask;
         switch (ctl->id) {
         case V4L2_CID_AUDIO_BALANCE:
@@ -1558,9 +1535,7 @@ static int radio_queryctrl (struct file *file, void *priv,
         if (c->id <  V4L2_CID_BASE ||
                 c->id >= V4L2_CID_LASTP1)
                 return -EINVAL;
-       if (c->id == V4L2_CID_AUDIO_MUTE ||
-               c->id == V4L2_CID_AUDIO_VOLUME ||
-               c->id == V4L2_CID_AUDIO_BALANCE) {
+       if (c->id == V4L2_CID_AUDIO_MUTE) {
                 for (i = 0; i < CX8800_CTLS; i++) {
                         if (cx8800_ctls[i].v.id == c->id)
                                 break;
diff --git a/drivers/media/video/cx88/cx88.h b/drivers/media/video/cx88/cx88.h

index e8c732e7ae4f612dfc5004d7f56ba5bd9b9a8d53..c9981e77416a6f8292721b53333bd9cbba1da35c 100644 (file)
--- a/drivers/media/video/cx88/cx88.h
+++ b/drivers/media/video/cx88/cx88.h
@@ -398,19 +398,17 @@ static inline struct cx88_core *to_core(struct v4l2_device *v4l2_dev)
         return container_of(v4l2_dev, struct cx88_core, v4l2_dev);
  }
  
-#define call_hw(core, grpid, o, f, args...) \
+#define call_all(core, o, f, args...)                          \
         do {                                                    \
                 if (!core->i2c_rc) {                            \
                         if (core->gate_ctrl)                    \
                                 core->gate_ctrl(core, 1);       \
-                       v4l2_device_call_all(&core->v4l2_dev, grpid, o, f, ##args); \
+                       v4l2_device_call_all(&core->v4l2_dev, 0, o, f, ##args); \
                         if (core->gate_ctrl)                    \
                                 core->gate_ctrl(core, 0);       \
                 }                                               \
         } while (0)
  
-#define call_all(core, o, f, args...) call_hw(core, 0, o, f, ##args)
-
  struct cx8800_dev;
  struct cx8802_dev;
  
diff --git a/drivers/media/video/em28xx/em28xx-video.c b/drivers/media/video/em28xx/em28xx-video.c

index 908e3bc88303989c10f6ae8950286a820d2132e9..2c3007280032ea5472e45e9b1c75ee38aaa36b4c 100644 (file)
--- a/drivers/media/video/em28xx/em28xx-video.c
+++ b/drivers/media/video/em28xx/em28xx-video.c
@@ -2377,7 +2377,7 @@ static const struct v4l2_file_operations radio_fops = {
         .owner         = THIS_MODULE,
         .open          = em28xx_v4l2_open,
         .release       = em28xx_v4l2_close,
-       .ioctl         = video_ioctl2,
+       .unlocked_ioctl = video_ioctl2,
  };
  
  static const struct v4l2_ioctl_ops radio_ioctl_ops = {
diff --git a/drivers/media/video/wm8775.c b/drivers/media/video/wm8775.c

index 135525649086401779ce84cfbb90d87b4c5a7131..fe8ef6419f831f36d52347ff275d8c96d4dc8c5f 100644 (file)
--- a/drivers/media/video/wm8775.c
+++ b/drivers/media/video/wm8775.c
@@ -35,7 +35,6 @@
  #include <media/v4l2-device.h>
  #include <media/v4l2-chip-ident.h>
  #include <media/v4l2-ctrls.h>
-#include <media/wm8775.h>
  
  MODULE_DESCRIPTION("wm8775 driver");
  MODULE_AUTHOR("Ulf Eklund, Hans Verkuil");
@@ -51,16 +50,10 @@ enum {
         TOT_REGS
  };
  
-#define ALC_HOLD 0x85 /* R17: use zero cross detection, ALC hold time 42.6 ms */
-#define ALC_EN 0x100  /* R17: ALC enable */
-
  struct wm8775_state {
         struct v4l2_subdev sd;
         struct v4l2_ctrl_handler hdl;
         struct v4l2_ctrl *mute;
-       struct v4l2_ctrl *vol;
-       struct v4l2_ctrl *bal;
-       struct v4l2_ctrl *loud;
         u8 input;               /* Last selected input (0-0xf) */
  };
  
@@ -92,30 +85,6 @@ static int wm8775_write(struct v4l2_subdev *sd, int reg, u16 val)
         return -1;
  }
  
-static void wm8775_set_audio(struct v4l2_subdev *sd, int quietly)
-{
-       struct wm8775_state *state = to_state(sd);
-       u8 vol_l, vol_r;
-       int muted = 0 != state->mute->val;
-       u16 volume = (u16)state->vol->val;
-       u16 balance = (u16)state->bal->val;
-
-       /* normalize ( 65535 to 0 -> 255 to 0 (+24dB to -103dB) ) */
-       vol_l = (min(65536 - balance, 32768) * volume) >> 23;
-       vol_r = (min(balance, (u16)32768) * volume) >> 23;
-
-       /* Mute */
-       if (muted || quietly)
-               wm8775_write(sd, R21, 0x0c0 | state->input);
-
-       wm8775_write(sd, R14, vol_l | 0x100); /* 0x100= Left channel ADC zero cross enable */
-       wm8775_write(sd, R15, vol_r | 0x100); /* 0x100= Right channel ADC zero cross enable */
-
-       /* Un-mute */
-       if (!muted)
-               wm8775_write(sd, R21, state->input);
-}
-
  static int wm8775_s_routing(struct v4l2_subdev *sd,
                             u32 input, u32 output, u32 config)
  {
@@ -133,26 +102,25 @@ static int wm8775_s_routing(struct v4l2_subdev *sd,
         state->input = input;
         if (!v4l2_ctrl_g_ctrl(state->mute))
                 return 0;
-       if (!v4l2_ctrl_g_ctrl(state->vol))
-               return 0;
-       if (!v4l2_ctrl_g_ctrl(state->bal))
-               return 0;
-       wm8775_set_audio(sd, 1);
+       wm8775_write(sd, R21, 0x0c0);
+       wm8775_write(sd, R14, 0x1d4);
+       wm8775_write(sd, R15, 0x1d4);
+       wm8775_write(sd, R21, 0x100 + state->input);
         return 0;
  }
  
  static int wm8775_s_ctrl(struct v4l2_ctrl *ctrl)
  {
         struct v4l2_subdev *sd = to_sd(ctrl);
+       struct wm8775_state *state = to_state(sd);
  
         switch (ctrl->id) {
         case V4L2_CID_AUDIO_MUTE:
-       case V4L2_CID_AUDIO_VOLUME:
-       case V4L2_CID_AUDIO_BALANCE:
-               wm8775_set_audio(sd, 0);
-               return 0;
-       case V4L2_CID_AUDIO_LOUDNESS:
-               wm8775_write(sd, R17, (ctrl->val ? ALC_EN : 0) | ALC_HOLD);
+               wm8775_write(sd, R21, 0x0c0);
+               wm8775_write(sd, R14, 0x1d4);
+               wm8775_write(sd, R15, 0x1d4);
+               if (!ctrl->val)
+                       wm8775_write(sd, R21, 0x100 + state->input);
                 return 0;
         }
         return -EINVAL;
@@ -176,7 +144,16 @@ static int wm8775_log_status(struct v4l2_subdev *sd)
  
  static int wm8775_s_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *freq)
  {
-       wm8775_set_audio(sd, 0);
+       struct wm8775_state *state = to_state(sd);
+
+       /* If I remove this, then it can happen that I have no
+          sound the first time I tune from static to a valid channel.
+          It's difficult to reproduce and is almost certainly related
+          to the zero cross detect circuit. */
+       wm8775_write(sd, R21, 0x0c0);
+       wm8775_write(sd, R14, 0x1d4);
+       wm8775_write(sd, R15, 0x1d4);
+       wm8775_write(sd, R21, 0x100 + state->input);
         return 0;
  }
  
@@ -226,7 +203,6 @@ static int wm8775_probe(struct i2c_client *client,
  {
         struct wm8775_state *state;
         struct v4l2_subdev *sd;
-       int err;
  
         /* Check if the adapter supports the needed features */
         if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA))
@@ -240,21 +216,15 @@ static int wm8775_probe(struct i2c_client *client,
                 return -ENOMEM;
         sd = &state->sd;
         v4l2_i2c_subdev_init(sd, client, &wm8775_ops);
-       sd->grp_id = WM8775_GID; /* subdev group id */
         state->input = 2;
  
-       v4l2_ctrl_handler_init(&state->hdl, 4);
+       v4l2_ctrl_handler_init(&state->hdl, 1);
         state->mute = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
                         V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0);
-       state->vol = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
-                       V4L2_CID_AUDIO_VOLUME, 0, 65535, (65535+99)/100, 0xCF00); /* 0dB*/
-       state->bal = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
-                       V4L2_CID_AUDIO_BALANCE, 0, 65535, (65535+99)/100, 32768);
-       state->loud = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
-                       V4L2_CID_AUDIO_LOUDNESS, 0, 1, 1, 1);
         sd->ctrl_handler = &state->hdl;
-       err = state->hdl.error;
-       if (err) {
+       if (state->hdl.error) {
+               int err = state->hdl.error;
+
                 v4l2_ctrl_handler_free(&state->hdl);
                 kfree(state);
                 return err;
@@ -266,25 +236,29 @@ static int wm8775_probe(struct i2c_client *client,
         wm8775_write(sd, R23, 0x000);
         /* Disable zero cross detect timeout */
         wm8775_write(sd, R7, 0x000);
-       /* HPF enable, I2S mode, 24-bit */
-       wm8775_write(sd, R11, 0x022);
+       /* Left justified, 24-bit mode */
+       wm8775_write(sd, R11, 0x021);
         /* Master mode, clock ratio 256fs */
         wm8775_write(sd, R12, 0x102);
         /* Powered up */
         wm8775_write(sd, R13, 0x000);
-       /* ALC stereo, ALC target level -5dB FS, ALC max gain +8dB */
-       wm8775_write(sd, R16, 0x1bb);
-       /* Set ALC mode and hold time */
-       wm8775_write(sd, R17, (state->loud->val ? ALC_EN : 0) | ALC_HOLD);
+       /* ADC gain +2.5dB, enable zero cross */
+       wm8775_write(sd, R14, 0x1d4);
+       /* ADC gain +2.5dB, enable zero cross */
+       wm8775_write(sd, R15, 0x1d4);
+       /* ALC Stereo, ALC target level -1dB FS max gain +8dB */
+       wm8775_write(sd, R16, 0x1bf);
+       /* Enable gain control, use zero cross detection,
+          ALC hold time 42.6 ms */
+       wm8775_write(sd, R17, 0x185);
         /* ALC gain ramp up delay 34 s, ALC gain ramp down delay 33 ms */
         wm8775_write(sd, R18, 0x0a2);
         /* Enable noise gate, threshold -72dBfs */
         wm8775_write(sd, R19, 0x005);
-       /* Transient window 4ms, ALC min gain -5dB  */
-       wm8775_write(sd, R20, 0x0fb);
-
-       wm8775_set_audio(sd, 1);      /* set volume/mute/mux */
-
+       /* Transient window 4ms, lower PGA gain limit -1dB */
+       wm8775_write(sd, R20, 0x07a);
+       /* LRBOTH = 1, use input 2. */
+       wm8775_write(sd, R21, 0x102);
         return 0;
  }
  
diff --git a/drivers/net/atlx/atl1.c b/drivers/net/atlx/atl1.c

index 53363108994ee93bed670940b0eb91170a389456..3acf5123a6efa8c0e21d76b7deb487a76d21ea3e 100644 (file)
--- a/drivers/net/atlx/atl1.c
+++ b/drivers/net/atlx/atl1.c
@@ -3504,6 +3504,8 @@ static int atl1_set_ringparam(struct net_device *netdev,
         struct atl1_rfd_ring rfd_old, rfd_new;
         struct atl1_rrd_ring rrd_old, rrd_new;
         struct atl1_ring_header rhdr_old, rhdr_new;
+       struct atl1_smb smb;
+       struct atl1_cmb cmb;
         int err;
  
         tpd_old = adapter->tpd_ring;
@@ -3544,11 +3546,19 @@ static int atl1_set_ringparam(struct net_device *netdev,
                 adapter->rrd_ring = rrd_old;
                 adapter->tpd_ring = tpd_old;
                 adapter->ring_header = rhdr_old;
+               /*
+                * Save SMB and CMB, since atl1_free_ring_resources
+                * will clear them.
+                */
+               smb = adapter->smb;
+               cmb = adapter->cmb;
                 atl1_free_ring_resources(adapter);
                 adapter->rfd_ring = rfd_new;
                 adapter->rrd_ring = rrd_new;
                 adapter->tpd_ring = tpd_new;
                 adapter->ring_header = rhdr_new;
+               adapter->smb = smb;
+               adapter->cmb = cmb;
  
                 err = atl1_up(adapter);
                 if (err)
diff --git a/drivers/net/cnic.c b/drivers/net/cnic.c

index 92bac19ad60ab0be5383b7566f48cc5dbbf2ab8e..6dff32196c92bcb65d10bda3c54cbb0048dee416 100644 (file)
--- a/drivers/net/cnic.c
+++ b/drivers/net/cnic.c
@@ -940,7 +940,7 @@ static int cnic_alloc_uio_rings(struct cnic_dev *dev, int pages)
                                            &udev->l2_ring_map,
                                            GFP_KERNEL | __GFP_COMP);
         if (!udev->l2_ring)
-               return -ENOMEM;
+               goto err_udev;
  
         udev->l2_buf_size = (cp->l2_rx_ring_size + 1) * cp->l2_single_buf_size;
         udev->l2_buf_size = PAGE_ALIGN(udev->l2_buf_size);
@@ -948,7 +948,7 @@ static int cnic_alloc_uio_rings(struct cnic_dev *dev, int pages)
                                           &udev->l2_buf_map,
                                           GFP_KERNEL | __GFP_COMP);
         if (!udev->l2_buf)
-               return -ENOMEM;
+               goto err_dma;
  
         write_lock(&cnic_dev_lock);
         list_add(&udev->list, &cnic_udev_list);
@@ -959,6 +959,12 @@ static int cnic_alloc_uio_rings(struct cnic_dev *dev, int pages)
         cp->udev = udev;
  
         return 0;
+ err_dma:
+       dma_free_coherent(&udev->pdev->dev, udev->l2_ring_size,
+                         udev->l2_ring, udev->l2_ring_map);
+ err_udev:
+       kfree(udev);
+       return -ENOMEM;
  }
  
  static int cnic_init_uio(struct cnic_dev *dev)
diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c

index 1f37ee6b2a2626282fd5a772cc21f821321b9379..d6cf502906cfeeafa41b1072a9aadf8a97a5b475 100644 (file)
--- a/drivers/net/ehea/ehea_ethtool.c
+++ b/drivers/net/ehea/ehea_ethtool.c
@@ -263,6 +263,13 @@ static void ehea_get_ethtool_stats(struct net_device *dev,
  
  static int ehea_set_flags(struct net_device *dev, u32 data)
  {
+       /* Avoid changing the VLAN flags */
+       if ((data & (ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN)) !=
+           (ethtool_op_get_flags(dev) & (ETH_FLAG_RXVLAN |
+                                         ETH_FLAG_TXVLAN))){
+               return -EINVAL;
+       }
+
         return ethtool_op_set_flags(dev, data, ETH_FLAG_LRO
                                         | ETH_FLAG_TXVLAN
                                         | ETH_FLAG_RXVLAN);
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c

index 39659976a1acfb1589834368aab20080c894d6eb..89294b43c4a901f8ce927e1981ad317b95538f44 100644 (file)
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1285,6 +1285,11 @@ ppp_push(struct ppp *ppp)
  }
  
  #ifdef CONFIG_PPP_MULTILINK
+static bool mp_protocol_compress __read_mostly = true;
+module_param(mp_protocol_compress, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(mp_protocol_compress,
+                "compress protocol id in multilink fragments");
+
  /*
   * Divide a packet to be transmitted into fragments and
   * send them out the individual links.
@@ -1347,10 +1352,10 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
         if (nfree == 0 || nfree < navail / 2)
                 return 0; /* can't take now, leave it in xmit_pending */
  
-       /* Do protocol field compression (XXX this should be optional) */
+       /* Do protocol field compression */
         p = skb->data;
         len = skb->len;
-       if (*p == 0) {
+       if (*p == 0 && mp_protocol_compress) {
                 ++p;
                 --len;
         }
diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c

index 0a66fed52e8ed48e9a82bfdcfb831957acc491cf..16c62659cdd96040e7831d0e7d7e18bc58832752 100644 (file)
--- a/drivers/net/skfp/skfddi.c
+++ b/drivers/net/skfp/skfddi.c
@@ -412,7 +412,7 @@ static  int skfp_driver_init(struct net_device *dev)
                 bp->SharedMemAddr = pci_alloc_consistent(&bp->pdev,
                                                          bp->SharedMemSize,
                                                          &bp->SharedMemDMA);
-               if (!bp->SharedMemSize) {
+               if (!bp->SharedMemAddr) {
                         printk("could not allocate mem for ");
                         printk("hardware module: %ld byte\n",
                                bp->SharedMemSize);
diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c

index 4adf124227877e704fe66a7bb74919738bbd46c5..a4f2bd52e546f522995df2fb7fae5721f94e0244 100644 (file)
--- a/drivers/net/starfire.c
+++ b/drivers/net/starfire.c
@@ -148,7 +148,7 @@ static int full_duplex[MAX_UNITS] = {0, };
   * This SUCKS.
   * We need a much better method to determine if dma_addr_t is 64-bit.
   */
-#if (defined(__i386__) && defined(CONFIG_HIGHMEM64G)) || defined(__x86_64__) || defined (__ia64__) || defined(__alpha__) || defined(__mips64__) || (defined(__mips__) && defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) || (defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT))
+#if (defined(__i386__) && defined(CONFIG_HIGHMEM64G)) || defined(__x86_64__) || defined (__ia64__) || defined(__alpha__) || (defined(CONFIG_MIPS) && ((defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) || defined(CONFIG_64BIT))) || (defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT))
  /* 64-bit dma_addr_t */
  #define ADDR_64BITS    /* This chip uses 64 bit addresses. */
  #define netdrv_addr_t __le64
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c

index 30ccbb6d097af220dced34f430723a3b3c5a9dca..6f97b7bbcbf13a29b9f2c567eb317a12ec51609c 100644 (file)
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -12658,7 +12658,7 @@ static void __devinit tg3_read_vpd(struct tg3 *tp)
                         cnt = pci_read_vpd(tp->pdev, pos,
                                            TG3_NVM_VPD_LEN - pos,
                                            &vpd_data[pos]);
-                       if (cnt == -ETIMEDOUT || -EINTR)
+                       if (cnt == -ETIMEDOUT || cnt == -EINTR)
                                 cnt = 0;
                         else if (cnt < 0)
                                 goto out_not_found;
diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c

index c44a5e8b8b82da9d06706d9cd3a3ec0fcb2b883c..f0b3ad13c273fb73394cc78ca8fb36521a8a4c90 100644 (file)
--- a/drivers/platform/x86/intel_ips.c
+++ b/drivers/platform/x86/intel_ips.c
@@ -75,6 +75,7 @@
  #include <drm/i915_drm.h>
  #include <asm/msr.h>
  #include <asm/processor.h>
+#include "intel_ips.h"
  
  #define PCI_DEVICE_ID_INTEL_THERMAL_SENSOR 0x3b32
  
@@ -245,6 +246,7 @@
  #define thm_writel(off, val) writel((val), ips->regmap + (off))
  
  static const int IPS_ADJUST_PERIOD = 5000; /* ms */
+static bool late_i915_load = false;
  
  /* For initial average collection */
  static const int IPS_SAMPLE_PERIOD = 200; /* ms */
@@ -339,6 +341,9 @@ struct ips_driver {
         u64 orig_turbo_ratios;
  };
  
+static bool
+ips_gpu_turbo_enabled(struct ips_driver *ips);
+
  /**
   * ips_cpu_busy - is CPU busy?
   * @ips: IPS driver struct
@@ -517,7 +522,7 @@ static void ips_disable_cpu_turbo(struct ips_driver *ips)
   */
  static bool ips_gpu_busy(struct ips_driver *ips)
  {
-       if (!ips->gpu_turbo_enabled)
+       if (!ips_gpu_turbo_enabled(ips))
                 return false;
  
         return ips->gpu_busy();
@@ -532,7 +537,7 @@ static bool ips_gpu_busy(struct ips_driver *ips)
   */
  static void ips_gpu_raise(struct ips_driver *ips)
  {
-       if (!ips->gpu_turbo_enabled)
+       if (!ips_gpu_turbo_enabled(ips))
                 return;
  
         if (!ips->gpu_raise())
@@ -549,7 +554,7 @@ static void ips_gpu_raise(struct ips_driver *ips)
   */
  static void ips_gpu_lower(struct ips_driver *ips)
  {
-       if (!ips->gpu_turbo_enabled)
+       if (!ips_gpu_turbo_enabled(ips))
                 return;
  
         if (!ips->gpu_lower())
@@ -1454,6 +1459,31 @@ out_err:
         return false;
  }
  
+/**
+ * ips_gpu_turbo_enabled - report whether GPU turbo may be used
+ * @ips: IPS driver struct
+ *
+ * If the i915 callbacks are not bound yet (ips->gpu_busy is NULL) and the
+ * late_i915_load flag has been set, retry ips_get_i915_syms().  When that
+ * succeeds, gpu_turbo_enabled is recomputed from the HTS_GTD_DIS bit of
+ * THM_HTS before being returned.
+ *
+ * Returns the (possibly just refreshed) ips->gpu_turbo_enabled value.
+ */
+static bool
+ips_gpu_turbo_enabled(struct ips_driver *ips)
+{
+       if (!ips->gpu_busy && late_i915_load) {
+               if (ips_get_i915_syms(ips)) {
+                       dev_info(&ips->dev->dev,
+                                "i915 driver attached, reenabling gpu turbo\n");
+                       ips->gpu_turbo_enabled = !(thm_readl(THM_HTS) & HTS_GTD_DIS);
+               }
+       }
+
+       return ips->gpu_turbo_enabled;
+}
+
+/**
+ * ips_link_to_i915_driver - note that the i915 driver is now available
+ *
+ * Only sets the late_i915_load flag; the symbol lookup itself is retried
+ * later from ips_gpu_turbo_enabled().  Declared as taking no arguments
+ * (void) so the definition is a real prototype matching intel_ips.h.
+ */
+void
+ips_link_to_i915_driver(void)
+{
+       /* We can't cleanly get at the various ips_driver structs from
+        * this caller (the i915 driver), so just set a flag saying
+        * that it's time to try getting the symbols again.
+        */
+       late_i915_load = true;
+}
+EXPORT_SYMBOL_GPL(ips_link_to_i915_driver);
+
  static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = {
         { PCI_DEVICE(PCI_VENDOR_ID_INTEL,
                      PCI_DEVICE_ID_INTEL_THERMAL_SENSOR), },
diff --git a/drivers/platform/x86/intel_ips.h b/drivers/platform/x86/intel_ips.h

new file mode 100644 (file)

index 0000000..73299be
--- /dev/null
+++ b/drivers/platform/x86/intel_ips.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ */
+
+void ips_link_to_i915_driver(void);
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c

index 41a9e34899ac5f81da6dde61f6aa2a1d34e2e134..ca35b0ce944a58ca017ff5b5678b73a468d7a42a 100644 (file)
--- a/drivers/platform/x86/intel_scu_ipc.c
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -26,6 +26,7 @@
  #include <linux/sfi.h>
  #include <asm/mrst.h>
  #include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
  
  /* IPC defines the following message types */
  #define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
@@ -699,6 +700,9 @@ static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
                 iounmap(ipcdev.ipc_base);
                 return -ENOMEM;
         }
+
+       intel_scu_devices_create();
+
         return 0;
  }
  
@@ -720,6 +724,7 @@ static void ipc_remove(struct pci_dev *pdev)
         iounmap(ipcdev.ipc_base);
         iounmap(ipcdev.i2c_base);
         ipcdev.pdev = NULL;
+       intel_scu_devices_destroy();
  }
  
  static const struct pci_device_id pci_ids[] = {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig

index 2883428d5ac806408b5082221036ffa30915356c..4941cade319f5cef06d508d0b1f1354d951c1034 100644 (file)
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -463,6 +463,18 @@ config RTC_DRV_CMOS
           This driver can also be built as a module. If so, the module
           will be called rtc-cmos.
  
+config RTC_DRV_VRTC
+       tristate "Virtual RTC for Moorestown platforms"
+       depends on X86_MRST
+       default y if X86_MRST
+
+       help
+       Say "yes" here to get direct support for the real time clock
+       found on Moorestown platforms. The VRTC is an emulated RTC that
+       derives its clock source from a real RTC in the PMIC. The MC146818
+       style programming interface is mostly conserved, but any
+       updates are done via IPC calls to the system controller FW.
+
  config RTC_DRV_DS1216
         tristate "Dallas DS1216"
         depends on SNI_RM
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile

index 4c2832df4697d3cdba4af02990c683dc13118776..2afdaf3ff98660f53c72a189503786ebeac0a27c 100644 (file)
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_RTC_DRV_CMOS)    += rtc-cmos.o
  obj-$(CONFIG_RTC_DRV_COH901331)        += rtc-coh901331.o
  obj-$(CONFIG_RTC_DRV_DAVINCI)  += rtc-davinci.o
  obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
+obj-$(CONFIG_RTC_DRV_VRTC)     += rtc-mrst.o
  obj-$(CONFIG_RTC_DRV_DS1216)   += rtc-ds1216.o
  obj-$(CONFIG_RTC_DRV_DS1286)   += rtc-ds1286.o
  obj-$(CONFIG_RTC_DRV_DS1302)   += rtc-ds1302.o
diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c

new file mode 100644 (file)

index 0000000..bcd0cf6
--- /dev/null
+++ b/drivers/rtc/rtc-mrst.c
@@ -0,0 +1,582 @@
+/*
+ * rtc-mrst.c: Driver for Moorestown virtual RTC
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *        Feng Tang (feng.tang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * VRTC is emulated by system controller firmware, the real HW
+ * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
+ * in a memory mapped IO space that is visible to the host IA
+ * processor.
+ *
+ * This driver is based upon drivers/rtc/rtc-cmos.c
+ */
+
+/*
+ * Note:
+ *  * vRTC only supports binary mode and 24H mode
+ *  * vRTC only supports PIE and AIE, no UIE, and its PIE only happens
+ *    at 23:59:59 every day, no support for adjustable frequency
+ *  * Alarm function is also limited to hr/min/sec.
+ */
+
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+
+#include <asm-generic/rtc.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+
+/*
+ * Driver state for the single vRTC instance.
+ *
+ * rtc, dev, irq and iomem are filled in by vrtc_mrst_do_probe() (the
+ * registered rtc device, the owning device, the IRQ number and the
+ * claimed memory region).  enabled_wake and suspend_ctrl are not touched
+ * by the code visible here -- presumably used by suspend/resume handling
+ * later in the file; TODO confirm.
+ */
+struct mrst_rtc {
+       struct rtc_device       *rtc;
+       struct device           *dev;
+       int                     irq;
+       struct resource         *iomem;
+
+       u8                      enabled_wake;
+       u8                      suspend_ctrl;
+};
+
+static const char driver_name[] = "rtc_mrst";
+
+#define        RTC_IRQMASK     (RTC_PF | RTC_AF)
+
+/*
+ * Decide whether an interrupt-flags value describes an interrupt this
+ * driver handles.  Returns 0 when RTC_IRQF is clear; otherwise returns
+ * the RTC_PF/RTC_AF bits that are set (which may still be 0).
+ */
+static inline int is_intr(u8 rtc_intr)
+{
+       if (!(rtc_intr & RTC_IRQF))
+               return 0;
+       return rtc_intr & RTC_IRQMASK;
+}
+
+/*
+ * rtc_time's year contains the increment over 1900, but vRTC's YEAR
+ * register can't be programmed to a value larger than 0x64, so the vRTC
+ * driver chose to use 1960 (1970 is UNIX time start point) as the base,
+ * and does the translation at read/write time.
+ *
+ * Why not just use 1970 as the offset? Because using 1960 will
+ * make it consistent in leap year setting for both vrtc and low-level
+ * physical rtc devices.
+ *
+ * mrst_read_time() fills @time from the vRTC registers (read under
+ * rtc_lock) and converts the year/month to struct rtc_time conventions.
+ * Returns 0 on success, as the rtc class expects from ->read_time.
+ */
+static int mrst_read_time(struct device *dev, struct rtc_time *time)
+{
+       unsigned long flags;
+
+       /* Give an in-progress update time to finish before sampling. */
+       if (rtc_is_updating())
+               mdelay(20);
+
+       spin_lock_irqsave(&rtc_lock, flags);
+       time->tm_sec = vrtc_cmos_read(RTC_SECONDS);
+       time->tm_min = vrtc_cmos_read(RTC_MINUTES);
+       time->tm_hour = vrtc_cmos_read(RTC_HOURS);
+       time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+       time->tm_mon = vrtc_cmos_read(RTC_MONTH);
+       time->tm_year = vrtc_cmos_read(RTC_YEAR);
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       /* Adjust for the 1960/1900 */
+       time->tm_year += 60;
+       time->tm_mon--;
+       /*
+        * Report success as 0: a non-zero value such as RTC_24H is seen
+        * as a failure by callers that test the result against zero.
+        */
+       return 0;
+}
+
+/*
+ * Program a new time into the vRTC registers and then issue the
+ * IPC_CMD_VRTC_SETTIME command through the SCU IPC interface.
+ *
+ * Only tm_year values 70..138 (1970..2038) are accepted; anything else
+ * yields -EINVAL.  Otherwise the return value is whatever
+ * intel_scu_ipc_simple_command() returns.
+ */
+static int mrst_set_time(struct device *dev, struct rtc_time *time)
+{
+       int ret;
+       unsigned long flags;
+       unsigned char mon, day, hrs, min, sec;
+       unsigned int yrs;
+
+       yrs = time->tm_year;
+       mon = time->tm_mon + 1;   /* tm_mon starts at zero */
+       day = time->tm_mday;
+       hrs = time->tm_hour;
+       min = time->tm_min;
+       sec = time->tm_sec;
+
+       if (yrs < 70 || yrs > 138)
+               return -EINVAL;
+       /* Convert from years-since-1900 to the 1960 base used by the vRTC */
+       yrs -= 60;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       vrtc_cmos_write(yrs, RTC_YEAR);
+       vrtc_cmos_write(mon, RTC_MONTH);
+       vrtc_cmos_write(day, RTC_DAY_OF_MONTH);
+       vrtc_cmos_write(hrs, RTC_HOURS);
+       vrtc_cmos_write(min, RTC_MINUTES);
+       vrtc_cmos_write(sec, RTC_SECONDS);
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+
+       /* The IPC call is made after rtc_lock has been dropped */
+       ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME);
+       return ret;
+}
+
+/*
+ * Read the programmed alarm (hour/minute/second only) into @t.
+ *
+ * Returns -EIO when no usable IRQ is recorded (mrst->irq <= 0), else 0.
+ * t->enabled mirrors the RTC_AIE bit of RTC_CONTROL; t->pending is always
+ * reported as 0.
+ */
+static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char rtc_control;
+
+       if (mrst->irq <= 0)
+               return -EIO;
+
+       /* Basic alarms only support hour, minute, and seconds fields.
+        * Some also support day and month, for alarms up to a year in
+        * the future.
+        */
+       /* Date fields are not read from the device; report them as -1 */
+       t->time.tm_mday = -1;
+       t->time.tm_mon = -1;
+       t->time.tm_year = -1;
+
+       /* vRTC only supports binary mode */
+       spin_lock_irq(&rtc_lock);
+       t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
+       t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM);
+       t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM);
+
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       spin_unlock_irq(&rtc_lock);
+
+       t->enabled = !!(rtc_control & RTC_AIE);
+       t->pending = 0;
+
+       return 0;
+}
+
+/*
+ * Read RTC_INTR_FLAGS, keep only the flags whose enable bit is set in
+ * @rtc_control (plus RTC_IRQF), and forward any resulting interrupt to
+ * the rtc core with rtc_update_irq().  The callers visible in this file
+ * invoke it with rtc_lock held.
+ */
+static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control)
+{
+       unsigned char   rtc_intr;
+
+       /*
+        * NOTE after changing RTC_xIE bits we always read INTR_FLAGS;
+        * allegedly some older rtcs need that to handle irqs properly
+        */
+       rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS);
+       rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF;
+       if (is_intr(rtc_intr))
+               rtc_update_irq(mrst->rtc, 1, rtc_intr);
+}
+
+/*
+ * Set the interrupt-enable bits in @mask in RTC_CONTROL.  Pending flags
+ * are checked (and reported) both before and after the control register
+ * is rewritten.  Callers in this file hold rtc_lock around the call.
+ */
+static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask)
+{
+       unsigned char   rtc_control;
+
+       /*
+        * Flush any pending IRQ status, notably for update irqs,
+        * before we enable new IRQs
+        */
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       mrst_checkintr(mrst, rtc_control);
+
+       rtc_control |= mask;
+       vrtc_cmos_write(rtc_control, RTC_CONTROL);
+
+       mrst_checkintr(mrst, rtc_control);
+}
+
+/*
+ * Clear the interrupt-enable bits in @mask in RTC_CONTROL, then check
+ * the interrupt flags against the updated control value.  Callers in
+ * this file hold rtc_lock around the call.
+ */
+static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask)
+{
+       unsigned char   rtc_control;
+
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       rtc_control &= ~mask;
+       vrtc_cmos_write(rtc_control, RTC_CONTROL);
+       mrst_checkintr(mrst, rtc_control);
+}
+
+/*
+ * Program a new alarm time (hour/minute/second) from @t.
+ *
+ * Sequence: disable RTC_AIE, write the alarm registers, drop rtc_lock,
+ * issue IPC_CMD_VRTC_SETALARM, and finally re-enable RTC_AIE if
+ * t->enabled is set.
+ *
+ * Returns -EIO when mrst->irq is 0, the IPC error code if the IPC command
+ * fails (RTC_AIE is left disabled in that case), or 0 on success.
+ */
+static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char hrs, min, sec;
+       int ret = 0;
+
+       if (!mrst->irq)
+               return -EIO;
+
+       hrs = t->time.tm_hour;
+       min = t->time.tm_min;
+       sec = t->time.tm_sec;
+
+       spin_lock_irq(&rtc_lock);
+       /* Next rtc irq must not be from previous alarm setting */
+       mrst_irq_disable(mrst, RTC_AIE);
+
+       /* Update alarm */
+       vrtc_cmos_write(hrs, RTC_HOURS_ALARM);
+       vrtc_cmos_write(min, RTC_MINUTES_ALARM);
+       vrtc_cmos_write(sec, RTC_SECONDS_ALARM);
+
+       spin_unlock_irq(&rtc_lock);
+
+       /* The IPC call is made with rtc_lock released */
+       ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM);
+       if (ret)
+               return ret;
+
+       spin_lock_irq(&rtc_lock);
+       if (t->enabled)
+               mrst_irq_enable(mrst, RTC_AIE);
+
+       spin_unlock_irq(&rtc_lock);
+
+       return 0;
+}
+
+/*
+ * Turn the periodic interrupt (RTC_PIE) on or off according to @enabled.
+ * Returns -ENXIO when mrst->irq is 0, otherwise 0.
+ */
+static int mrst_irq_set_state(struct device *dev, int enabled)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned long   flags;
+
+       if (!mrst->irq)
+               return -ENXIO;
+
+       spin_lock_irqsave(&rtc_lock, flags);
+
+       if (enabled)
+               mrst_irq_enable(mrst, RTC_PIE);
+       else
+               mrst_irq_disable(mrst, RTC_PIE);
+
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return 0;
+}
+
+#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
+
+/*
+ * Currently, the vRTC doesn't support UIE ON/OFF.
+ *
+ * Only RTC_AIE_ON and RTC_AIE_OFF are handled here: they return -EINVAL
+ * when mrst->irq is 0, otherwise they toggle RTC_AIE under rtc_lock and
+ * return 0.  Every other command returns -ENOIOCTLCMD.
+ */
+static int
+mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned long   flags;
+
+       /* First pass: validate the command before taking the lock */
+       switch (cmd) {
+       case RTC_AIE_OFF:
+       case RTC_AIE_ON:
+               if (!mrst->irq)
+                       return -EINVAL;
+               break;
+       default:
+               /* PIE ON/OFF is handled by mrst_irq_set_state() */
+               return -ENOIOCTLCMD;
+       }
+
+       /* Second pass: apply the change with rtc_lock held */
+       spin_lock_irqsave(&rtc_lock, flags);
+       switch (cmd) {
+       case RTC_AIE_OFF:       /* alarm off */
+               mrst_irq_disable(mrst, RTC_AIE);
+               break;
+       case RTC_AIE_ON:        /* alarm on */
+               mrst_irq_enable(mrst, RTC_AIE);
+               break;
+       }
+       spin_unlock_irqrestore(&rtc_lock, flags);
+       return 0;
+}
+
+#else
+#define        mrst_rtc_ioctl  NULL
+#endif
+
+#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
+
+/*
+ * procfs hook: print the periodic-IRQ and alarm enable state taken from
+ * RTC_CONTROL, plus two fixed lines (BCD: no, daily periodic frequency).
+ * Returns the result of seq_printf().
+ */
+static int mrst_procfs(struct device *dev, struct seq_file *seq)
+{
+       unsigned char   rtc_control, valid;
+
+       spin_lock_irq(&rtc_lock);
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       /* NOTE(review): RTC_VALID is read but 'valid' is not used below */
+       valid = vrtc_cmos_read(RTC_VALID);
+       spin_unlock_irq(&rtc_lock);
+
+       return seq_printf(seq,
+                       "periodic_IRQ\t: %s\n"
+                       "alarm\t\t: %s\n"
+                       "BCD\t\t: no\n"
+                       "periodic_freq\t: daily (not adjustable)\n",
+                       (rtc_control & RTC_PIE) ? "on" : "off",
+                       (rtc_control & RTC_AIE) ? "on" : "off");
+}
+
+#else
+#define        mrst_procfs     NULL
+#endif
+
+/*
+ * rtc class operations for the vRTC.  .ioctl and .proc are NULL when the
+ * corresponding RTC_INTF_DEV / RTC_INTF_PROC options are not configured
+ * (see the #else branches above).
+ */
+static const struct rtc_class_ops mrst_rtc_ops = {
+       .ioctl          = mrst_rtc_ioctl,
+       .read_time      = mrst_read_time,
+       .set_time       = mrst_set_time,
+       .read_alarm     = mrst_read_alarm,
+       .set_alarm      = mrst_set_alarm,
+       .proc           = mrst_procfs,
+       .irq_set_state  = mrst_irq_set_state,
+};
+
+static struct mrst_rtc mrst_rtc;
+
+/*
+ * When vRTC IRQ is captured by SCU FW, FW will clear the AIE bit in
+ * Reg B, so no need for this driver to clear it
+ *
+ * Interrupt handler: reads RTC_INTR_FLAGS under rtc_lock, masks it to the
+ * PF/AF/IRQF bits and, if an interrupt is indicated, passes it to
+ * rtc_update_irq().  @p is the dev_id given to request_irq(), which the
+ * probe code sets to the registered rtc device.
+ *
+ * Returns IRQ_HANDLED when an interrupt was reported, IRQ_NONE otherwise.
+ */
+static irqreturn_t mrst_rtc_irq(int irq, void *p)
+{
+       u8 irqstat;
+
+       spin_lock(&rtc_lock);
+       /* This read will clear all IRQ flags inside Reg C */
+       irqstat = vrtc_cmos_read(RTC_INTR_FLAGS);
+       spin_unlock(&rtc_lock);
+
+       irqstat &= RTC_IRQMASK | RTC_IRQF;
+       if (is_intr(irqstat)) {
+               rtc_update_irq(p, 1, irqstat);
+               return IRQ_HANDLED;
+       }
+       return IRQ_NONE;
+}
+
+/*
+ * Claim the vRTC register window, register the RTC class device and, if
+ * an interrupt number was supplied, install the interrupt handler.
+ *
+ * @dev:     device the RTC is registered against
+ * @iomem:   memory resource describing the vRTC registers; may be NULL
+ * @rtc_irq: interrupt number; 0 means no handler is requested
+ *
+ * Returns 0 on success, -EBUSY if an instance is already bound or the
+ * memory region is already in use, -ENODEV if @iomem is NULL, or the
+ * error reported by rtc_device_register() or request_irq().
+ */
+static int __init
+vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
+{
+       int retval = 0;
+       unsigned char rtc_control;
+
+       /* There can be only one ... */
+       if (mrst_rtc.dev)
+               return -EBUSY;
+
+       if (!iomem)
+               return -ENODEV;
+
+       iomem = request_mem_region(iomem->start,
+                       iomem->end + 1 - iomem->start,
+                       driver_name);
+       if (!iomem) {
+               dev_dbg(dev, "i/o mem already in use.\n");
+               return -EBUSY;
+       }
+
+       mrst_rtc.irq = rtc_irq;
+       mrst_rtc.iomem = iomem;
+
+       mrst_rtc.rtc = rtc_device_register(driver_name, dev,
+                               &mrst_rtc_ops, THIS_MODULE);
+       if (IS_ERR(mrst_rtc.rtc)) {
+               retval = PTR_ERR(mrst_rtc.rtc);
+               goto cleanup0;
+       }
+
+       mrst_rtc.dev = dev;
+       dev_set_drvdata(dev, &mrst_rtc);
+       rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
+
+       /* Start with periodic and alarm interrupts disabled */
+       spin_lock_irq(&rtc_lock);
+       mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE);
+       rtc_control = vrtc_cmos_read(RTC_CONTROL);
+       spin_unlock_irq(&rtc_lock);
+
+       if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY)))
+               dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n");
+
+       if (rtc_irq) {
+               retval = request_irq(rtc_irq, mrst_rtc_irq,
+                               IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev),
+                               mrst_rtc.rtc);
+               if (retval < 0) {
+                       dev_dbg(dev, "IRQ %d is already in use, err %d\n",
+                               rtc_irq, retval);
+                       goto cleanup1;
+               }
+       }
+       dev_dbg(dev, "initialised\n");
+       return 0;
+
+cleanup1:
+       mrst_rtc.dev = NULL;
+       rtc_device_unregister(mrst_rtc.rtc);
+cleanup0:
+       /*
+        * The window was claimed with request_mem_region(), so it must be
+        * given back with release_mem_region(); release_region() operates
+        * on the I/O-port resource tree and would leave it claimed.
+        */
+       release_mem_region(iomem->start, iomem->end + 1 - iomem->start);
+       dev_err(dev, "rtc-mrst: unable to initialise\n");
+       return retval;
+}
+
+/* Disable the RTC_IRQMASK interrupt sources of the vRTC under rtc_lock */
+static void rtc_mrst_do_shutdown(void)
+{
+       spin_lock_irq(&rtc_lock);
+       mrst_irq_disable(&mrst_rtc, RTC_IRQMASK);
+       spin_unlock_irq(&rtc_lock);
+}
+
+/*
+ * Tear down what vrtc_mrst_do_probe() set up: quiesce the interrupt
+ * sources, free the IRQ (if one was requested), unregister the RTC class
+ * device, release the register window and clear the instance state.
+ *
+ * @dev: device whose driver data points at the mrst_rtc instance
+ */
+static void __exit rtc_mrst_do_remove(struct device *dev)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       struct resource *iomem;
+
+       rtc_mrst_do_shutdown();
+
+       if (mrst->irq)
+               free_irq(mrst->irq, mrst->rtc);
+
+       rtc_device_unregister(mrst->rtc);
+       mrst->rtc = NULL;
+
+       /*
+        * The window was claimed with request_mem_region(), so it must be
+        * given back with release_mem_region(); release_region() operates
+        * on the I/O-port resource tree and would leave it claimed.
+        */
+       iomem = mrst->iomem;
+       release_mem_region(iomem->start, iomem->end + 1 - iomem->start);
+       mrst->iomem = NULL;
+
+       mrst->dev = NULL;
+       dev_set_drvdata(dev, NULL);
+}
+
+#ifdef CONFIG_PM
+/*
+ * Suspend hook.  Saves the control register in mrst->suspend_ctrl for
+ * mrst_resume(), then, if the periodic or alarm interrupt is enabled,
+ * clears the RTC_IRQMASK bits in it.  Only the alarm might be a wakeup
+ * event source, so RTC_AIE alone is left set when the device may wake
+ * the system.
+ *
+ * Always returns 0.
+ */
+static int mrst_suspend(struct device *dev, pm_message_t mesg)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char   ctrl;
+
+       spin_lock_irq(&rtc_lock);
+       ctrl = vrtc_cmos_read(RTC_CONTROL);
+       mrst->suspend_ctrl = ctrl;
+       if (ctrl & (RTC_PIE | RTC_AIE)) {
+               unsigned char   off_bits = RTC_IRQMASK;
+
+               /* Keep the alarm bit only if it is allowed to wake us */
+               if (device_may_wakeup(dev))
+                       off_bits &= ~RTC_AIE;
+               ctrl &= ~off_bits;
+               vrtc_cmos_write(ctrl, RTC_CONTROL);
+
+               mrst_checkintr(mrst, ctrl);
+       }
+       spin_unlock_irq(&rtc_lock);
+
+       /* Alarm still enabled: arm the IRQ as a wakeup source */
+       if (ctrl & RTC_AIE) {
+               mrst->enabled_wake = 1;
+               enable_irq_wake(mrst->irq);
+       }
+
+       dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n",
+                       (ctrl & RTC_AIE) ? ", alarm may wake" : "",
+                       ctrl);
+
+       return 0;
+}
+
+/*
+ * We want RTC alarms to wake us from the deep power saving state, so
+ * power-off is handled by running the suspend path with PMSG_HIBERNATE.
+ * Returns the result of mrst_suspend().
+ */
+static inline int mrst_poweroff(struct device *dev)
+{
+       return mrst_suspend(dev, PMSG_HIBERNATE);
+}
+
+/*
+ * Resume hook.  Restores the control register value saved by
+ * mrst_suspend() and reports any RTC event that is already flagged.
+ * Always returns 0.
+ */
+static int mrst_resume(struct device *dev)
+{
+       struct mrst_rtc *mrst = dev_get_drvdata(dev);
+       unsigned char tmp = mrst->suspend_ctrl;
+
+       /* Re-enable any irqs previously active */
+       if (tmp & RTC_IRQMASK) {
+               unsigned char   mask;
+
+               /* Undo the wakeup arming done at suspend time */
+               if (mrst->enabled_wake) {
+                       disable_irq_wake(mrst->irq);
+                       mrst->enabled_wake = 0;
+               }
+
+               spin_lock_irq(&rtc_lock);
+               do {
+                       vrtc_cmos_write(tmp, RTC_CONTROL);
+
+                       /* Keep only flags of enabled sources, plus IRQF */
+                       mask = vrtc_cmos_read(RTC_INTR_FLAGS);
+                       mask &= (tmp & RTC_IRQMASK) | RTC_IRQF;
+                       if (!is_intr(mask))
+                               break;
+
+                       rtc_update_irq(mrst->rtc, 1, mask);
+                       /*
+                        * Drop AIE from the value; if the alarm flag was
+                        * set, loop once more to write control without it.
+                        */
+                       tmp &= ~RTC_AIE;
+               } while (mask & RTC_AIE);
+               spin_unlock_irq(&rtc_lock);
+       }
+
+       dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp);
+
+       return 0;
+}
+
+#else
+#define        mrst_suspend    NULL
+#define        mrst_resume     NULL
+
+/* Without CONFIG_PM there is no suspend path; report -ENOSYS to the caller */
+static inline int mrst_poweroff(struct device *dev)
+{
+       return -ENOSYS;
+}
+
+#endif
+
+/*
+ * Platform probe entry point: look up the first memory resource and the
+ * first interrupt of @pdev and pass them to vrtc_mrst_do_probe().
+ */
+static int __init vrtc_mrst_platform_probe(struct platform_device *pdev)
+{
+       struct resource *mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       int irq = platform_get_irq(pdev, 0);
+
+       return vrtc_mrst_do_probe(&pdev->dev, mem, irq);
+}
+
+/* Platform remove entry point: tear the device down; always returns 0 */
+static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev)
+{
+       rtc_mrst_do_remove(&pdev->dev);
+       return 0;
+}
+
+/*
+ * Platform shutdown entry point.  On a real power-off, try the poweroff
+ * path first; the RTC interrupts are only disabled when the system is not
+ * powering off or when mrst_poweroff() reports failure.
+ */
+static void vrtc_mrst_platform_shutdown(struct platform_device *pdev)
+{
+       if (system_state != SYSTEM_POWER_OFF || mrst_poweroff(&pdev->dev))
+               rtc_mrst_do_shutdown();
+}
+
+MODULE_ALIAS("platform:vrtc_mrst");
+
+/*
+ * Platform driver glue.  The suspend/resume hooks are NULL when CONFIG_PM
+ * is not set (see the #else branch that defines them).
+ */
+static struct platform_driver vrtc_mrst_platform_driver = {
+       .probe          = vrtc_mrst_platform_probe,
+       .remove         = __exit_p(vrtc_mrst_platform_remove),
+       .shutdown       = vrtc_mrst_platform_shutdown,
+       .driver = {
+               .name           = (char *) driver_name,
+               .suspend        = mrst_suspend,
+               .resume         = mrst_resume,
+       }
+};
+
+/* Module init: register the platform driver and return the result */
+static int __init vrtc_mrst_init(void)
+{
+       return platform_driver_register(&vrtc_mrst_platform_driver);
+}
+
+/* Module exit: unregister the platform driver */
+static void __exit vrtc_mrst_exit(void)
+{
+       platform_driver_unregister(&vrtc_mrst_platform_driver);
+}
+
+module_init(vrtc_mrst_init);
+module_exit(vrtc_mrst_exit);
+
+MODULE_AUTHOR("Jacob Pan; Feng Tang");
+MODULE_DESCRIPTION("Driver for Moorestown virtual RTC");
+MODULE_LICENSE("GPL");
diff --git a/drivers/spi/coldfire_qspi.c b/drivers/spi/coldfire_qspi.c

index 052b3c7fa6a0f644d26c613e67e69ffe00ad05be..8856bcca9d2933db4a8f98526bc716c6640bb075 100644 (file)
--- a/drivers/spi/coldfire_qspi.c
+++ b/drivers/spi/coldfire_qspi.c
@@ -317,7 +317,7 @@ static void mcfqspi_work(struct work_struct *work)
                 msg = container_of(mcfqspi->msgq.next, struct spi_message,
                                    queue);
  
-               list_del_init(&mcfqspi->msgq);
+               list_del_init(&msg->queue);
                 spin_unlock_irqrestore(&mcfqspi->lock, flags);
  
                 spi = msg->spi;
diff --git a/drivers/spi/omap2_mcspi.c b/drivers/spi/omap2_mcspi.c

index 2a651e61bfbff30f23e8e44dddfcb70ab8a05173..951a160fc27fbe2614376776ba3b712646e305ff 100644 (file)
--- a/drivers/spi/omap2_mcspi.c
+++ b/drivers/spi/omap2_mcspi.c
@@ -1305,10 +1305,49 @@ static int __exit omap2_mcspi_remove(struct platform_device *pdev)
  /* work with hotplug and coldplug */
  MODULE_ALIAS("platform:omap2_mcspi");
  
+#ifdef CONFIG_SUSPEND
+/*
+ * When the SPI controller wakes up from off-mode, CS is in the active state.
+ * If it was in the inactive state when the driver was suspended, force it
+ * back to the inactive state at wake-up.
+ */
+static int omap2_mcspi_resume(struct device *dev)
+{
+       struct spi_master       *master = dev_get_drvdata(dev);
+       struct omap2_mcspi      *mcspi = spi_master_get_devdata(master);
+       struct omap2_mcspi_cs *cs;
+
+       omap2_mcspi_enable_clocks(mcspi);
+       list_for_each_entry(cs, &omap2_mcspi_ctx[master->bus_num - 1].cs,
+                           node) {
+               if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
+
+                       /*
+                        * Toggle the CS state (set FORCE, then clear it) so
+                        * that the OMAP takes the change into account.
+                        */
+                       MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 1);
+                       __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
+                       MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 0);
+                       __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
+               }
+       }
+       omap2_mcspi_disable_clocks(mcspi);
+       return 0;
+}
+#else
+#define        omap2_mcspi_resume      NULL
+#endif
+
+static const struct dev_pm_ops omap2_mcspi_pm_ops = {
+       .resume = omap2_mcspi_resume,
+};
+
  static struct platform_driver omap2_mcspi_driver = {
         .driver = {
                 .name =         "omap2_mcspi",
                 .owner =        THIS_MODULE,
+               .pm =           &omap2_mcspi_pm_ops
         },
         .remove =       __exit_p(omap2_mcspi_remove),
  };
diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c

index 8c3c057aa8478fb2a436edfd571843513544ca71..d0e9e0207539e2c9491fc6c1de25c75dda4bd9c6 100644 (file)
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -435,12 +435,6 @@ static int zram_make_request(struct request_queue *queue, struct bio *bio)
         int ret = 0;
         struct zram *zram = queue->queuedata;
  
-       if (unlikely(!zram->init_done)) {
-               set_bit(BIO_UPTODATE, &bio->bi_flags);
-               bio_endio(bio, 0);
-               return 0;
-       }
-
         if (!valid_io_request(zram, bio)) {
                 zram_stat64_inc(zram, &zram->stats.invalid_io);
                 bio_io_error(bio);
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c

index 44447f54942f6d6e1c7c7e2a777b4c882f6b546d..99ac70e32556f841b83c909108767dedf444237e 100644 (file)
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -2206,8 +2206,11 @@ static int uea_boot(struct uea_softc *sc)
                 goto err1;
         }
  
-       sc->kthread = kthread_run(uea_kthread, sc, "ueagle-atm");
-       if (sc->kthread == ERR_PTR(-ENOMEM)) {
+       /* Create worker thread, but don't start it here.  Start it after
+        * all usbatm generic initialization is done.
+        */
+       sc->kthread = kthread_create(uea_kthread, sc, "ueagle-atm");
+       if (IS_ERR(sc->kthread)) {
                 uea_err(INS_TO_USBDEV(sc), "failed to create thread\n");
                 goto err2;
         }
@@ -2624,6 +2627,7 @@ static struct usbatm_driver uea_usbatm_driver = {
  static int uea_probe(struct usb_interface *intf, const struct usb_device_id *id)
  {
         struct usb_device *usb = interface_to_usbdev(intf);
+       int ret;
  
         uea_enters(usb);
         uea_info(usb, "ADSL device founded vid (%#X) pid (%#X) Rev (%#X): %s\n",
@@ -2637,7 +2641,19 @@ static int uea_probe(struct usb_interface *intf, const struct usb_device_id *id)
         if (UEA_IS_PREFIRM(id))
                 return uea_load_firmware(usb, UEA_CHIP_VERSION(id));
  
-       return usbatm_usb_probe(intf, id, &uea_usbatm_driver);
+       ret = usbatm_usb_probe(intf, id, &uea_usbatm_driver);
+       if (ret == 0) {
+               struct usbatm_data *usbatm = usb_get_intfdata(intf);
+               struct uea_softc *sc = usbatm->driver_data;
+
+               /* Ensure carrier is initialized to off as early as possible */
+               UPDATE_ATM_SIGNAL(ATM_PHY_SIG_LOST);
+
+               /* Only start the worker thread when all init is done */
+               wake_up_process(sc->kthread);
+       }
+
+       return ret;
  }
  
  static void uea_disconnect(struct usb_interface *intf)
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c

index 3d77116e463410dac81b6c530c74d5a55f9c7afc..dea7b5bf6e2ccd986cf99840db504d6a38382293 100644 (file)
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -642,19 +642,14 @@ static struct notifier_block die_notifier = {
   */
  
  #ifdef CONFIG_HPWDT_NMI_DECODING
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#ifdef CONFIG_X86_LOCAL_APIC
  static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
  {
         /*
          * If nmi_watchdog is turned off then we can turn on
          * our nmi decoding capability.
          */
-       if (!nmi_watchdog_active())
-               hpwdt_nmi_decoding = 1;
-       else
-               dev_warn(&dev->dev, "NMI decoding is disabled. To enable this "
-                       "functionality you must reboot with nmi_watchdog=0 "
-                       "and load the hpwdt driver with priority=1.\n");
+       hpwdt_nmi_decoding = 1;
  }
  #else
  static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
@@ -662,7 +657,7 @@ static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
         dev_warn(&dev->dev, "NMI decoding is disabled. "
                 "Your kernel does not support a NMI Watchdog.\n");
  }
-#endif /* ARCH_HAS_NMI_WATCHDOG */
+#endif /* CONFIG_X86_LOCAL_APIC */
  
  static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev)
  {
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c

index 5476c066d4ee336733445eda2f804561179ecb41..3c4039d5eef12d1b35ffd93c3f1861e43cc9b520 100644 (file)
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -763,7 +763,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
         int metadata;
         unsigned int revokes = 0;
         int x;
-       int error;
+       int error = 0;
  
         if (!*top)
                 sm->sm_first = 0;
@@ -780,7 +780,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
         if (metadata)
                 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
  
-       error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+       else if (!sdp->sd_rgrps)
+               error = gfs2_ri_update(ip);
+
         if (error)
                 return error;
  
@@ -879,7 +883,8 @@ out_rg_gunlock:
  out_rlist:
         gfs2_rlist_free(&rlist);
  out:
-       gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+       if (ip != GFS2_I(sdp->sd_rindex))
+               gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
         return error;
  }
  
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c

index f92c1770416981df8b625b6f918bac6c061c6e5e..08a8beb152e60d6aa4dd0b38ea852973e99263d4 100644 (file)
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -541,21 +541,6 @@ out_locked:
         spin_unlock(&gl->gl_spin);
  }
  
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-                                unsigned int req_state,
-                                unsigned int flags)
-{
-       int ret = LM_OUT_ERROR;
-
-       if (!sdp->sd_lockstruct.ls_ops->lm_lock)
-               return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
-       if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-               ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
-                                                        req_state, flags);
-       return ret;
-}
-
  /**
   * do_xmote - Calls the DLM to change the state of a lock
   * @gl: The lock state
@@ -575,13 +560,14 @@ __acquires(&gl->gl_spin)
  
         lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
                       LM_FLAG_PRIORITY);
-       BUG_ON(gl->gl_state == target);
-       BUG_ON(gl->gl_state == gl->gl_target);
+       GLOCK_BUG_ON(gl, gl->gl_state == target);
+       GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
         if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
             glops->go_inval) {
                 set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
                 do_error(gl, 0); /* Fail queued try locks */
         }
+       gl->gl_req = target;
         spin_unlock(&gl->gl_spin);
         if (glops->go_xmote_th)
                 glops->go_xmote_th(gl);
@@ -594,15 +580,17 @@ __acquires(&gl->gl_spin)
             gl->gl_state == LM_ST_DEFERRED) &&
             !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
                 lck_flags |= LM_FLAG_TRY_1CB;
-       ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
  
-       if (!(ret & LM_OUT_ASYNC)) {
-               finish_xmote(gl, ret);
+       if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+               /* lock_dlm */
+               ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+               GLOCK_BUG_ON(gl, ret);
+       } else { /* lock_nolock */
+               finish_xmote(gl, target);
                 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                         gfs2_glock_put(gl);
-       } else {
-               GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
         }
+
         spin_lock(&gl->gl_spin);
  }
  
@@ -951,17 +939,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
  
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
  {
+       struct va_format vaf;
         va_list args;
  
         va_start(args, fmt);
+
         if (seq) {
                 struct gfs2_glock_iter *gi = seq->private;
                 vsprintf(gi->string, fmt, args);
                 seq_printf(seq, gi->string);
         } else {
-               printk(KERN_ERR " ");
-               vprintk(fmt, args);
+               vaf.fmt = fmt;
+               vaf.va = &args;
+
+               printk(KERN_ERR " %pV", &vaf);
         }
+
         va_end(args);
  }
  
@@ -1361,24 +1354,28 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
   * @gl: Pointer to the glock
   * @ret: The return value from the dlm
   *
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
   */
  
  void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
  {
         struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
  
+       spin_lock(&gl->gl_spin);
         gl->gl_reply = ret;
  
         if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
-               spin_lock(&gl->gl_spin);
                 if (gfs2_should_freeze(gl)) {
                         set_bit(GLF_FROZEN, &gl->gl_flags);
                         spin_unlock(&gl->gl_spin);
                         return;
                 }
-               spin_unlock(&gl->gl_spin);
         }
+
+       spin_unlock(&gl->gl_spin);
         set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+       smp_wmb();
         gfs2_glock_hold(gl);
         if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
                 gfs2_glock_put(gl);
@@ -1626,18 +1623,17 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
  static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
  {
         struct task_struct *gh_owner = NULL;
-       char buffer[KSYM_SYMBOL_LEN];
         char flags_buf[32];
  
-       sprint_symbol(buffer, gh->gh_ip);
         if (gh->gh_owner_pid)
                 gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
-       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
-                 state2str(gh->gh_state),
-                 hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
-                 gh->gh_error, 
-                 gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
-                 gh_owner ? gh_owner->comm : "(ended)", buffer);
+       gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+                      state2str(gh->gh_state),
+                      hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+                      gh->gh_error,
+                      gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+                      gh_owner ? gh_owner->comm : "(ended)",
+                      (void *)gh->gh_ip);
         return 0;
  }
  
@@ -1782,12 +1778,13 @@ int __init gfs2_glock_init(void)
         }
  #endif
  
-       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+       glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
                                           WQ_HIGHPRI | WQ_FREEZEABLE, 0);
         if (IS_ERR(glock_workqueue))
                 return PTR_ERR(glock_workqueue);
-       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
-                                               WQ_FREEZEABLE, 0);
+       gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+                                               WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+                                               0);
         if (IS_ERR(gfs2_delete_workqueue)) {
                 destroy_workqueue(glock_workqueue);
                 return PTR_ERR(gfs2_delete_workqueue);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h

index db1c26d6d2206c8f9e9b68396380ed8791f3c720..691851ceb6153f59b91cd64d1ce0fecea46904e0 100644 (file)
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -87,11 +87,10 @@ enum {
  #define GL_ASYNC               0x00000040
  #define GL_EXACT               0x00000080
  #define GL_SKIP                        0x00000100
-#define GL_ATIME               0x00000200
  #define GL_NOCACHE             0x00000400
    
  /*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
   *
   * LM_OUT_ST_MASK
   * Masks the lower two bits of lock state in the returned value.
@@ -99,15 +98,11 @@ enum {
   * LM_OUT_CANCELED
   * The lock request was canceled.
   *
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
   */
  
  #define LM_OUT_ST_MASK         0x00000003
  #define LM_OUT_CANCELED                0x00000008
-#define LM_OUT_ASYNC           0x00000080
-#define LM_OUT_ERROR           0x00000100
+#define LM_OUT_ERROR           0x00000004
  
  /*
   * lm_recovery_done() messages
@@ -124,25 +119,12 @@ struct lm_lockops {
         void (*lm_unmount) (struct gfs2_sbd *sdp);
         void (*lm_withdraw) (struct gfs2_sbd *sdp);
         void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
-       unsigned int (*lm_lock) (struct gfs2_glock *gl,
-                                unsigned int req_state, unsigned int flags);
+       int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+                       unsigned int flags);
         void (*lm_cancel) (struct gfs2_glock *gl);
         const match_table_t *lm_tokens;
  };
  
-#define LM_FLAG_TRY            0x00000001
-#define LM_FLAG_TRY_1CB                0x00000002
-#define LM_FLAG_NOEXP          0x00000004
-#define LM_FLAG_ANY            0x00000008
-#define LM_FLAG_PRIORITY       0x00000010
-
-#define GL_ASYNC               0x00000040
-#define GL_EXACT               0x00000080
-#define GL_SKIP                        0x00000100
-#define GL_NOCACHE             0x00000400
-
-#define GLR_TRYFAILED          13
-
  extern struct workqueue_struct *gfs2_delete_workqueue;
  static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
  {
@@ -212,6 +194,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
  int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
  
  /**
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c

index 0d149dcc04e515adfaaeb632a6677e5e3b555f45..263561bf1a5059b4bf644340faa6a4435c62d14f 100644 (file)
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -325,7 +325,6 @@ static void trans_go_sync(struct gfs2_glock *gl)
  
         if (gl->gl_state != LM_ST_UNLOCKED &&
             test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-               flush_workqueue(gfs2_delete_workqueue);
                 gfs2_meta_syncfs(sdp);
                 gfs2_log_shutdown(sdp);
         }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h

index 764fbb49efc8e3adbdeda7f83f178b0fd6ea70f8..8d3d2b4a0a7d64431d63edff082cbedbd5b2543b 100644 (file)
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -207,12 +207,14 @@ struct gfs2_glock {
  
         spinlock_t gl_spin;
  
-       unsigned int gl_state;
-       unsigned int gl_target;
-       unsigned int gl_reply;
+       /* State fields protected by gl_spin */
+       unsigned int gl_state:2,        /* Current state */
+                    gl_target:2,       /* Target state */
+                    gl_demote_state:2, /* State requested by remote node */
+                    gl_req:2,          /* State in last dlm request */
+                    gl_reply:8;        /* Last reply from the dlm */
+
         unsigned int gl_hash;
-       unsigned int gl_req;
-       unsigned int gl_demote_state; /* state requested by remote node */
         unsigned long gl_demote_time; /* time of first demote request */
         struct list_head gl_holders;
  
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index e1213f7f92179aa2472304ff0db4294be66040d8..14e682dbe8bff4bd4063e4a54a5445cbcfe17937 100644 (file)
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -916,17 +916,8 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
         if (error)
                 return error;
  
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               error = vmtruncate(inode, attr->ia_size);
-               if (error)
-                       return error;
-       }
-
         setattr_copy(inode, attr);
         mark_inode_dirty(inode);
-
-       gfs2_assert_warn(GFS2_SB(inode), !error);
         gfs2_trans_add_bh(ip->i_gl, dibh, 1);
         gfs2_dinode_out(ip, dibh->b_data);
         brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c

index 1c09425b45fd728ba52c1f5f49c3feac187640a2..6e493aee28f82dfb593574f751ed81025207645c 100644 (file)
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -146,15 +146,13 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
         return lkf;
  }
  
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
-                             unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+                    unsigned int flags)
  {
         struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
-       int error;
         int req;
         u32 lkf;
  
-       gl->gl_req = req_state;
         req = make_mode(req_state);
         lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
  
@@ -162,13 +160,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
          * Submit the actual lock request.
          */
  
-       error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
-                        GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
-       if (error == -EAGAIN)
-               return 0;
-       if (error)
-               return LM_OUT_ERROR;
-       return LM_OUT_ASYNC;
+       return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+                       GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
  }
  
  static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c

index 12cbea7502c26040fb90db5750e764bdd831079a..1db6b73432298d4092c0e8684483b8fff29c78ee 100644 (file)
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1069,7 +1069,6 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
  {
         struct gfs2_inode *ip = GFS2_I(inode);
         struct gfs2_sbd *sdp = GFS2_SB(inode);
-       struct buffer_head *dibh;
         u32 ouid, ogid, nuid, ngid;
         int error;
  
@@ -1100,25 +1099,10 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
         if (error)
                 goto out_gunlock_q;
  
-       error = gfs2_meta_inode_buffer(ip, &dibh);
+       error = gfs2_setattr_simple(ip, attr);
         if (error)
                 goto out_end_trans;
  
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(sdp, !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
         if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
                 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
                 gfs2_quota_change(ip, -blocks, ouid, ogid);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c

index f606baf9ba7247e9a5fd9ccfb2cc9426019e589e..a689901963dea43c82b6178a4451c09560061e76 100644 (file)
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -666,6 +666,10 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                         qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
                         qd->qd_qb.qb_limit = qp->qu_limit;
                 }
+               if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                       qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                       qd->qd_qb.qb_value = qp->qu_value;
+               }
         }
  
         /* Write the quota into the quota file on disk */
@@ -1509,7 +1513,7 @@ out:
  }
  
  /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
  
  static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
                           struct fs_disk_quota *fdq)
@@ -1569,9 +1573,15 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
         if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
             ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
                 fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
         if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
             ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
                 fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+       if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+           ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+               fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
         if (fdq->d_fieldmask == 0)
                 goto out_i;
  
@@ -1620,4 +1630,3 @@ const struct quotactl_ops gfs2_quotactl_ops = {
         .get_dqblk      = gfs2_get_dqblk,
         .set_dqblk      = gfs2_set_dqblk,
  };
-
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c

index 33c8407b876f00ceef0741221ebae4ba46ecb426..7293ea27020c680307e0145e863ebbb7eb0d6949 100644 (file)
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
         for (rgrps = 0;; rgrps++) {
                 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
  
-               if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+               if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
                         break;
                 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
                                            sizeof(struct gfs2_rindex));
@@ -583,7 +583,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
   * Returns: 0 on successful update, error code otherwise
   */
  
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
  {
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct inode *inode = &ip->i_inode;
@@ -613,46 +613,6 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
         return 0;
  }
  
-/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
-       struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-       struct inode *inode = &ip->i_inode;
-       struct file_ra_state ra_state;
-       struct gfs2_rgrpd *rgd;
-       unsigned int max_data = 0;
-       int error;
-
-       file_ra_state_init(&ra_state, inode->i_mapping);
-       for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
-               /* Ignore partials */
-               if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-                   i_size_read(inode))
-                       break;
-               error = read_rindex_entry(ip, &ra_state);
-               if (error) {
-                       clear_rgrpdi(sdp);
-                       return error;
-               }
-       }
-       list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
-               if (rgd->rd_data > max_data)
-                       max_data = rgd->rd_data;
-       sdp->sd_max_rg_data = max_data;
-
-       sdp->sd_rindex_uptodate = 1;
-       return 0;
-}
-
  /**
   * gfs2_rindex_hold - Grab a lock on the rindex
   * @sdp: The GFS2 superblock
@@ -1226,16 +1186,25 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
                         error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
                 else if (!sdp->sd_rgrps) /* We may not have the rindex read
                                             in, so: */
-                       error = gfs2_ri_update_special(ip);
+                       error = gfs2_ri_update(ip);
                 if (error)
                         return error;
         }
  
+try_again:
         do {
                 error = get_local_rgrp(ip, &last_unlinked);
                 /* If there is no space, flushing the log may release some */
-               if (error)
+               if (error) {
+                       if (ip == GFS2_I(sdp->sd_rindex) &&
+                           !sdp->sd_rindex_uptodate) {
+                               error = gfs2_ri_update(ip);
+                               if (error)
+                                       return error;
+                               goto try_again;
+                       }
                         gfs2_log_flush(sdp, NULL);
+               }
         } while (error && tries++ < 3);
  
         if (error) {
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h

index 0e35c0466f9a6c5979a3fe8c339def323bc37fad..50c2bb04369c8dd617fed95513461f6dc3651d0d 100644 (file)
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -48,6 +48,7 @@ extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
  
  extern void gfs2_inplace_release(struct gfs2_inode *ip);
  
+extern int gfs2_ri_update(struct gfs2_inode *ip);
  extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
  extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
  
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c

index 30b58f07c8a6b219fc964efe101ce5f861397885..439b61c03262b767956e23f761b637e0b6905383 100644 (file)
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1296,10 +1296,8 @@ fail:
  
  int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
  {
-       struct inode *inode = &ip->i_inode;
         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
         struct gfs2_ea_location el;
-       struct buffer_head *dibh;
         int error;
  
         error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
@@ -1321,26 +1319,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
         if (error)
                 return error;
  
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto out_trans_end;
-
-       if ((attr->ia_valid & ATTR_SIZE) &&
-           attr->ia_size != i_size_read(inode)) {
-               int error;
-
-               error = vmtruncate(inode, attr->ia_size);
-               gfs2_assert_warn(GFS2_SB(inode), !error);
-       }
-
-       setattr_copy(inode, attr);
-       mark_inode_dirty(inode);
-
-       gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       gfs2_dinode_out(ip, dibh->b_data);
-       brelse(dibh);
-
-out_trans_end:
+       error = gfs2_setattr_simple(ip, attr);
         gfs2_trans_end(sdp);
         return error;
  }
diff --git a/fs/proc/base.c b/fs/proc/base.c

index 182845147fe45bde8f5607a799f23cc1e2818117..08cba2c3b61240e085b9861967af5bd0adb4d227 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1407,6 +1407,82 @@ static const struct file_operations proc_pid_sched_operations = {
  
  #endif
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+       struct inode *inode = m->private;
+       struct task_struct *p;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+       proc_sched_autogroup_show_task(p, m);
+
+       put_task_struct(p);
+
+       return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+           size_t count, loff_t *offset)
+{
+       struct inode *inode = file->f_path.dentry->d_inode;
+       struct task_struct *p;
+       char buffer[PROC_NUMBUF];
+       long nice;
+       int err;
+
+       memset(buffer, 0, sizeof(buffer));
+       if (count > sizeof(buffer) - 1)
+               count = sizeof(buffer) - 1;
+       if (copy_from_user(buffer, buf, count))
+               return -EFAULT;
+
+       err = strict_strtol(strstrip(buffer), 0, &nice);
+       if (err)
+               return -EINVAL;
+
+       p = get_proc_task(inode);
+       if (!p)
+               return -ESRCH;
+
+       err = nice;
+       err = proc_sched_autogroup_set_nice(p, &err);
+       if (err)
+               count = err;
+
+       put_task_struct(p);
+
+       return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+       int ret;
+
+       ret = single_open(filp, sched_autogroup_show, NULL);
+       if (!ret) {
+               struct seq_file *m = filp->private_data;
+
+               m->private = inode;
+       }
+       return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+       .open           = sched_autogroup_open,
+       .read           = seq_read,
+       .write          = sched_autogroup_write,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
  static ssize_t comm_write(struct file *file, const char __user *buf,
                                 size_t count, loff_t *offset)
  {
@@ -2732,6 +2808,9 @@ static const struct pid_entry tgid_base_stuff[] = {
         INF("limits",     S_IRUGO, proc_pid_limits),
  #ifdef CONFIG_SCHED_DEBUG
         REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+       REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
  #endif
         REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
diff --git a/include/linux/completion.h b/include/linux/completion.h

index 36d57f74cd01c6c126ee2f7c2ea2c98f66868b2f..51494e6b55487f30496c8870165dd75f8ba4c7b1 100644 (file)
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -81,10 +81,10 @@ extern int wait_for_completion_interruptible(struct completion *x);
  extern int wait_for_completion_killable(struct completion *x);
  extern unsigned long wait_for_completion_timeout(struct completion *x,
                                                    unsigned long timeout);
-extern unsigned long wait_for_completion_interruptible_timeout(
-                       struct completion *x, unsigned long timeout);
-extern unsigned long wait_for_completion_killable_timeout(
-                       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_interruptible_timeout(
+       struct completion *x, unsigned long timeout);
+extern long wait_for_completion_killable_timeout(
+       struct completion *x, unsigned long timeout);
  extern bool try_wait_for_completion(struct completion *x);
  extern bool completion_done(struct completion *x);
  
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h

index 9d8688b92d8b02c46980f9a4397e4b2d0c313413..8cd00ad98d3773a4afa44e5e7ad6eaac184e6d71 100644 (file)
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -824,6 +824,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie);
  #ifdef CONFIG_DMA_ENGINE
  enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
  void dma_issue_pending_all(void);
+struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param);
+void dma_release_channel(struct dma_chan *chan);
  #else
  static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
  {
@@ -831,7 +833,14 @@ static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descript
  }
  static inline void dma_issue_pending_all(void)
  {
-       do { } while (0);
+}
+static inline struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask,
+                                             dma_filter_fn fn, void *fn_param)
+{
+       return NULL;
+}
+static inline void dma_release_channel(struct dma_chan *chan)
+{
  }
  #endif
  
@@ -842,8 +851,6 @@ void dma_async_device_unregister(struct dma_device *device);
  void dma_run_dependencies(struct dma_async_tx_descriptor *tx);
  struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type);
  #define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y)
-struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param);
-void dma_release_channel(struct dma_chan *chan);
  
  /* --- Helper iov-locking functions --- */
  
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h

index 8beabb958f61d5147c8893f1e780415a91fcb2e6..47e3997f7b5cf39233283ff43d84937daa502c2f 100644 (file)
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -154,12 +154,14 @@ enum {
         TRACE_EVENT_FL_ENABLED_BIT,
         TRACE_EVENT_FL_FILTERED_BIT,
         TRACE_EVENT_FL_RECORDED_CMD_BIT,
+       TRACE_EVENT_FL_CAP_ANY_BIT,
  };
  
  enum {
         TRACE_EVENT_FL_ENABLED          = (1 << TRACE_EVENT_FL_ENABLED_BIT),
         TRACE_EVENT_FL_FILTERED         = (1 << TRACE_EVENT_FL_FILTERED_BIT),
         TRACE_EVENT_FL_RECORDED_CMD     = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
+       TRACE_EVENT_FL_CAP_ANY          = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
  };
  
  struct ftrace_event_call {
@@ -196,6 +198,14 @@ struct ftrace_event_call {
  #endif
  };
  
+#define __TRACE_EVENT_FLAGS(name, value)                               \
+       static int __init trace_init_flags_##name(void)                 \
+       {                                                               \
+               event_##name.flags = value;                             \
+               return 0;                                               \
+       }                                                               \
+       early_initcall(trace_init_flags_##name);
+
  #define PERF_MAX_TRACE_SIZE    2048
  
  #define MAX_FILTER_PRED                32
@@ -215,6 +225,10 @@ enum {
         FILTER_PTR_STRING,
  };
  
+#define EVENT_STORAGE_SIZE 128
+extern struct mutex event_storage_mutex;
+extern char event_storage[EVENT_STORAGE_SIZE];
+
  extern int trace_event_raw_init(struct ftrace_event_call *call);
  extern int trace_define_field(struct ftrace_event_call *call, const char *type,
                               const char *name, int offset, int size,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h

index fd0c1b857d3dbcd9c074e461ded81b69f3ab897d..330586ffffbbccad534b1f81b7309d766f8d48fb 100644 (file)
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -22,7 +22,7 @@
  #include <linux/wait.h>
  #include <linux/percpu.h>
  #include <linux/timer.h>
-
+#include <linux/timerqueue.h>
  
  struct hrtimer_clock_base;
  struct hrtimer_cpu_base;
@@ -79,8 +79,8 @@ enum hrtimer_restart {
  
  /**
   * struct hrtimer - the basic hrtimer structure
- * @node:      red black tree node for time ordered insertion
- * @_expires:  the absolute expiry time in the hrtimers internal
+ * @node:      timerqueue node, which also manages node.expires,
+ *             the absolute expiry time in the hrtimers internal
   *             representation. The time is related to the clock on
   *             which the timer is based. Is setup by adding
   *             slack to the _softexpires value. For non range timers
@@ -101,8 +101,7 @@ enum hrtimer_restart {
   * The hrtimer structure must be initialized by hrtimer_init()
   */
  struct hrtimer {
-       struct rb_node                  node;
-       ktime_t                         _expires;
+       struct timerqueue_node          node;
         ktime_t                         _softexpires;
         enum hrtimer_restart            (*function)(struct hrtimer *);
         struct hrtimer_clock_base       *base;
@@ -141,8 +140,7 @@ struct hrtimer_sleeper {
  struct hrtimer_clock_base {
         struct hrtimer_cpu_base *cpu_base;
         clockid_t               index;
-       struct rb_root          active;
-       struct rb_node          *first;
+       struct timerqueue_head  active;
         ktime_t                 resolution;
         ktime_t                 (*get_time)(void);
         ktime_t                 softirq_time;
@@ -158,7 +156,6 @@ struct hrtimer_clock_base {
   * @lock:              lock protecting the base and associated clock bases
   *                     and timers
   * @clock_base:                array of clock bases for this cpu
- * @curr_timer:                the timer which is executing a callback right now
   * @expires_next:      absolute time of the next event which was scheduled
   *                     via clock_set_next_event()
   * @hres_active:       State of high resolution mode
@@ -184,43 +181,43 @@ struct hrtimer_cpu_base {
  
  static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
  {
-       timer->_expires = time;
+       timer->node.expires = time;
         timer->_softexpires = time;
  }
  
  static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
  {
         timer->_softexpires = time;
-       timer->_expires = ktime_add_safe(time, delta);
+       timer->node.expires = ktime_add_safe(time, delta);
  }
  
  static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
  {
         timer->_softexpires = time;
-       timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+       timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
  }
  
  static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
  {
-       timer->_expires.tv64 = tv64;
+       timer->node.expires.tv64 = tv64;
         timer->_softexpires.tv64 = tv64;
  }
  
  static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
  {
-       timer->_expires = ktime_add_safe(timer->_expires, time);
+       timer->node.expires = ktime_add_safe(timer->node.expires, time);
         timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
  }
  
  static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
  {
-       timer->_expires = ktime_add_ns(timer->_expires, ns);
+       timer->node.expires = ktime_add_ns(timer->node.expires, ns);
         timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
  }
  
  static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
  {
-       return timer->_expires;
+       return timer->node.expires;
  }
  
  static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
@@ -230,7 +227,7 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
  
  static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
  {
-       return timer->_expires.tv64;
+       return timer->node.expires.tv64;
  }
  static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
  {
@@ -239,12 +236,12 @@ static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
  
  static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
  {
-       return ktime_to_ns(timer->_expires);
+       return ktime_to_ns(timer->node.expires);
  }
  
  static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
  {
-    return ktime_sub(timer->_expires, timer->base->get_time());
+       return ktime_sub(timer->node.expires, timer->base->get_time());
  }
  
  #ifdef CONFIG_HIGH_RES_TIMERS
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index 1f8c06ce0fa66b83760863735eaf1209908205d7..caa151fbebb74c661289a69ffb52762435178d53 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -12,6 +12,13 @@
  #include <linux/securebits.h>
  #include <net/net_namespace.h>
  
+#ifdef CONFIG_SMP
+# define INIT_PUSHABLE_TASKS(tsk)                                      \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
+#else
+# define INIT_PUSHABLE_TASKS(tsk)
+#endif
+
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
@@ -83,6 +90,12 @@ extern struct group_info init_groups;
   */
  # define CAP_INIT_BSET  CAP_FULL_SET
  
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST()                                          \
+       .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
  #ifdef CONFIG_TREE_PREEMPT_RCU
  #define INIT_TASK_RCU_TREE_PREEMPT()                                   \
         .rcu_blocked_node = NULL,
@@ -94,7 +107,8 @@ extern struct group_info init_groups;
         .rcu_read_lock_nesting = 0,                                     \
         .rcu_read_unlock_special = 0,                                   \
         .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),           \
-       INIT_TASK_RCU_TREE_PREEMPT()
+       INIT_TASK_RCU_TREE_PREEMPT()                                    \
+       INIT_TASK_RCU_BOOST()
  #else
  #define INIT_TASK_RCU_PREEMPT(tsk)
  #endif
@@ -137,7 +151,7 @@ extern struct cred init_cred;
                 .nr_cpus_allowed = NR_CPUS,                             \
         },                                                              \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
-       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
+       INIT_PUSHABLE_TASKS(tsk)                                        \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
         .real_parent    = &tsk,                                         \
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h

index 79d0c4f6d0719452c20494b1439d0d695e212e90..55e0d4253e4927eb67254f38137b2a9e787afa9d 100644 (file)
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -114,15 +114,15 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  struct irqaction {
         irq_handler_t handler;
         unsigned long flags;
-       const char *name;
         void *dev_id;
         struct irqaction *next;
         int irq;
-       struct proc_dir_entry *dir;
         irq_handler_t thread_fn;
         struct task_struct *thread;
         unsigned long thread_flags;
-};
+       const char *name;
+       struct proc_dir_entry *dir;
+} ____cacheline_internodealigned_in_smp;
  
  extern irqreturn_t no_action(int cpl, void *dev_id);
  
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h

index e7d1b2e0070d3570b7022877a79fa2f0ed081507..b78edb58ee66164e756b4789baf71ab86e8684c4 100644 (file)
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -275,7 +275,9 @@ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn);
  extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
  extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
  extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
-extern int  arch_optimize_kprobe(struct optimized_kprobe *op);
+extern void arch_optimize_kprobes(struct list_head *oplist);
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+                                   struct list_head *done_list);
  extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
  extern kprobe_opcode_t *get_optinsn_slot(void);
  extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
diff --git a/include/linux/module.h b/include/linux/module.h

index 7575bbbdf2a2b8e6a716fb6252c3d8e958756f52..8b17fd8c790d8601f8aff0a33c7b909984e30545 100644 (file)
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -308,6 +308,9 @@ struct module
         /* The size of the executable code in each section.  */
         unsigned int init_text_size, core_text_size;
  
+       /* Size of RO sections of the module (text+rodata) */
+       unsigned int init_ro_size, core_ro_size;
+
         /* Arch-specific module values */
         struct mod_arch_specific arch;
  
@@ -672,7 +675,6 @@ static inline int module_get_iter_tracepoints(struct tracepoint_iter *iter)
  {
         return 0;
  }
-
  #endif /* CONFIG_MODULES */
  
  #ifdef CONFIG_SYSFS
@@ -687,6 +689,13 @@ extern int module_sysfs_initialized;
  
  #define __MODULE_STRING(x) __stringify(x)
  
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+extern void set_all_modules_text_rw(void);
+extern void set_all_modules_text_ro(void);
+#else
+static inline void set_all_modules_text_rw(void) { }
+static inline void set_all_modules_text_ro(void) { }
+#endif
  
  #ifdef CONFIG_GENERIC_BUG
  void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
diff --git a/include/linux/mutex.h b/include/linux/mutex.h

index f363bc8fdc74c821c99aa59d5bfcb9554c012c9a..94b48bd40dd735f77963fcd31797d32bb68b3379 100644 (file)
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -160,4 +160,8 @@ extern int mutex_trylock(struct mutex *lock);
  extern void mutex_unlock(struct mutex *lock);
  extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
  
+#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
+#define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
  #endif
diff --git a/include/linux/nmi.h b/include/linux/nmi.h

index 06aab5eee134cd56c4bade9005912fa3a785a327..c536f8545f74c11e345943187f201bfc25e48baa 100644 (file)
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -14,22 +14,14 @@
   * may be used to reset the timeout - for code which intentionally
   * disables interrupts for a long time. This call is stateless.
   */
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#if defined(ARCH_HAS_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
  #include <asm/nmi.h>
  extern void touch_nmi_watchdog(void);
-extern void acpi_nmi_disable(void);
-extern void acpi_nmi_enable(void);
  #else
-#ifndef CONFIG_HARDLOCKUP_DETECTOR
  static inline void touch_nmi_watchdog(void)
  {
         touch_softlockup_watchdog();
  }
-#else
-extern void touch_nmi_watchdog(void);
-#endif
-static inline void acpi_nmi_disable(void) { }
-static inline void acpi_nmi_enable(void) { }
  #endif
  
  /*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h

index 4f1279e105ee143e4317219b3cb093bc8bbdd954..dda5b0a3ff6014b8a0741a186ed0e3968b63d298 100644 (file)
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -215,8 +215,9 @@ struct perf_event_attr {
                                  */
                                 precise_ip     :  2, /* skid constraint       */
                                 mmap_data      :  1, /* non-exec mmap data    */
+                               sample_id_all  :  1, /* sample_type all events */
  
-                               __reserved_1   : 46;
+                               __reserved_1   : 45;
  
         union {
                 __u32           wakeup_events;    /* wakeup every n events */
@@ -327,6 +328,15 @@ struct perf_event_header {
  enum perf_event_type {
  
         /*
+        * If perf_event_attr.sample_id_all is set then all event types will
+        * have the sample_type selected fields related to where/when
+        * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID)
+        * described in PERF_RECORD_SAMPLE below, it will be stashed just after
+        * the perf_event_header and the fields already present for the existing
+        * fields, i.e. at the end of the payload. That way a newer perf.data
+        * file will be supported by older perf tools, with these new optional
+        * fields being ignored.
+        *
          * The MMAP events record the PROT_EXEC mappings so that we can
          * correlate userspace IPs to code. They have the following structure:
          *
@@ -578,6 +588,10 @@ struct perf_event;
  struct pmu {
         struct list_head                entry;
  
+       struct device                   *dev;
+       char                            *name;
+       int                             type;
+
         int * __percpu                  pmu_disable_count;
         struct perf_cpu_context * __percpu pmu_cpu_context;
         int                             task_ctx_nr;
@@ -758,6 +772,9 @@ struct perf_event {
         u64                             shadow_ctx_time;
  
         struct perf_event_attr          attr;
+       u16                             header_size;
+       u16                             id_header_size;
+       u16                             read_size;
         struct hw_perf_event            hw;
  
         struct perf_event_context       *ctx;
@@ -903,7 +920,7 @@ struct perf_output_handle {
  
  #ifdef CONFIG_PERF_EVENTS
  
-extern int perf_pmu_register(struct pmu *pmu);
+extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
  extern void perf_pmu_unregister(struct pmu *pmu);
  
  extern int perf_num_counters(void);
@@ -970,6 +987,11 @@ extern int perf_event_overflow(struct perf_event *event, int nmi,
                                  struct perf_sample_data *data,
                                  struct pt_regs *regs);
  
+static inline bool is_sampling_event(struct perf_event *event)
+{
+       return event->attr.sample_period != 0;
+}
+
  /*
   * Return 1 for a software event, 0 for a hardware event
   */
diff --git a/include/linux/rculist.h b/include/linux/rculist.h

index f31ef61f1c650b585bd6faf969f7cec754dffe2d..2dea94fc44026a1048f912913be298b8df179873 100644 (file)
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -241,11 +241,6 @@ static inline void list_splice_init_rcu(struct list_head *list,
  #define list_first_entry_rcu(ptr, type, member) \
         list_entry_rcu((ptr)->next, type, member)
  
-#define __list_for_each_rcu(pos, head) \
-       for (pos = rcu_dereference_raw(list_next_rcu(head)); \
-               pos != (head); \
-               pos = rcu_dereference_raw(list_next_rcu((pos)))
-
  /**
   * list_for_each_entry_rcu     -       iterate over rcu list of given type
   * @pos:       the type * to use as a loop cursor.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h

index 03cda7bed98587b128c5a9953316644a8debb4d2..af5614856285d32e0f07d3ca7e7294b03b9b27b7 100644 (file)
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
  extern int rcutorture_runnable; /* for sysctl */
  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
  
+#define UINT_CMP_GE(a, b)      (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b)      (UINT_MAX / 2 < (a) - (b))
  #define ULONG_CMP_GE(a, b)     (ULONG_MAX / 2 >= (a) - (b))
  #define ULONG_CMP_LT(a, b)     (ULONG_MAX / 2 < (a) - (b))
  
@@ -66,7 +68,6 @@ extern void call_rcu_sched(struct rcu_head *head,
  extern void synchronize_sched(void);
  extern void rcu_barrier_bh(void);
  extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
  extern int sched_expedited_torture_stats(char *page);
  
  static inline void __rcu_read_lock_bh(void)
@@ -118,7 +119,6 @@ static inline int rcu_preempt_depth(void)
  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  
  /* Internal to kernel */
-extern void rcu_init(void);
  extern void rcu_sched_qs(int cpu);
  extern void rcu_bh_qs(int cpu);
  extern void rcu_check_callbacks(int cpu, int user);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h

index 13877cb93a6000043f11a6704f2d90b0cc04552d..30ebd7c8d874b4dfeb9c9c9e5c5e857fbb43ab62 100644 (file)
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,7 +27,9 @@
  
  #include <linux/cache.h>
  
-#define rcu_init_sched()       do { } while (0)
+static inline void rcu_init(void)
+{
+}
  
  #ifdef CONFIG_TINY_RCU
  
@@ -58,6 +60,11 @@ static inline void synchronize_rcu_bh_expedited(void)
         synchronize_sched();
  }
  
+static inline void synchronize_sched_expedited(void)
+{
+       synchronize_sched();
+}
+
  #ifdef CONFIG_TINY_RCU
  
  static inline void rcu_preempt_note_context_switch(void)
@@ -125,16 +132,12 @@ static inline void rcu_cpu_stall_reset(void)
  }
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
  extern int rcu_scheduler_active __read_mostly;
  extern void rcu_scheduler_starting(void);
-
  #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
  static inline void rcu_scheduler_starting(void)
  {
  }
-
  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
  #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h

index 95518e6287946177e0eceb5cbf201ebfcaf0e072..3a933482734aeccbafc7a0bb735be11ede47cbd2 100644 (file)
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,6 +30,7 @@
  #ifndef __LINUX_RCUTREE_H
  #define __LINUX_RCUTREE_H
  
+extern void rcu_init(void);
  extern void rcu_note_context_switch(int cpu);
  extern int rcu_needs_cpu(int cpu);
  extern void rcu_cpu_stall_reset(void);
@@ -47,6 +48,7 @@ static inline void exit_rcu(void)
  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  
  extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
  extern void synchronize_rcu_expedited(void);
  
  static inline void synchronize_rcu_bh_expedited(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 223874538b33208e3c5ff11710f3161d58b4aef2..777cd01e240ee0fca7a8b6a76c74137d0dcfaaa9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -316,6 +316,7 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
                                   size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
  extern int softlockup_thresh;
+void lockup_detector_init(void);
  #else
  static inline void touch_softlockup_watchdog(void)
  {
@@ -326,6 +327,9 @@ static inline void touch_softlockup_watchdog_sync(void)
  static inline void touch_all_softlockup_watchdogs(void)
  {
  }
+static inline void lockup_detector_init(void)
+{
+}
  #endif
  
  #ifdef CONFIG_DETECT_HUNG_TASK
@@ -509,6 +513,8 @@ struct thread_group_cputimer {
         spinlock_t lock;
  };
  
+struct autogroup;
+
  /*
   * NOTE! "signal_struct" does not have it's own
   * locking, because a shared signal_struct always
@@ -576,6 +582,9 @@ struct signal_struct {
  
         struct tty_struct *tty; /* NULL if no tty */
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
         /*
          * Cumulative resource counters for dead threads in the group,
          * and for reaped dead child processes forked by this group.
@@ -1229,13 +1238,18 @@ struct task_struct {
  #ifdef CONFIG_TREE_PREEMPT_RCU
         struct rcu_node *rcu_blocked_node;
  #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
         struct sched_info sched_info;
  #endif
  
         struct list_head tasks;
+#ifdef CONFIG_SMP
         struct plist_node pushable_tasks;
+#endif
  
         struct mm_struct *mm, *active_mm;
  #if defined(SPLIT_RSS_COUNTING)
@@ -1759,7 +1773,8 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
  #ifdef CONFIG_PREEMPT_RCU
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@ -1767,7 +1782,10 @@ static inline void rcu_copy_process(struct task_struct *p)
         p->rcu_read_unlock_special = 0;
  #ifdef CONFIG_TREE_PREEMPT_RCU
         p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+       p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
         INIT_LIST_HEAD(&p->rcu_node_entry);
  }
  
@@ -1872,14 +1890,11 @@ extern void sched_clock_idle_sleep_event(void);
  extern void sched_clock_idle_wakeup_event(u64 delta_ns);
  
  #ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
  extern void idle_task_exit(void);
  #else
  static inline void idle_task_exit(void) {}
  #endif
  
-extern void sched_idle_next(void);
-
  #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
  extern void wake_up_idle_cpu(int cpu);
  #else
@@ -1889,8 +1904,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
  extern unsigned int sysctl_sched_child_runs_first;
  
  enum sched_tunable_scaling {
@@ -1906,6 +1919,7 @@ extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
  extern unsigned int sysctl_sched_time_avg;
  extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
  
  int sched_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *length,
@@ -1931,6 +1945,24 @@ int sched_rt_handler(struct ctl_table *table, int write,
  
  extern unsigned int sysctl_sched_compat_yield;
  
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+#endif
+
  #ifdef CONFIG_RT_MUTEXES
  extern int rt_mutex_getprio(struct task_struct *p);
  extern void rt_mutex_setprio(struct task_struct *p, int prio);
@@ -1949,9 +1981,10 @@ extern int task_nice(const struct task_struct *p);
  extern int can_nice(const struct task_struct *p, const int nice);
  extern int task_curr(const struct task_struct *p);
  extern int idle_cpu(int cpu);
-extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern int sched_setscheduler(struct task_struct *, int,
+                             const struct sched_param *);
  extern int sched_setscheduler_nocheck(struct task_struct *, int,
-                                     struct sched_param *);
+                                     const struct sched_param *);
  extern struct task_struct *idle_task(int cpu);
  extern struct task_struct *curr_task(int cpu);
  extern void set_curr_task(int cpu, struct task_struct *p);
diff --git a/include/linux/sfi.h b/include/linux/sfi.h

index 7f770c638e99d670840ed99856dce0b07a660f8b..fe817918b30e49ff96648394d99d8fc85eebaac8 100644 (file)
--- a/include/linux/sfi.h
+++ b/include/linux/sfi.h
@@ -77,6 +77,8 @@
  #define SFI_OEM_ID_SIZE                6
  #define SFI_OEM_TABLE_ID_SIZE  8
  
+#define SFI_NAME_LEN           16
+
  #define SFI_SYST_SEARCH_BEGIN          0x000E0000
  #define SFI_SYST_SEARCH_END            0x000FFFFF
  
@@ -156,13 +158,13 @@ struct sfi_device_table_entry {
         u16     addr;
         u8      irq;
         u32     max_freq;
-       char    name[16];
+       char    name[SFI_NAME_LEN];
  } __packed;
  
  struct sfi_gpio_table_entry {
-       char    controller_name[16];
+       char    controller_name[SFI_NAME_LEN];
         u16     pin_no;
-       char    pin_name[16];
+       char    pin_name[SFI_NAME_LEN];
  } __packed;
  
  typedef int (*sfi_table_handler) (struct sfi_table_header *table);
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h

index 51efbef38fb0e204cfddb61b56619d52cefab623..25310f1d7f3773c540e51e7103a4edfa98db7a33 100644 (file)
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -2,6 +2,7 @@
  #define __LINUX_STACKTRACE_H
  
  struct task_struct;
+struct pt_regs;
  
  #ifdef CONFIG_STACKTRACE
  struct task_struct;
@@ -13,7 +14,8 @@ struct stack_trace {
  };
  
  extern void save_stack_trace(struct stack_trace *trace);
-extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
+extern void save_stack_trace_regs(struct stack_trace *trace,
+                                 struct pt_regs *regs);
  extern void save_stack_trace_tsk(struct task_struct *tsk,
                                 struct stack_trace *trace);
  
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h

index cacc27a0e285163d9a8727a4131ffed478b8f46c..18cd0684fc4ec4bb2e6fb52ed6a7838737688c17 100644 (file)
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -127,8 +127,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
  #define SYSCALL_TRACE_ENTER_EVENT(sname)                               \
         static struct syscall_metadata                                  \
         __attribute__((__aligned__(4))) __syscall_meta_##sname;         \
-       static struct ftrace_event_call                                 \
-       __attribute__((__aligned__(4))) event_enter_##sname;            \
         static struct ftrace_event_call __used                          \
           __attribute__((__aligned__(4)))                               \
           __attribute__((section("_ftrace_events")))                    \
@@ -137,13 +135,12 @@ extern struct trace_event_functions exit_syscall_print_funcs;
                 .class                  = &event_class_syscall_enter,   \
                 .event.funcs            = &enter_syscall_print_funcs,   \
                 .data                   = (void *)&__syscall_meta_##sname,\
-       }
+       };                                                              \
+       __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
  
  #define SYSCALL_TRACE_EXIT_EVENT(sname)                                        \
         static struct syscall_metadata                                  \
         __attribute__((__aligned__(4))) __syscall_meta_##sname;         \
-       static struct ftrace_event_call                                 \
-       __attribute__((__aligned__(4))) event_exit_##sname;             \
         static struct ftrace_event_call __used                          \
           __attribute__((__aligned__(4)))                               \
           __attribute__((section("_ftrace_events")))                    \
@@ -152,7 +149,8 @@ extern struct trace_event_functions exit_syscall_print_funcs;
                 .class                  = &event_class_syscall_exit,    \
                 .event.funcs            = &exit_syscall_print_funcs,    \
                 .data                   = (void *)&__syscall_meta_##sname,\
-       }
+       };                                                              \
+       __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
  
  #define SYSCALL_METADATA(sname, nb)                            \
         SYSCALL_TRACE_ENTER_EVENT(sname);                       \
diff --git a/include/linux/timer.h b/include/linux/timer.h

index 38cf093ef62c745d9f06e1038127ef61834796a1..6abd9138beda57f7555b96b9fa0d51c60edaa50f 100644 (file)
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -24,9 +24,9 @@ struct timer_list {
         int slack;
  
  #ifdef CONFIG_TIMER_STATS
+       int start_pid;
         void *start_site;
         char start_comm[16];
-       int start_pid;
  #endif
  #ifdef CONFIG_LOCKDEP
         struct lockdep_map lockdep_map;
@@ -48,12 +48,38 @@ extern struct tvec_base boot_tvec_bases;
  #define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
  #endif
  
+/*
+ * Note that all tvec_bases are 2 byte aligned and lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
+ */
+#define TBASE_DEFERRABLE_FLAG          (0x1)
+
  #define TIMER_INITIALIZER(_function, _expires, _data) {                \
                 .entry = { .prev = TIMER_ENTRY_STATIC },        \
                 .function = (_function),                        \
                 .expires = (_expires),                          \
                 .data = (_data),                                \
                 .base = &boot_tvec_bases,                       \
+               .slack = -1,                                    \
+               __TIMER_LOCKDEP_MAP_INITIALIZER(                \
+                       __FILE__ ":" __stringify(__LINE__))     \
+       }
+
+#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *)         \
+                 ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
+
+#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
+               .entry = { .prev = TIMER_ENTRY_STATIC },        \
+               .function = (_function),                        \
+               .expires = (_expires),                          \
+               .data = (_data),                                \
+               .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases),  \
                 __TIMER_LOCKDEP_MAP_INITIALIZER(                \
                         __FILE__ ":" __stringify(__LINE__))     \
         }
@@ -248,11 +274,11 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
  
  extern void add_timer(struct timer_list *timer);
  
+extern int try_to_del_timer_sync(struct timer_list *timer);
+
  #ifdef CONFIG_SMP
-  extern int try_to_del_timer_sync(struct timer_list *timer);
    extern int del_timer_sync(struct timer_list *timer);
  #else
-# define try_to_del_timer_sync(t)      del_timer(t)
  # define del_timer_sync(t)             del_timer(t)
  #endif
  
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h

new file mode 100644 (file)

index 0000000..d24aaba
--- /dev/null
+++ b/include/linux/timerqueue.h
@@ -0,0 +1,50 @@
+#ifndef _LINUX_TIMERQUEUE_H
+#define _LINUX_TIMERQUEUE_H
+
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+
+
+struct timerqueue_node {
+       struct rb_node node;
+       ktime_t expires;
+};
+
+struct timerqueue_head {
+       struct rb_root head;
+       struct timerqueue_node *next;
+};
+
+
+extern void timerqueue_add(struct timerqueue_head *head,
+                               struct timerqueue_node *node);
+extern void timerqueue_del(struct timerqueue_head *head,
+                               struct timerqueue_node *node);
+extern struct timerqueue_node *timerqueue_iterate_next(
+                                               struct timerqueue_node *node);
+
+/**
+ * timerqueue_getnext - Returns the timer with the earliest expiration time
+ *
+ * @head: head of timerqueue
+ *
+ * Returns a pointer to the timer node that has the
+ * earliest expiration time.
+ */
+static inline
+struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
+{
+       return head->next;
+}
+
+static inline void timerqueue_init(struct timerqueue_node *node)
+{
+       RB_CLEAR_NODE(&node->node);
+}
+
+static inline void timerqueue_init_head(struct timerqueue_head *head)
+{
+       head->head = RB_ROOT;
+       head->next = NULL;
+}
+#endif /* _LINUX_TIMERQUEUE_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h

index a4a90b6726ce6129b43174609fb3e35a2bd088ae..d3e4f87e95c0fa67236f92c2a688fdaa640cfaae 100644 (file)
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -106,6 +106,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
  
  #define TP_PROTO(args...)      args
  #define TP_ARGS(args...)       args
+#define TP_CONDITION(args...)  args
  
  #ifdef CONFIG_TRACEPOINTS
  
@@ -119,12 +120,14 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
   * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
   * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto".
   */
-#define __DO_TRACE(tp, proto, args)                                    \
+#define __DO_TRACE(tp, proto, args, cond)                              \
         do {                                                            \
                 struct tracepoint_func *it_func_ptr;                    \
                 void *it_func;                                          \
                 void *__data;                                           \
                                                                         \
+               if (!(cond))                                            \
+                       return;                                         \
                 rcu_read_lock_sched_notrace();                          \
                 it_func_ptr = rcu_dereference_sched((tp)->funcs);       \
                 if (it_func_ptr) {                                      \
@@ -142,7 +145,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
   * not add unwanted padding between the beginning of the section and the
   * structure. Force alignment to the same alignment as the section start.
   */
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args)        \
         extern struct tracepoint __tracepoint_##name;                   \
         static inline void trace_##name(proto)                          \
         {                                                               \
@@ -151,7 +154,8 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
  do_trace:                                                              \
                         __DO_TRACE(&__tracepoint_##name,                \
                                 TP_PROTO(data_proto),                   \
-                               TP_ARGS(data_args));                    \
+                               TP_ARGS(data_args),                     \
+                               TP_CONDITION(cond));                    \
         }                                                               \
         static inline int                                               \
         register_trace_##name(void (*probe)(data_proto), void *data)    \
@@ -186,7 +190,7 @@ do_trace:                                                           \
         EXPORT_SYMBOL(__tracepoint_##name)
  
  #else /* !CONFIG_TRACEPOINTS */
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args)      \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args)        \
         static inline void trace_##name(proto)                          \
         { }                                                             \
         static inline int                                               \
@@ -227,13 +231,20 @@ do_trace:                                                         \
   * "void *__data, proto" as the callback prototype.
   */
  #define DECLARE_TRACE_NOARGS(name)                                     \
-               __DECLARE_TRACE(name, void, , void *__data, __data)
+               __DECLARE_TRACE(name, void, , 1, void *__data, __data)
  
  #define DECLARE_TRACE(name, proto, args)                               \
-               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),      \
+               __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1,   \
                                 PARAMS(void *__data, proto),            \
                                 PARAMS(__data, args))
  
+#define DECLARE_TRACE_CONDITION(name, proto, args, cond)               \
+       __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+                       PARAMS(void *__data, proto),                    \
+                       PARAMS(__data, args))
+
+#define TRACE_EVENT_FLAGS(event, flag)
+
  #endif /* DECLARE_TRACE */
  
  #ifndef TRACE_EVENT
@@ -347,11 +358,21 @@ do_trace:                                                         \
         DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
         DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_CONDITION(template, name, proto,          \
+                              args, cond)                      \
+       DECLARE_TRACE_CONDITION(name, PARAMS(proto),            \
+                               PARAMS(args), PARAMS(cond))
  
  #define TRACE_EVENT(name, proto, args, struct, assign, print)  \
         DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
  #define TRACE_EVENT_FN(name, proto, args, struct,              \
                 assign, print, reg, unreg)                      \
         DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_CONDITION(name, proto, args, cond,         \
+                             struct, assign, print)            \
+       DECLARE_TRACE_CONDITION(name, PARAMS(proto),            \
+                               PARAMS(args), PARAMS(cond))
+
+#define TRACE_EVENT_FLAGS(event, flag)
  
  #endif /* ifdef TRACE_EVENT (see note above) */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h

index 0c0771f06bfa745e8e4e5add4ec4823cf52eb813..bd257fee60310184b52d0f8c15f1206f6a4f5dad 100644 (file)
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -127,12 +127,20 @@ struct execute_work {
         .timer = TIMER_INITIALIZER(NULL, 0, 0),                 \
         }
  
+#define __DEFERRED_WORK_INITIALIZER(n, f) {                    \
+       .work = __WORK_INITIALIZER((n).work, (f)),              \
+       .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0),        \
+       }
+
  #define DECLARE_WORK(n, f)                                     \
         struct work_struct n = __WORK_INITIALIZER(n, f)
  
  #define DECLARE_DELAYED_WORK(n, f)                             \
         struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
  
+#define DECLARE_DEFERRED_WORK(n, f)                            \
+       struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f)
+
  /*
   * initialize a work item's function pointer
   */
diff --git a/include/media/wm8775.h b/include/media/wm8775.h

index a1c4d417dfa205e8d5c2cf1d4f9d6bbd7a6ec419..60739c5a23ae3e30e943b55dc45cd53289720db4 100644 (file)
--- a/include/media/wm8775.h
+++ b/include/media/wm8775.h
@@ -32,7 +32,4 @@
  #define WM8775_AIN3 4
  #define WM8775_AIN4 8
  
-/* subdev group ID */
-#define WM8775_GID (1 << 0)
-
  #endif
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h

index 1dfab54015113b83bce9f3302470c3a5ed95b5e7..b0b4eb24d592fb1f8ecba11294c10e802ff7cd2b 100644 (file)
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,6 +26,15 @@
  #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
         DEFINE_TRACE(name)
  
+#undef TRACE_EVENT_CONDITION
+#define TRACE_EVENT_CONDITION(name, proto, args, cond, tstruct, assign, print) \
+       TRACE_EVENT(name,                                               \
+               PARAMS(proto),                                          \
+               PARAMS(args),                                           \
+               PARAMS(tstruct),                                        \
+               PARAMS(assign),                                         \
+               PARAMS(print))
+
  #undef TRACE_EVENT_FN
  #define TRACE_EVENT_FN(name, proto, args, tstruct,             \
                 assign, print, reg, unreg)                      \
@@ -39,6 +48,10 @@
  #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
         DEFINE_TRACE(name)
  
+#undef DEFINE_EVENT_CONDITION
+#define DEFINE_EVENT_CONDITION(template, name, proto, args, cond) \
+       DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
  #undef DECLARE_TRACE
  #define DECLARE_TRACE(name, proto, args)       \
         DEFINE_TRACE(name)
@@ -75,9 +88,11 @@
  
  #undef TRACE_EVENT
  #undef TRACE_EVENT_FN
+#undef TRACE_EVENT_CONDITION
  #undef DECLARE_EVENT_CLASS
  #undef DEFINE_EVENT
  #undef DEFINE_EVENT_PRINT
+#undef DEFINE_EVENT_CONDITION
  #undef TRACE_HEADER_MULTI_READ
  #undef DECLARE_TRACE
  
diff --git a/include/trace/events/power.h b/include/trace/events/power.h

index 286784d69b8f480343244d8046327a5a7d9883d9..1bcc2a8c00e29966aa8e48f1f135a3330364b668 100644 (file)
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -7,16 +7,67 @@
  #include <linux/ktime.h>
  #include <linux/tracepoint.h>
  
-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
-       POWER_NONE      = 0,
-       POWER_CSTATE    = 1,    /* C-State */
-       POWER_PSTATE    = 2,    /* Fequency change or DVFS */
-       POWER_SSTATE    = 3,    /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+       TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(state, cpu_id),
+
+       TP_STRUCT__entry(
+               __field(        u32,            state           )
+               __field(        u32,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __entry->state = state;
+               __entry->cpu_id = cpu_id;
+       ),
+
+       TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+                 (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+       TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+       TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
  #endif
  
+DEFINE_EVENT(cpu, cpu_frequency,
+
+       TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+       TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+       TP_PROTO(unsigned int state),
+
+       TP_ARGS(state),
+
+       TP_STRUCT__entry(
+               __field(        u32,            state           )
+       ),
+
+       TP_fast_assign(
+               __entry->state = state;
+       ),
+
+       TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after deprecation time exceeded (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
  /*
   * The power events are used for cpuidle & suspend (power_start, power_end)
   *  and for cpufreq (power_frequency)
@@ -75,6 +126,36 @@ TRACE_EVENT(power_end,
  
  );
  
+/* Deprecated dummy functions must be protected against multi-declaration */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+
+/* These dummy declarations have to be ripped out when the deprecated
+   events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
  /*
   * The clock events are used for clock enable/disable and for
   *  clock rate change
@@ -153,7 +234,6 @@ DEFINE_EVENT(power_domain, power_domain_target,
  
         TP_ARGS(name, state, cpu_id)
  );
-
  #endif /* _TRACE_POWER_H */
  
  /* This part must be outside protection */
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h

index fb726ac7caee4f465033ff5d314d788db929ded5..5a4c04a75b3d369fc9665eca1deee12a2d442d61 100644 (file)
--- a/include/trace/events/syscalls.h
+++ b/include/trace/events/syscalls.h
@@ -40,6 +40,8 @@ TRACE_EVENT_FN(sys_enter,
         syscall_regfunc, syscall_unregfunc
  );
  
+TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
+
  TRACE_EVENT_FN(sys_exit,
  
         TP_PROTO(struct pt_regs *regs, long ret),
@@ -62,6 +64,8 @@ TRACE_EVENT_FN(sys_exit,
         syscall_regfunc, syscall_unregfunc
  );
  
+TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY)
+
  #endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
  
  #endif /* _TRACE_EVENTS_SYSCALLS_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h

index a9377c0083ad3ed612547f783647132a8268ef09..e16610c208c954541587684c8af64584b01dbfda 100644 (file)
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -82,6 +82,10 @@
         TRACE_EVENT(name, PARAMS(proto), PARAMS(args),                  \
                 PARAMS(tstruct), PARAMS(assign), PARAMS(print))         \
  
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(name, value)                                 \
+       __TRACE_EVENT_FLAGS(name, value)
+
  #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
  
  
@@ -129,6 +133,9 @@
  #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
         DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
  
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(event, flag)
+
  #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
  
  /*
@@ -289,13 +296,19 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = {    \
  
  #undef __array
  #define __array(type, item, len)                                       \
-       BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
-       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+       do {                                                            \
+               mutex_lock(&event_storage_mutex);                       \
+               BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                 \
+               snprintf(event_storage, sizeof(event_storage),          \
+                        "%s[%d]", #type, len);                         \
+               ret = trace_define_field(event_call, event_storage, #item, \
                                  offsetof(typeof(field), item),         \
                                  sizeof(field.item),                    \
                                  is_signed_type(type), FILTER_OTHER);   \
-       if (ret)                                                        \
-               return ret;
+               mutex_unlock(&event_storage_mutex);                     \
+               if (ret)                                                \
+                       return ret;                                     \
+       } while (0);
  
  #undef __dynamic_array
  #define __dynamic_array(type, item, len)                                      \
diff --git a/init/Kconfig b/init/Kconfig

index c9728992a776356e043d21df7b33aa045c2d7904..8dfd094e68753dd919dd50299c82fa7385d6621e 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -393,7 +393,6 @@ config PREEMPT_RCU
  
  config RCU_TRACE
         bool "Enable tracing for RCU"
-       depends on TREE_RCU || TREE_PREEMPT_RCU
         help
           This option provides tracing in RCU which presents stats
           in debugfs for debugging RCU implementation.
@@ -459,6 +458,60 @@ config TREE_RCU_TRACE
           TREE_PREEMPT_RCU implementations, permitting Makefile to
           trivially select kernel/rcutree_trace.c.
  
+config RCU_BOOST
+       bool "Enable RCU priority boosting"
+       depends on RT_MUTEXES && TINY_PREEMPT_RCU
+       default n
+       help
+         This option boosts the priority of preempted RCU readers that
+         block the current preemptible RCU grace period for too long.
+         This option also prevents heavy loads from blocking RCU
+         callback invocation for all flavors of RCU.
+
+         Say Y here if you are working with real-time apps or heavy loads.
+         Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+       int "Real-time priority to boost RCU readers to"
+       range 1 99
+       depends on RCU_BOOST
+       default 1
+       help
+         This option specifies the real-time priority to which preempted
+         RCU readers are to be boosted.  If you are working with CPU-bound
+         real-time applications, you should specify a priority higher than
+         the highest-priority CPU-bound application.
+
+         Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+       int "Milliseconds to delay boosting after RCU grace-period start"
+       range 0 3000
+       depends on RCU_BOOST
+       default 500
+       help
+         This option specifies the time to wait after the beginning of
+         a given grace period before priority-boosting preempted RCU
+         readers blocking that grace period.  Note that any RCU reader
+         blocking an expedited RCU grace period is boosted immediately.
+
+         Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+       int "Microseconds to delay before waiting for readers"
+       range 0 20
+       default 10
+       help
+         This option controls how long SRCU delays before entering its
+         loop waiting on SRCU readers.  The purpose of this loop is
+         to avoid the unconditional context-switch penalty that would
+         otherwise be incurred if there was an active SRCU reader,
+         in a manner similar to adaptive locking schemes.  This should
+         be set to be a bit longer than the common-case SRCU read-side
+         critical-section overhead.
+
+         Accept the default if unsure.
+
  endmenu # "RCU Subsystem"
  
  config IKCONFIG
@@ -741,6 +794,19 @@ config NET_NS
  
  endif # NAMESPACES
  
+config SCHED_AUTOGROUP
+       bool "Automatic process group scheduling"
+       select EVENTFD
+       select CGROUPS
+       select CGROUP_SCHED
+       select FAIR_GROUP_SCHED
+       help
+         This option optimizes the scheduler for common desktop workloads by
+         automatically creating and populating task groups.  This separation
+         of workloads isolates aggressive CPU burners (like build jobs) from
+         desktop applications.  Task group autogeneration is currently based
+         upon task session.
+
  config MM_OWNER
         bool
  
diff --git a/init/do_mounts.c b/init/do_mounts.c

index 830aaec9c7d5e0cb8df39af760791b5335772768..2b54bef33b55c65f8fda4ebf3129db3453c8f53d 100644 (file)
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -93,7 +93,7 @@ no_match:
   *
   * Returns the matching dev_t on success or 0 on failure.
   */
-static dev_t __init devt_from_partuuid(char *uuid_str)
+static dev_t devt_from_partuuid(char *uuid_str)
  {
         dev_t res = 0;
         struct device *dev = NULL;
diff --git a/init/main.c b/init/main.c

index 8646401f7a0e4b77579aa13f8de6ac191787be73..ea51770c01701e312f70f9a81357babbf93fb865 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -67,6 +67,7 @@
  #include <linux/sfi.h>
  #include <linux/shmem_fs.h>
  #include <linux/slab.h>
+#include <linux/perf_event.h>
  
  #include <asm/io.h>
  #include <asm/bugs.h>
@@ -603,6 +604,8 @@ asmlinkage void __init start_kernel(void)
                                 "enabled *very* early, fixing it\n");
                 local_irq_disable();
         }
+       idr_init_cache();
+       perf_event_init();
         rcu_init();
         radix_tree_init();
         /* init some links before init_ISA_irqs() */
@@ -658,7 +661,6 @@ asmlinkage void __init start_kernel(void)
         enable_debug_pagealloc();
         kmemleak_init();
         debug_objects_mem_init();
-       idr_init_cache();
         setup_per_cpu_pageset();
         numa_policy_init();
         if (late_time_init)
@@ -882,6 +884,7 @@ static int __init kernel_init(void * unused)
         smp_prepare_cpus(setup_max_cpus);
  
         do_pre_smp_initcalls();
+       lockup_detector_init();
  
         smp_init();
         sched_init_smp();
diff --git a/kernel/cpu.c b/kernel/cpu.c

index f6e726f184916029e2d1cfdbcd4acb2b26f14e69..156cc555614089345553a6e7710580c4f069be0e 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
  }
  
  struct take_cpu_down_param {
-       struct task_struct *caller;
         unsigned long mod;
         void *hcpu;
  };
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
  static int __ref take_cpu_down(void *_param)
  {
         struct take_cpu_down_param *param = _param;
-       unsigned int cpu = (unsigned long)param->hcpu;
         int err;
  
         /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
  
         cpu_notify(CPU_DYING | param->mod, param->hcpu);
  
-       if (task_cpu(param->caller) == cpu)
-               move_task_off_dead_cpu(cpu, param->caller);
-       /* Force idle task to run as soon as we yield: it should
-          immediately notice cpu is offline and die quickly. */
-       sched_idle_next();
         return 0;
  }
  
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         void *hcpu = (void *)(long)cpu;
         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
         struct take_cpu_down_param tcd_param = {
-               .caller = current,
                 .mod = mod,
                 .hcpu = hcpu,
         };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         }
         BUG_ON(cpu_online(cpu));
  
-       /* Wait for it to sleep (leaving idle task). */
+       /*
+        * The migration_call() CPU_DYING callback will have removed all
+        * runnable tasks from the cpu; there's only the idle task left now
+        * that the migration thread is done doing the stop_machine thing.
+        *
+        * Wait for the stop thread to go away.
+        */
         while (!idle_cpu(cpu))
-               yield();
+               cpu_relax();
  
         /* This actually kills the CPU. */
         __cpu_die(cpu);
@@ -386,6 +384,14 @@ out:
  #ifdef CONFIG_PM_SLEEP_SMP
  static cpumask_var_t frozen_cpus;
  
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
  int disable_nonboot_cpus(void)
  {
         int cpu, first_cpu, error = 0;
@@ -397,6 +403,7 @@ int disable_nonboot_cpus(void)
          * with the userspace trying to use the CPU hotplug at the same time
          */
         cpumask_clear(frozen_cpus);
+       arch_disable_nonboot_cpus_begin();
  
         printk("Disabling non-boot CPUs ...\n");
         for_each_online_cpu(cpu) {
@@ -412,6 +419,8 @@ int disable_nonboot_cpus(void)
                 }
         }
  
+       arch_disable_nonboot_cpus_end();
+
         if (!error) {
                 BUG_ON(num_online_cpus() > 1);
                 /* Make sure the CPUs won't be enabled by someone else */
diff --git a/kernel/fork.c b/kernel/fork.c

index 5447dc7defa95b8f0e13acb80b45487df7dc2e73..7d164e25b0f0ea42498d748f389824773e2a0c78 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
  
  static inline void put_signal_struct(struct signal_struct *sig)
  {
-       if (atomic_dec_and_test(&sig->sigcnt))
+       if (atomic_dec_and_test(&sig->sigcnt)) {
+               sched_autogroup_exit(sig);
                 free_signal_struct(sig);
+       }
  }
  
  void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         posix_cpu_timers_init_group(sig);
  
         tty_audit_fork(sig);
+       sched_autogroup_fork(sig);
  
         sig->oom_adj = current->signal->oom_adj;
         sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
         }
  bad_fork_cleanup_signal:
         if (!(clone_flags & CLONE_THREAD))
-               free_signal_struct(p->signal);
+               put_signal_struct(p->signal);
  bad_fork_cleanup_sighand:
         __cleanup_sighand(p->sighand);
  bad_fork_cleanup_fs:
diff --git a/kernel/futex.c b/kernel/futex.c

index 40a8777a27d0d85e173f4b7a3efbd4ecff1c9654..3019b92e691744169b3ac50bb3836afae5ab1085 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -68,6 +68,14 @@ int __read_mostly futex_cmpxchg_enabled;
  
  #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
  
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED           0x01
+#define FLAGS_CLOCKRT          0x02
+#define FLAGS_HAS_TIMEOUT      0x04
+
  /*
   * Priority Inheritance state:
   */
@@ -123,6 +131,12 @@ struct futex_q {
         u32 bitset;
  };
  
+static const struct futex_q futex_q_init = {
+       /* list gets initialized in queue_me() */
+       .key = FUTEX_KEY_INIT,
+       .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
  /*
   * Hash buckets are shared by all the futex_keys that hash to the same
   * location.  Each key may have multiple futex_q structures, one for each task
@@ -283,8 +297,7 @@ again:
         return 0;
  }
  
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
  {
         drop_futex_key_refs(key);
  }
@@ -870,7 +883,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  /*
   * Wake up waiters matching bitset queued on this futex (uaddr).
   */
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
  {
         struct futex_hash_bucket *hb;
         struct futex_q *this, *next;
@@ -881,7 +895,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
         if (!bitset)
                 return -EINVAL;
  
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
         if (unlikely(ret != 0))
                 goto out;
  
@@ -907,7 +921,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
         }
  
         spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
  out:
         return ret;
  }
@@ -917,7 +931,7 @@ out:
   * to this virtual address:
   */
  static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
               int nr_wake, int nr_wake2, int op)
  {
         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
@@ -927,10 +941,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
         int ret, op_ret;
  
  retry:
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
         if (unlikely(ret != 0))
                 goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
         if (unlikely(ret != 0))
                 goto out_put_key1;
  
@@ -962,11 +976,11 @@ retry_private:
                 if (ret)
                         goto out_put_keys;
  
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                         goto retry_private;
  
-               put_futex_key(fshared, &key2);
-               put_futex_key(fshared, &key1);
+               put_futex_key(&key2);
+               put_futex_key(&key1);
                 goto retry;
         }
  
@@ -996,9 +1010,9 @@ retry_private:
  
         double_unlock_hb(hb1, hb2);
  out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
  out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
  out:
         return ret;
  }
@@ -1133,13 +1147,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
  /**
   * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
   * @uaddr1:    source futex user address
- * @fshared:   0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
   * @uaddr2:    target futex user address
   * @nr_wake:   number of waiters to wake (must be 1 for requeue_pi)
   * @nr_requeue:        number of waiters to requeue (0-INT_MAX)
   * @cmpval:    @uaddr1 expected value (or %NULL)
   * @requeue_pi:        if we are attempting to requeue from a non-pi futex to a
- *             pi futex (pi to pi requeue is not supported)
+ *             pi futex (pi to pi requeue is not supported)
   *
   * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
   * uaddr2 atomically on behalf of the top waiter.
@@ -1148,9 +1162,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
   * >=0 - on success, the number of tasks requeued or woken
   *  <0 - on error
   */
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                        int nr_wake, int nr_requeue, u32 *cmpval,
-                        int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+                        u32 __user *uaddr2, int nr_wake, int nr_requeue,
+                        u32 *cmpval, int requeue_pi)
  {
         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
         int drop_count = 0, task_count = 0, ret;
@@ -1191,10 +1205,10 @@ retry:
                 pi_state = NULL;
         }
  
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
         if (unlikely(ret != 0))
                 goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
         if (unlikely(ret != 0))
                 goto out_put_key1;
  
@@ -1216,11 +1230,11 @@ retry_private:
                         if (ret)
                                 goto out_put_keys;
  
-                       if (!fshared)
+                       if (!(flags & FLAGS_SHARED))
                                 goto retry_private;
  
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                         goto retry;
                 }
                 if (curval != *cmpval) {
@@ -1260,8 +1274,8 @@ retry_private:
                         break;
                 case -EFAULT:
                         double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                         ret = fault_in_user_writeable(uaddr2);
                         if (!ret)
                                 goto retry;
@@ -1269,8 +1283,8 @@ retry_private:
                 case -EAGAIN:
                         /* The owner was exiting, try again. */
                         double_unlock_hb(hb1, hb2);
-                       put_futex_key(fshared, &key2);
-                       put_futex_key(fshared, &key1);
+                       put_futex_key(&key2);
+                       put_futex_key(&key1);
                         cond_resched();
                         goto retry;
                 default:
@@ -1352,9 +1366,9 @@ out_unlock:
                 drop_futex_key_refs(&key1);
  
  out_put_keys:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
  out_put_key1:
-       put_futex_key(fshared, &key1);
+       put_futex_key(&key1);
  out:
         if (pi_state != NULL)
                 free_pi_state(pi_state);
@@ -1494,7 +1508,7 @@ static void unqueue_me_pi(struct futex_q *q)
   * private futexes.
   */
  static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner, int fshared)
+                               struct task_struct *newowner)
  {
         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
         struct futex_pi_state *pi_state = q->pi_state;
@@ -1587,20 +1601,11 @@ handle_fault:
         goto retry;
  }
  
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED           0x01
-#define FLAGS_CLOCKRT          0x02
-#define FLAGS_HAS_TIMEOUT      0x04
-
  static long futex_wait_restart(struct restart_block *restart);
  
  /**
   * fixup_owner() - Post lock pi_state and corner case management
   * @uaddr:     user address of the futex
- * @fshared:   whether the futex is shared (1) or not (0)
   * @q:         futex_q (contains pi_state and access to the rt_mutex)
   * @locked:    if the attempt to take the rt_mutex succeeded (1) or not (0)
   *
@@ -1613,8 +1618,7 @@ static long futex_wait_restart(struct restart_block *restart);
   *  0 - success, lock not taken
   * <0 - on error (-EFAULT)
   */
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
-                      int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
  {
         struct task_struct *owner;
         int ret = 0;
@@ -1625,7 +1629,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                  * did a lock-steal - fix up the PI-state in that case:
                  */
                 if (q->pi_state->owner != current)
-                       ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+                       ret = fixup_pi_state_owner(uaddr, q, current);
                 goto out;
         }
  
@@ -1652,7 +1656,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
                  * lock. Fix the state up.
                  */
                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-               ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+               ret = fixup_pi_state_owner(uaddr, q, owner);
                 goto out;
         }
  
@@ -1715,7 +1719,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
   * futex_wait_setup() - Prepare to wait on a futex
   * @uaddr:     the futex userspace address
   * @val:       the expected value
- * @fshared:   whether the futex is shared (1) or not (0)
+ * @flags:     futex flags (FLAGS_SHARED, etc.)
   * @q:         the associated futex_q
   * @hb:                storage for hash_bucket pointer to be returned to caller
   *
@@ -1728,7 +1732,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
   *  0 - uaddr contains val and hb has been locked
   * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
   */
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                            struct futex_q *q, struct futex_hash_bucket **hb)
  {
         u32 uval;
@@ -1752,8 +1756,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
          * rare, but normal.
          */
  retry:
-       q->key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q->key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
         if (unlikely(ret != 0))
                 return ret;
  
@@ -1769,10 +1772,10 @@ retry_private:
                 if (ret)
                         goto out;
  
-               if (!fshared)
+               if (!(flags & FLAGS_SHARED))
                         goto retry_private;
  
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
                 goto retry;
         }
  
@@ -1783,32 +1786,29 @@ retry_private:
  
  out:
         if (ret)
-               put_futex_key(fshared, &q->key);
+               put_futex_key(&q->key);
         return ret;
  }
  
-static int futex_wait(u32 __user *uaddr, int fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+                     ktime_t *abs_time, u32 bitset)
  {
         struct hrtimer_sleeper timeout, *to = NULL;
         struct restart_block *restart;
         struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
         int ret;
  
         if (!bitset)
                 return -EINVAL;
-
-       q.pi_state = NULL;
         q.bitset = bitset;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
  
         if (abs_time) {
                 to = &timeout;
  
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                 hrtimer_init_sleeper(to, current);
                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                              current->timer_slack_ns);
@@ -1819,7 +1819,7 @@ retry:
          * Prepare to wait on uaddr. On success, holds hb lock and increments
          * q.key refs.
          */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
         if (ret)
                 goto out;
  
@@ -1852,12 +1852,7 @@ retry:
         restart->futex.val = val;
         restart->futex.time = abs_time->tv64;
         restart->futex.bitset = bitset;
-       restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
-       if (fshared)
-               restart->futex.flags |= FLAGS_SHARED;
-       if (clockrt)
-               restart->futex.flags |= FLAGS_CLOCKRT;
+       restart->futex.flags = flags;
  
         ret = -ERESTART_RESTARTBLOCK;
  
@@ -1873,7 +1868,6 @@ out:
  static long futex_wait_restart(struct restart_block *restart)
  {
         u32 __user *uaddr = restart->futex.uaddr;
-       int fshared = 0;
         ktime_t t, *tp = NULL;
  
         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
@@ -1881,11 +1875,9 @@ static long futex_wait_restart(struct restart_block *restart)
                 tp = &t;
         }
         restart->fn = do_no_restart_syscall;
-       if (restart->futex.flags & FLAGS_SHARED)
-               fshared = 1;
-       return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
-                               restart->futex.bitset,
-                               restart->futex.flags & FLAGS_CLOCKRT);
+
+       return (long)futex_wait(uaddr, restart->futex.flags,
+                               restart->futex.val, tp, restart->futex.bitset);
  }
  
  
@@ -1895,12 +1887,12 @@ static long futex_wait_restart(struct restart_block *restart)
   * if there are waiters then it will block, it does PI, etc. (Due to
   * races the kernel might see a 0 value of the futex too.)
   */
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
-                        int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+                        ktime_t *time, int trylock)
  {
         struct hrtimer_sleeper timeout, *to = NULL;
         struct futex_hash_bucket *hb;
-       struct futex_q q;
+       struct futex_q q = futex_q_init;
         int res, ret;
  
         if (refill_pi_state_cache())
@@ -1914,12 +1906,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                 hrtimer_set_expires(&to->timer, *time);
         }
  
-       q.pi_state = NULL;
-       q.rt_waiter = NULL;
-       q.requeue_pi_key = NULL;
  retry:
-       q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
         if (unlikely(ret != 0))
                 goto out;
  
@@ -1941,7 +1929,7 @@ retry_private:
                          * exit to complete.
                          */
                         queue_unlock(&q, hb);
-                       put_futex_key(fshared, &q.key);
+                       put_futex_key(&q.key);
                         cond_resched();
                         goto retry;
                 default:
@@ -1971,7 +1959,7 @@ retry_private:
          * Fixup the pi_state owner and possibly acquire the lock if we
          * haven't already.
          */
-       res = fixup_owner(uaddr, fshared, &q, !ret);
+       res = fixup_owner(uaddr, &q, !ret);
         /*
          * If fixup_owner() returned an error, proprogate that.  If it acquired
          * the lock, clear our -ETIMEDOUT or -EINTR.
@@ -1995,7 +1983,7 @@ out_unlock_put_key:
         queue_unlock(&q, hb);
  
  out_put_key:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
  out:
         if (to)
                 destroy_hrtimer_on_stack(&to->timer);
@@ -2008,10 +1996,10 @@ uaddr_faulted:
         if (ret)
                 goto out_put_key;
  
-       if (!fshared)
+       if (!(flags & FLAGS_SHARED))
                 goto retry_private;
  
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
         goto retry;
  }
  
@@ -2020,7 +2008,7 @@ uaddr_faulted:
   * This is the in-kernel slowpath: we look up the PI state (if any),
   * and do the rt-mutex unlock.
   */
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
  {
         struct futex_hash_bucket *hb;
         struct futex_q *this, *next;
@@ -2038,7 +2026,7 @@ retry:
         if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                 return -EPERM;
  
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
         if (unlikely(ret != 0))
                 goto out;
  
@@ -2093,14 +2081,14 @@ retry:
  
  out_unlock:
         spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
  
  out:
         return ret;
  
  pi_faulted:
         spin_unlock(&hb->lock);
-       put_futex_key(fshared, &key);
+       put_futex_key(&key);
  
         ret = fault_in_user_writeable(uaddr);
         if (!ret)
@@ -2160,7 +2148,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
  /**
   * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
   * @uaddr:     the futex we initially wait on (non-pi)
- * @fshared:   whether the futexes are shared (1) or not (0).  They must be
+ * @flags:     futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
   *             the same type, no requeueing from private to shared, etc.
   * @val:       the expected value of uaddr
   * @abs_time:  absolute timeout
@@ -2198,16 +2186,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
   *  0 - On success
   * <0 - On error
   */
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                  u32 val, ktime_t *abs_time, u32 bitset,
-                                int clockrt, u32 __user *uaddr2)
+                                u32 __user *uaddr2)
  {
         struct hrtimer_sleeper timeout, *to = NULL;
         struct rt_mutex_waiter rt_waiter;
         struct rt_mutex *pi_mutex = NULL;
         struct futex_hash_bucket *hb;
-       union futex_key key2;
-       struct futex_q q;
+       union futex_key key2 = FUTEX_KEY_INIT;
+       struct futex_q q = futex_q_init;
         int res, ret;
  
         if (!bitset)
@@ -2215,8 +2203,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
  
         if (abs_time) {
                 to = &timeout;
-               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+                                     CLOCK_REALTIME : CLOCK_MONOTONIC,
+                                     HRTIMER_MODE_ABS);
                 hrtimer_init_sleeper(to, current);
                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
                                              current->timer_slack_ns);
@@ -2229,12 +2218,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         debug_rt_mutex_init_waiter(&rt_waiter);
         rt_waiter.task = NULL;
  
-       key2 = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
         if (unlikely(ret != 0))
                 goto out;
  
-       q.pi_state = NULL;
         q.bitset = bitset;
         q.rt_waiter = &rt_waiter;
         q.requeue_pi_key = &key2;
@@ -2243,7 +2230,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
          * count.
          */
-       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
         if (ret)
                 goto out_key2;
  
@@ -2273,8 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                  */
                 if (q.pi_state && (q.pi_state->owner != current)) {
                         spin_lock(q.lock_ptr);
-                       ret = fixup_pi_state_owner(uaddr2, &q, current,
-                                                  fshared);
+                       ret = fixup_pi_state_owner(uaddr2, &q, current);
                         spin_unlock(q.lock_ptr);
                 }
         } else {
@@ -2293,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                  * Fixup the pi_state owner and possibly acquire the lock if we
                  * haven't already.
                  */
-               res = fixup_owner(uaddr2, fshared, &q, !ret);
+               res = fixup_owner(uaddr2, &q, !ret);
                 /*
                  * If fixup_owner() returned an error, proprogate that.  If it
                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
@@ -2324,9 +2310,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
         }
  
  out_put_keys:
-       put_futex_key(fshared, &q.key);
+       put_futex_key(&q.key);
  out_key2:
-       put_futex_key(fshared, &key2);
+       put_futex_key(&key2);
  
  out:
         if (to) {
@@ -2551,58 +2537,57 @@ void exit_robust_list(struct task_struct *curr)
  long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                 u32 __user *uaddr2, u32 val2, u32 val3)
  {
-       int clockrt, ret = -ENOSYS;
-       int cmd = op & FUTEX_CMD_MASK;
-       int fshared = 0;
+       int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+       unsigned int flags = 0;
  
         if (!(op & FUTEX_PRIVATE_FLAG))
-               fshared = 1;
+               flags |= FLAGS_SHARED;
  
-       clockrt = op & FUTEX_CLOCK_REALTIME;
-       if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
-               return -ENOSYS;
+       if (op & FUTEX_CLOCK_REALTIME) {
+               flags |= FLAGS_CLOCKRT;
+               if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+                       return -ENOSYS;
+       }
  
         switch (cmd) {
         case FUTEX_WAIT:
                 val3 = FUTEX_BITSET_MATCH_ANY;
         case FUTEX_WAIT_BITSET:
-               ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+               ret = futex_wait(uaddr, flags, val, timeout, val3);
                 break;
         case FUTEX_WAKE:
                 val3 = FUTEX_BITSET_MATCH_ANY;
         case FUTEX_WAKE_BITSET:
-               ret = futex_wake(uaddr, fshared, val, val3);
+               ret = futex_wake(uaddr, flags, val, val3);
                 break;
         case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
                 break;
         case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
-                                   0);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
                 break;
         case FUTEX_WAKE_OP:
-               ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+               ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
                 break;
         case FUTEX_LOCK_PI:
                 if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+                       ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
                 break;
         case FUTEX_UNLOCK_PI:
                 if (futex_cmpxchg_enabled)
-                       ret = futex_unlock_pi(uaddr, fshared);
+                       ret = futex_unlock_pi(uaddr, flags);
                 break;
         case FUTEX_TRYLOCK_PI:
                 if (futex_cmpxchg_enabled)
-                       ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+                       ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
                 break;
         case FUTEX_WAIT_REQUEUE_PI:
                 val3 = FUTEX_BITSET_MATCH_ANY;
-               ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
-                                           clockrt, uaddr2);
+               ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+                                           uaddr2);
                 break;
         case FUTEX_CMP_REQUEUE_PI:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
-                                   1);
+               ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
                 break;
         default:
                 ret = -ENOSYS;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c

index 72206cf5c6cf854898d889a6a645e44febdd526f..f2429fc3438c4f1c2094e59fe54415dc30e4bb51 100644 (file)
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
  
         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
                 struct hrtimer *timer;
+               struct timerqueue_node *next;
  
-               if (!base->first)
+               next = timerqueue_getnext(&base->active);
+               if (!next)
                         continue;
-               timer = rb_entry(base->first, struct hrtimer, node);
+               timer = container_of(next, struct hrtimer, node);
+
                 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                 /*
                  * clock_was_set() has changed base->offset so the
@@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
  static int enqueue_hrtimer(struct hrtimer *timer,
                            struct hrtimer_clock_base *base)
  {
-       struct rb_node **link = &base->active.rb_node;
-       struct rb_node *parent = NULL;
-       struct hrtimer *entry;
-       int leftmost = 1;
-
         debug_activate(timer);
  
-       /*
-        * Find the right place in the rbtree:
-        */
-       while (*link) {
-               parent = *link;
-               entry = rb_entry(parent, struct hrtimer, node);
-               /*
-                * We dont care about collisions. Nodes with
-                * the same expiry time stay together.
-                */
-               if (hrtimer_get_expires_tv64(timer) <
-                               hrtimer_get_expires_tv64(entry)) {
-                       link = &(*link)->rb_left;
-               } else {
-                       link = &(*link)->rb_right;
-                       leftmost = 0;
-               }
-       }
-
-       /*
-        * Insert the timer to the rbtree and check whether it
-        * replaces the first pending timer
-        */
-       if (leftmost)
-               base->first = &timer->node;
+       timerqueue_add(&base->active, &timer->node);
  
-       rb_link_node(&timer->node, parent, link);
-       rb_insert_color(&timer->node, &base->active);
         /*
          * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
          * state of a possibly running callback.
          */
         timer->state |= HRTIMER_STATE_ENQUEUED;
  
-       return leftmost;
+       return (&timer->node == base->active.next);
  }
  
  /*
@@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
         if (!(timer->state & HRTIMER_STATE_ENQUEUED))
                 goto out;
  
-       /*
-        * Remove the timer from the rbtree and replace the first
-        * entry pointer if necessary.
-        */
-       if (base->first == &timer->node) {
-               base->first = rb_next(&timer->node);
+       if (&timer->node == timerqueue_getnext(&base->active)) {
  #ifdef CONFIG_HIGH_RES_TIMERS
                 /* Reprogram the clock event device. if enabled */
                 if (reprogram && hrtimer_hres_active()) {
@@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
                 }
  #endif
         }
-       rb_erase(&timer->node, &base->active);
+       timerqueue_del(&base->active, &timer->node);
  out:
         timer->state = newstate;
  }
@@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void)
         if (!hrtimer_hres_active()) {
                 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
                         struct hrtimer *timer;
+                       struct timerqueue_node *next;
  
-                       if (!base->first)
+                       next = timerqueue_getnext(&base->active);
+                       if (!next)
                                 continue;
  
-                       timer = rb_entry(base->first, struct hrtimer, node);
+                       timer = container_of(next, struct hrtimer, node);
                         delta.tv64 = hrtimer_get_expires_tv64(timer);
                         delta = ktime_sub(delta, base->get_time());
                         if (delta.tv64 < mindelta.tv64)
@@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
  
         timer->base = &cpu_base->clock_base[clock_id];
         hrtimer_init_timer_hres(timer);
+       timerqueue_init(&timer->node);
  
  #ifdef CONFIG_TIMER_STATS
         timer->start_site = NULL;
@@ -1278,14 +1248,14 @@ retry:
  
         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                 ktime_t basenow;
-               struct rb_node *node;
+               struct timerqueue_node *node;
  
                 basenow = ktime_add(now, base->offset);
  
-               while ((node = base->first)) {
+               while ((node = timerqueue_getnext(&base->active))) {
                         struct hrtimer *timer;
  
-                       timer = rb_entry(node, struct hrtimer, node);
+                       timer = container_of(node, struct hrtimer, node);
  
                         /*
                          * The immediate goal for using the softexpires is
@@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void)
   */
  void hrtimer_run_queues(void)
  {
-       struct rb_node *node;
+       struct timerqueue_node *node;
         struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
         struct hrtimer_clock_base *base;
         int index, gettime = 1;
@@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void)
  
         for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
                 base = &cpu_base->clock_base[index];
-
-               if (!base->first)
+               if (!timerqueue_getnext(&base->active))
                         continue;
  
                 if (gettime) {
@@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void)
  
                 raw_spin_lock(&cpu_base->lock);
  
-               while ((node = base->first)) {
+               while ((node = timerqueue_getnext(&base->active))) {
                         struct hrtimer *timer;
  
-                       timer = rb_entry(node, struct hrtimer, node);
+                       timer = container_of(node, struct hrtimer, node);
                         if (base->softirq_time.tv64 <=
                                         hrtimer_get_expires_tv64(timer))
                                 break;
@@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
  
         raw_spin_lock_init(&cpu_base->lock);
  
-       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+       for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                 cpu_base->clock_base[i].cpu_base = cpu_base;
+               timerqueue_init_head(&cpu_base->clock_base[i].active);
+       }
  
         hrtimer_init_hres(cpu_base);
  }
@@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                 struct hrtimer_clock_base *new_base)
  {
         struct hrtimer *timer;
-       struct rb_node *node;
+       struct timerqueue_node *node;
  
-       while ((node = rb_first(&old_base->active))) {
-               timer = rb_entry(node, struct hrtimer, node);
+       while ((node = timerqueue_getnext(&old_base->active))) {
+               timer = container_of(node, struct hrtimer, node);
                 BUG_ON(hrtimer_callback_running(timer));
                 debug_deactivate(timer);
  
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c

index e5325825aeb6e1e4ea0514ee37cfa53412ec4e3c..086adf25a55e3aaecf3eb3172a7569b2c1a209e0 100644 (file)
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void)
  
         constraints_initialized = 1;
  
-       perf_pmu_register(&perf_breakpoint);
+       perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
  
         return register_die_notifier(&hw_breakpoint_exceptions_nb);
  
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c

index 5f92acc5f952e0afb0489017c265a943a4a7d464..91a5fa25054e1d14d62339749f3229fae49f3766 100644 (file)
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
   */
  static int irq_thread(void *data)
  {
-       struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+       static struct sched_param param = {
+               .sched_priority = MAX_USER_RT_PRIO/2,
+       };
         struct irqaction *action = data;
         struct irq_desc *desc = irq_to_desc(action->irq);
         int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c

index 9737a76e106ff1554ecc2174f0e49a92b5badf45..7663e5df0e6f731f1804201a5e6cdf9b6162dd05 100644 (file)
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p)
         return p->pre_handler == aggr_pre_handler;
  }
  
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+       return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+              list_empty(&p->list);
+}
+
  /*
   * Keep all fields in the kprobe consistent
   */
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
  {
-       memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
-       memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+       memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
+       memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
  }
  
  #ifdef CONFIG_OPTPROBES
@@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
         }
  }
  
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+       struct optimized_kprobe *op;
+
+       op = container_of(p, struct optimized_kprobe, kp);
+       arch_remove_optimized_kprobe(op);
+       arch_remove_kprobe(p);
+       kfree(op);
+}
+
  /* Return true(!0) if the kprobe is ready for optimization. */
  static inline int kprobe_optready(struct kprobe *p)
  {
@@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p)
         return 0;
  }
  
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+       struct optimized_kprobe *op;
+
+       /* If this is not an aggr/opt probe, just return whether it is disabled */
+       if (!kprobe_aggrprobe(p))
+               return kprobe_disabled(p);
+
+       op = container_of(p, struct optimized_kprobe, kp);
+
+       return kprobe_disabled(p) && list_empty(&op->list);
+}
+
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int __kprobes kprobe_queued(struct kprobe *p)
+{
+       struct optimized_kprobe *op;
+
+       if (kprobe_aggrprobe(p)) {
+               op = container_of(p, struct optimized_kprobe, kp);
+               if (!list_empty(&op->list))
+                       return 1;
+       }
+       return 0;
+}
+
  /*
   * Return an optimized kprobe whose optimizing code replaces
   * instructions including addr (exclude breakpoint).
@@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
  
  /* Optimization staging list, protected by kprobe_mutex */
  static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
  
  static void kprobe_optimizer(struct work_struct *work);
  static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+static DECLARE_COMPLETION(optimizer_comp);
  #define OPTIMIZE_DELAY 5
  
-/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+/*
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static __kprobes void do_optimize_kprobes(void)
  {
-       struct optimized_kprobe *op, *tmp;
-
-       /* Lock modules while optimizing kprobes */
-       mutex_lock(&module_mutex);
-       mutex_lock(&kprobe_mutex);
-       if (kprobes_all_disarmed || !kprobes_allow_optimization)
-               goto end;
-
-       /*
-        * Wait for quiesence period to ensure all running interrupts
-        * are done. Because optprobe may modify multiple instructions
-        * there is a chance that Nth instruction is interrupted. In that
-        * case, running interrupt can return to 2nd-Nth byte of jump
-        * instruction. This wait is for avoiding it.
-        */
-       synchronize_sched();
+       /* Optimization is never done when disarmed */
+       if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+           list_empty(&optimizing_list))
+               return;
  
         /*
          * The optimization/unoptimization refers online_cpus via
@@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
          */
         get_online_cpus();
         mutex_lock(&text_mutex);
-       list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
-               WARN_ON(kprobe_disabled(&op->kp));
-               if (arch_optimize_kprobe(op) < 0)
-                       op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
-               list_del_init(&op->list);
+       arch_optimize_kprobes(&optimizing_list);
+       mutex_unlock(&text_mutex);
+       put_online_cpus();
+}
+
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if needed) kprobes listed on unoptimizing_list.
+ */
+static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+{
+       struct optimized_kprobe *op, *tmp;
+
+       /* Unoptimization must be done anytime */
+       if (list_empty(&unoptimizing_list))
+               return;
+
+       /* Ditto to do_optimize_kprobes */
+       get_online_cpus();
+       mutex_lock(&text_mutex);
+       arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+       /* Loop free_list for disarming */
+       list_for_each_entry_safe(op, tmp, free_list, list) {
+               /* Disarm probes if marked disabled */
+               if (kprobe_disabled(&op->kp))
+                       arch_disarm_kprobe(&op->kp);
+               if (kprobe_unused(&op->kp)) {
+                       /*
+                        * Remove unused probes from hash list. After waiting
+                        * for synchronization, these probes are reclaimed.
+                        * (reclaiming is done by do_free_cleaned_kprobes.)
+                        */
+                       hlist_del_rcu(&op->kp.hlist);
+               } else
+                       list_del_init(&op->list);
         }
         mutex_unlock(&text_mutex);
         put_online_cpus();
-end:
+}
+
+/* Reclaim all kprobes on the free_list */
+static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+{
+       struct optimized_kprobe *op, *tmp;
+
+       list_for_each_entry_safe(op, tmp, free_list, list) {
+               BUG_ON(!kprobe_unused(&op->kp));
+               list_del_init(&op->list);
+               free_aggr_kprobe(&op->kp);
+       }
+}
+
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static __kprobes void kick_kprobe_optimizer(void)
+{
+       if (!delayed_work_pending(&optimizing_work))
+               schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+       LIST_HEAD(free_list);
+
+       /* Lock modules while optimizing kprobes */
+       mutex_lock(&module_mutex);
+       mutex_lock(&kprobe_mutex);
+
+       /*
+        * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+        * kprobes before waiting for quiescence period.
+        */
+       do_unoptimize_kprobes(&free_list);
+
+       /*
+        * Step 2: Wait for quiescence period to ensure all running interrupts
+        * are done. Because optprobe may modify multiple instructions
+        * there is a chance that Nth instruction is interrupted. In that
+        * case, running interrupt can return to 2nd-Nth byte of jump
+        * instruction. This wait is for avoiding it.
+        */
+       synchronize_sched();
+
+       /* Step 3: Optimize kprobes after quiescence period */
+       do_optimize_kprobes();
+
+       /* Step 4: Free cleaned kprobes after quiescence period */
+       do_free_cleaned_kprobes(&free_list);
+
         mutex_unlock(&kprobe_mutex);
         mutex_unlock(&module_mutex);
+
+       /* Step 5: Kick optimizer again if needed */
+       if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+               kick_kprobe_optimizer();
+       else
+               /* Wake up all waiters */
+               complete_all(&optimizer_comp);
+}
+
+/* Wait for optimization and unoptimization to complete */
+static __kprobes void wait_for_kprobe_optimizer(void)
+{
+       if (delayed_work_pending(&optimizing_work))
+               wait_for_completion(&optimizer_comp);
  }
  
  /* Optimize kprobe if p is ready to be optimized */
@@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
         /* Check if it is already optimized. */
         if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
                 return;
-
         op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
-       list_add(&op->list, &optimizing_list);
-       if (!delayed_work_pending(&optimizing_work))
-               schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+
+       if (!list_empty(&op->list))
+               /* This is under unoptimizing. Just dequeue the probe */
+               list_del_init(&op->list);
+       else {
+               list_add(&op->list, &optimizing_list);
+               kick_kprobe_optimizer();
+       }
+}
+
+/* Shortcut: unoptimize the probe directly instead of queueing it */
+static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+       get_online_cpus();
+       arch_unoptimize_kprobe(op);
+       put_online_cpus();
+       if (kprobe_disabled(&op->kp))
+               arch_disarm_kprobe(&op->kp);
  }
  
  /* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p)
+static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
  {
         struct optimized_kprobe *op;
  
-       if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
-               op = container_of(p, struct optimized_kprobe, kp);
-               if (!list_empty(&op->list))
-                       /* Dequeue from the optimization queue */
+       if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
+               return; /* This is not an optprobe nor optimized */
+
+       op = container_of(p, struct optimized_kprobe, kp);
+       if (!kprobe_optimized(p)) {
+               /* Unoptimized or unoptimizing case */
+               if (force && !list_empty(&op->list)) {
+                       /*
+                        * Only if this is unoptimizing kprobe and forced,
+                        * forcibly unoptimize it. (No need to unoptimize
+                        * unoptimized kprobe again :)
+                        */
                         list_del_init(&op->list);
-               else
-                       /* Replace jump with break */
-                       arch_unoptimize_kprobe(op);
-               op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+                       force_unoptimize_kprobe(op);
+               }
+               return;
+       }
+
+       op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+       if (!list_empty(&op->list)) {
+               /* Dequeue from the optimization queue */
+               list_del_init(&op->list);
+               return;
+       }
+       /* Optimized kprobe case */
+       if (force)
+               /* Forcibly update the code: this is a special case */
+               force_unoptimize_kprobe(op);
+       else {
+               list_add(&op->list, &unoptimizing_list);
+               kick_kprobe_optimizer();
         }
  }
  
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+       struct optimized_kprobe *op;
+
+       BUG_ON(!kprobe_unused(ap));
+       /*
+        * An unused kprobe MUST be queued for delayed unoptimizing (meaning
+        * there is still a relative jump) and disabled.
+        */
+       op = container_of(ap, struct optimized_kprobe, kp);
+       if (unlikely(list_empty(&op->list)))
+               printk(KERN_WARNING "Warning: found a stray unused "
+                       "aggrprobe@%p\n", ap->addr);
+       /* Enable the probe again */
+       ap->flags &= ~KPROBE_FLAG_DISABLED;
+       /* Optimize it again (remove from op->list) */
+       BUG_ON(!kprobe_optready(ap));
+       optimize_kprobe(ap);
+}
+
  /* Remove optimized instructions */
  static void __kprobes kill_optimized_kprobe(struct kprobe *p)
  {
         struct optimized_kprobe *op;
  
         op = container_of(p, struct optimized_kprobe, kp);
-       if (!list_empty(&op->list)) {
-               /* Dequeue from the optimization queue */
+       if (!list_empty(&op->list))
+               /* Dequeue from the (un)optimization queue */
                 list_del_init(&op->list);
-               op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
-       }
-       /* Don't unoptimize, because the target code will be freed. */
+
+       op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+       /* Don't touch the code, because it is already freed. */
         arch_remove_optimized_kprobe(op);
  }
  
@@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
         arch_prepare_optimized_kprobe(op);
  }
  
-/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
-{
-       struct optimized_kprobe *op;
-
-       op = container_of(p, struct optimized_kprobe, kp);
-       arch_remove_optimized_kprobe(op);
-       kfree(op);
-}
-
  /* Allocate new optimized_kprobe and try to prepare optimized instructions */
  static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
  {
@@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
         op = container_of(ap, struct optimized_kprobe, kp);
         if (!arch_prepared_optinsn(&op->optinsn)) {
                 /* If failed to setup optimizing, fallback to kprobe */
-               free_aggr_kprobe(ap);
+               arch_remove_optimized_kprobe(op);
+               kfree(op);
                 return;
         }
  
@@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void)
                 return;
  
         kprobes_allow_optimization = false;
-       printk(KERN_INFO "Kprobes globally unoptimized\n");
-       get_online_cpus();      /* For avoiding text_mutex deadlock */
-       mutex_lock(&text_mutex);
         for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                 head = &kprobe_table[i];
                 hlist_for_each_entry_rcu(p, node, head, hlist) {
                         if (!kprobe_disabled(p))
-                               unoptimize_kprobe(p);
+                               unoptimize_kprobe(p, false);
                 }
         }
-
-       mutex_unlock(&text_mutex);
-       put_online_cpus();
-       /* Allow all currently running kprobes to complete */
-       synchronize_sched();
+       /* Wait for unoptimizing completion */
+       wait_for_kprobe_optimizer();
+       printk(KERN_INFO "Kprobes globally unoptimized\n");
  }
  
  int sysctl_kprobes_optimization;
@@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
  }
  #endif /* CONFIG_SYSCTL */
  
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
  static void __kprobes __arm_kprobe(struct kprobe *p)
  {
-       struct kprobe *old_p;
+       struct kprobe *_p;
  
         /* Check collision with other optimized kprobes */
-       old_p = get_optimized_kprobe((unsigned long)p->addr);
-       if (unlikely(old_p))
-               unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+       _p = get_optimized_kprobe((unsigned long)p->addr);
+       if (unlikely(_p))
+               /* Fallback to unoptimized kprobe */
+               unoptimize_kprobe(_p, true);
  
         arch_arm_kprobe(p);
         optimize_kprobe(p);     /* Try to optimize (add kprobe to a list) */
  }
  
-static void __kprobes __disarm_kprobe(struct kprobe *p)
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
  {
-       struct kprobe *old_p;
+       struct kprobe *_p;
  
-       unoptimize_kprobe(p);   /* Try to unoptimize */
-       arch_disarm_kprobe(p);
+       unoptimize_kprobe(p, false);    /* Try to unoptimize */
  
-       /* If another kprobe was blocked, optimize it. */
-       old_p = get_optimized_kprobe((unsigned long)p->addr);
-       if (unlikely(old_p))
-               optimize_kprobe(old_p);
+       if (!kprobe_queued(p)) {
+               arch_disarm_kprobe(p);
+               /* If another kprobe was blocked, optimize it. */
+               _p = get_optimized_kprobe((unsigned long)p->addr);
+               if (unlikely(_p) && reopt)
+                       optimize_kprobe(_p);
+       }
+       /* TODO: reoptimize others after this probe has been unoptimized */
  }
  
  #else /* !CONFIG_OPTPROBES */
  
  #define optimize_kprobe(p)                     do {} while (0)
-#define unoptimize_kprobe(p)                   do {} while (0)
+#define unoptimize_kprobe(p, f)                        do {} while (0)
  #define kill_optimized_kprobe(p)               do {} while (0)
  #define prepare_optimized_kprobe(p)            do {} while (0)
  #define try_to_optimize_kprobe(p)              do {} while (0)
  #define __arm_kprobe(p)                                arch_arm_kprobe(p)
-#define __disarm_kprobe(p)                     arch_disarm_kprobe(p)
+#define __disarm_kprobe(p, o)                  arch_disarm_kprobe(p)
+#define kprobe_disarmed(p)                     kprobe_disabled(p)
+#define wait_for_kprobe_optimizer()            do {} while (0)
+
+/* Without optimization there should be no unused kprobes to reuse */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+       printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+       BUG_ON(kprobe_unused(ap));
+}
  
  static __kprobes void free_aggr_kprobe(struct kprobe *p)
  {
+       arch_remove_kprobe(p);
         kfree(p);
  }
  
@@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
  /* Disarm a kprobe with text_mutex */
  static void __kprobes disarm_kprobe(struct kprobe *kp)
  {
-       get_online_cpus();      /* For avoiding text_mutex deadlock */
+       /* Ditto */
         mutex_lock(&text_mutex);
-       __disarm_kprobe(kp);
+       __disarm_kprobe(kp, true);
         mutex_unlock(&text_mutex);
-       put_online_cpus();
  }
  
  /*
@@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
         BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
  
         if (p->break_handler || p->post_handler)
-               unoptimize_kprobe(ap);  /* Fall back to normal kprobe */
+               unoptimize_kprobe(ap, true);    /* Fall back to normal kprobe */
  
         if (p->break_handler) {
                 if (ap->break_handler)
@@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
   * This is the second or subsequent kprobe at the address - handle
   * the intricacies
   */
-static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
                                           struct kprobe *p)
  {
         int ret = 0;
-       struct kprobe *ap = old_p;
+       struct kprobe *ap = orig_p;
  
-       if (!kprobe_aggrprobe(old_p)) {
-               /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
-               ap = alloc_aggr_kprobe(old_p);
+       if (!kprobe_aggrprobe(orig_p)) {
+               /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+               ap = alloc_aggr_kprobe(orig_p);
                 if (!ap)
                         return -ENOMEM;
-               init_aggr_kprobe(ap, old_p);
-       }
+               init_aggr_kprobe(ap, orig_p);
+       } else if (kprobe_unused(ap))
+               /* This probe is going to die. Rescue it */
+               reuse_unused_kprobe(ap);
  
         if (kprobe_gone(ap)) {
                 /*
@@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
         return add_new_kprobe(ap, p);
  }
  
-/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
-static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
-{
-       struct kprobe *kp;
-
-       list_for_each_entry_rcu(kp, &p->list, list) {
-               if (!kprobe_disabled(kp))
-                       /*
-                        * There is an active probe on the list.
-                        * We can't disable aggr_kprobe.
-                        */
-                       return 0;
-       }
-       p->flags |= KPROBE_FLAG_DISABLED;
-       return 1;
-}
-
  static int __kprobes in_kprobes_functions(unsigned long addr)
  {
         struct kprobe_blackpoint *kb;
@@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
  /* Check passed kprobe is valid and return kprobe in kprobe_table. */
  static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
  {
-       struct kprobe *old_p, *list_p;
+       struct kprobe *ap, *list_p;
  
-       old_p = get_kprobe(p->addr);
-       if (unlikely(!old_p))
+       ap = get_kprobe(p->addr);
+       if (unlikely(!ap))
                 return NULL;
  
-       if (p != old_p) {
-               list_for_each_entry_rcu(list_p, &old_p->list, list)
+       if (p != ap) {
+               list_for_each_entry_rcu(list_p, &ap->list, list)
                         if (list_p == p)
                         /* kprobe p is a valid probe */
                                 goto valid;
                 return NULL;
         }
  valid:
-       return old_p;
+       return ap;
  }
  
  /* Return error if the kprobe is being re-registered */
  static inline int check_kprobe_rereg(struct kprobe *p)
  {
         int ret = 0;
-       struct kprobe *old_p;
  
         mutex_lock(&kprobe_mutex);
-       old_p = __get_valid_kprobe(p);
-       if (old_p)
+       if (__get_valid_kprobe(p))
                 ret = -EINVAL;
         mutex_unlock(&kprobe_mutex);
+
         return ret;
  }
  
@@ -1229,67 +1403,121 @@ fail_with_jump_label:
  }
  EXPORT_SYMBOL_GPL(register_kprobe);
  
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+       struct kprobe *kp;
+
+       list_for_each_entry_rcu(kp, &ap->list, list)
+               if (!kprobe_disabled(kp))
+                       /*
+                        * There is an active probe on the list.
+                        * We can't disable this ap.
+                        */
+                       return 0;
+
+       return 1;
+}
+
+/* Disable one kprobe: must be called with kprobe_mutex held */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+       struct kprobe *orig_p;
+
+       /* Get an original kprobe for return */
+       orig_p = __get_valid_kprobe(p);
+       if (unlikely(orig_p == NULL))
+               return NULL;
+
+       if (!kprobe_disabled(p)) {
+               /* Disable probe if it is a child probe */
+               if (p != orig_p)
+                       p->flags |= KPROBE_FLAG_DISABLED;
+
+               /* Try to disarm and disable this/parent probe */
+               if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+                       disarm_kprobe(orig_p);
+                       orig_p->flags |= KPROBE_FLAG_DISABLED;
+               }
+       }
+
+       return orig_p;
+}
+
  /*
   * Unregister a kprobe without a scheduler synchronization.
   */
  static int __kprobes __unregister_kprobe_top(struct kprobe *p)
  {
-       struct kprobe *old_p, *list_p;
+       struct kprobe *ap, *list_p;
  
-       old_p = __get_valid_kprobe(p);
-       if (old_p == NULL)
+       /* Disable kprobe. This will disarm it if needed. */
+       ap = __disable_kprobe(p);
+       if (ap == NULL)
                 return -EINVAL;
  
-       if (old_p == p ||
-           (kprobe_aggrprobe(old_p) &&
-            list_is_singular(&old_p->list))) {
+       if (ap == p)
                 /*
-                * Only probe on the hash list. Disarm only if kprobes are
-                * enabled and not gone - otherwise, the breakpoint would
-                * already have been removed. We save on flushing icache.
+                * This probe is an independent (and non-optimized) kprobe
+                * (not an aggrprobe). Remove from the hash list.
                  */
-               if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
-                       disarm_kprobe(old_p);
-               hlist_del_rcu(&old_p->hlist);
-       } else {
+               goto disarmed;
+
+       /* The code below expects this probe to be an aggrprobe */
+       WARN_ON(!kprobe_aggrprobe(ap));
+
+       if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+               /*
+                * !disarmed can happen if the probe is under delayed
+                * unoptimizing.
+                */
+               goto disarmed;
+       else {
+               /* If disabling probe has special handlers, update aggrprobe */
                 if (p->break_handler && !kprobe_gone(p))
-                       old_p->break_handler = NULL;
+                       ap->break_handler = NULL;
                 if (p->post_handler && !kprobe_gone(p)) {
-                       list_for_each_entry_rcu(list_p, &old_p->list, list) {
+                       list_for_each_entry_rcu(list_p, &ap->list, list) {
                                 if ((list_p != p) && (list_p->post_handler))
                                         goto noclean;
                         }
-                       old_p->post_handler = NULL;
+                       ap->post_handler = NULL;
                 }
  noclean:
+               /*
+                * Remove from the aggrprobe: this path will do nothing in
+                * __unregister_kprobe_bottom().
+                */
                 list_del_rcu(&p->list);
-               if (!kprobe_disabled(old_p)) {
-                       try_to_disable_aggr_kprobe(old_p);
-                       if (!kprobes_all_disarmed) {
-                               if (kprobe_disabled(old_p))
-                                       disarm_kprobe(old_p);
-                               else
-                                       /* Try to optimize this probe again */
-                                       optimize_kprobe(old_p);
-                       }
-               }
+               if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+                       /*
+                        * Try to optimize this probe again, because post
+                        * handler may have been changed.
+                        */
+                       optimize_kprobe(ap);
         }
         return 0;
+
+disarmed:
+       BUG_ON(!kprobe_disarmed(ap));
+       hlist_del_rcu(&ap->hlist);
+       return 0;
  }
  
  static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
  {
-       struct kprobe *old_p;
+       struct kprobe *ap;
  
         if (list_empty(&p->list))
+               /* This is an independent kprobe */
                 arch_remove_kprobe(p);
         else if (list_is_singular(&p->list)) {
-               /* "p" is the last child of an aggr_kprobe */
-               old_p = list_entry(p->list.next, struct kprobe, list);
+               /* This is the last child of an aggrprobe */
+               ap = list_entry(p->list.next, struct kprobe, list);
                 list_del(&p->list);
-               arch_remove_kprobe(old_p);
-               free_aggr_kprobe(old_p);
+               free_aggr_kprobe(ap);
         }
+       /* Otherwise, do nothing. */
  }
  
  int __kprobes register_kprobes(struct kprobe **kps, int num)
@@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
  int __kprobes disable_kprobe(struct kprobe *kp)
  {
         int ret = 0;
-       struct kprobe *p;
  
         mutex_lock(&kprobe_mutex);
  
-       /* Check whether specified probe is valid. */
-       p = __get_valid_kprobe(kp);
-       if (unlikely(p == NULL)) {
+       /* Disable this kprobe */
+       if (__disable_kprobe(kp) == NULL)
                 ret = -EINVAL;
-               goto out;
-       }
  
-       /* If the probe is already disabled (or gone), just return */
-       if (kprobe_disabled(kp))
-               goto out;
-
-       kp->flags |= KPROBE_FLAG_DISABLED;
-       if (p != kp)
-               /* When kp != p, p is always enabled. */
-               try_to_disable_aggr_kprobe(p);
-
-       if (!kprobes_all_disarmed && kprobe_disabled(p))
-               disarm_kprobe(p);
-out:
         mutex_unlock(&kprobe_mutex);
         return ret;
  }
@@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void)
         mutex_lock(&kprobe_mutex);
  
         /* If kprobes are already disarmed, just return */
-       if (kprobes_all_disarmed)
-               goto already_disabled;
+       if (kprobes_all_disarmed) {
+               mutex_unlock(&kprobe_mutex);
+               return;
+       }
  
         kprobes_all_disarmed = true;
         printk(KERN_INFO "Kprobes globally disabled\n");
  
-       /*
-        * Here we call get_online_cpus() for avoiding text_mutex deadlock,
-        * because disarming may also unoptimize kprobes.
-        */
-       get_online_cpus();
         mutex_lock(&text_mutex);
         for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                 head = &kprobe_table[i];
                 hlist_for_each_entry_rcu(p, node, head, hlist) {
                         if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
-                               __disarm_kprobe(p);
+                               __disarm_kprobe(p, false);
                 }
         }
-
         mutex_unlock(&text_mutex);
-       put_online_cpus();
         mutex_unlock(&kprobe_mutex);
-       /* Allow all currently running kprobes to complete */
-       synchronize_sched();
-       return;
  
-already_disabled:
-       mutex_unlock(&kprobe_mutex);
-       return;
+       /* Wait for disarming all kprobes by optimizer */
+       wait_for_kprobe_optimizer();
  }
  
  /*
diff --git a/kernel/kthread.c b/kernel/kthread.c

index ca61bbdd44b2e11acad4d866ab31224a5a5cab16..5355cfd44a3fd21cd767c13d053410ec338f2ede 100644 (file)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
         wait_for_completion(&create.done);
  
         if (!IS_ERR(create.result)) {
-               struct sched_param param = { .sched_priority = 0 };
+               static struct sched_param param = { .sched_priority = 0 };
                 va_list args;
  
                 va_start(args, namefmt);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c

index 59b76c8ce9d7172e8176f355da9719495077a133..1969d2fc4b36328cf48798620506ddcd0ec330d0 100644 (file)
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                 namelen += 2;
  
         for (i = 0; i < LOCKSTAT_POINTS; i++) {
-               char sym[KSYM_SYMBOL_LEN];
                 char ip[32];
  
                 if (class->contention_point[i] == 0)
@@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                 if (!i)
                         seq_line(m, '-', 40-namelen, namelen);
  
-               sprint_symbol(sym, class->contention_point[i]);
                 snprintf(ip, sizeof(ip), "[<%p>]",
                                 (void *)class->contention_point[i]);
-               seq_printf(m, "%40s %14lu %29s %s\n", name,
-                               stats->contention_point[i],
-                               ip, sym);
+               seq_printf(m, "%40s %14lu %29s %pS\n",
+                          name, stats->contention_point[i],
+                          ip, (void *)class->contention_point[i]);
         }
         for (i = 0; i < LOCKSTAT_POINTS; i++) {
-               char sym[KSYM_SYMBOL_LEN];
                 char ip[32];
  
                 if (class->contending_point[i] == 0)
@@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                 if (!i)
                         seq_line(m, '-', 40-namelen, namelen);
  
-               sprint_symbol(sym, class->contending_point[i]);
                 snprintf(ip, sizeof(ip), "[<%p>]",
                                 (void *)class->contending_point[i]);
-               seq_printf(m, "%40s %14lu %29s %s\n", name,
-                               stats->contending_point[i],
-                               ip, sym);
+               seq_printf(m, "%40s %14lu %29s %pS\n",
+                          name, stats->contending_point[i],
+                          ip, (void *)class->contending_point[i]);
         }
         if (i) {
                 seq_puts(m, "\n");
diff --git a/kernel/module.c b/kernel/module.c

index d190664f25ff3fa10dca29f37b483f08ad07eae1..34e00b708fad2c79b260ab3d8d4cc199cece8eca 100644 (file)
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,6 +56,7 @@
  #include <linux/percpu.h>
  #include <linux/kmemleak.h>
  #include <linux/jump_label.h>
+#include <linux/pfn.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/module.h>
@@ -70,6 +71,26 @@
  #define ARCH_SHF_SMALL 0
  #endif
  
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ */
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/*
+ * Given BASE and SIZE this macro calculates the number of pages the
+ * memory region occupies
+ */
+#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ?                \
+               (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
+                        PFN_DOWN((unsigned long)BASE) + 1)     \
+               : (0UL))
+
  /* If this is set, the section belongs in the init part of the module */
  #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
  
@@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod)
         return 0;
  }
  
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ */
+void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+{
+       unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
+       unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+
+       if (end_pfn > begin_pfn)
+               set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+}
+
+static void set_section_ro_nx(void *base,
+                       unsigned long text_size,
+                       unsigned long ro_size,
+                       unsigned long total_size)
+{
+       /* begin and end PFNs of the current subsection */
+       unsigned long begin_pfn;
+       unsigned long end_pfn;
+
+       /*
+        * Set RO for module text and RO-data:
+        * - Always protect first page.
+        * - Do not protect last partial page.
+        */
+       if (ro_size > 0)
+               set_page_attributes(base, base + ro_size, set_memory_ro);
+
+       /*
+        * Set NX permissions for module data:
+        * - Do not protect first partial page.
+        * - Always protect last page.
+        */
+       if (total_size > text_size) {
+               begin_pfn = PFN_UP((unsigned long)base + text_size);
+               end_pfn = PFN_UP((unsigned long)base + total_size);
+               if (end_pfn > begin_pfn)
+                       set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+       }
+}
+
+/* Setting memory back to RW+NX before releasing it */
+void unset_section_ro_nx(struct module *mod, void *module_region)
+{
+       unsigned long total_pages;
+
+       if (mod->module_core == module_region) {
+               /* Set core as NX+RW */
+               total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
+               set_memory_nx((unsigned long)mod->module_core, total_pages);
+               set_memory_rw((unsigned long)mod->module_core, total_pages);
+
+       } else if (mod->module_init == module_region) {
+               /* Set init as NX+RW */
+               total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
+               set_memory_nx((unsigned long)mod->module_init, total_pages);
+               set_memory_rw((unsigned long)mod->module_init, total_pages);
+       }
+}
+
+/* Under module_mutex, set every module's core and init text as RW */
+void set_all_modules_text_rw(void)
+{
+       struct module *mod;
+
+       mutex_lock(&module_mutex);
+       list_for_each_entry_rcu(mod, &modules, list) {
+               if ((mod->module_core) && (mod->core_text_size)) {
+                       set_page_attributes(mod->module_core,
+                                               mod->module_core + mod->core_text_size,
+                                               set_memory_rw);
+               }
+               if ((mod->module_init) && (mod->init_text_size)) {
+                       set_page_attributes(mod->module_init,
+                                               mod->module_init + mod->init_text_size,
+                                               set_memory_rw);
+               }
+       }
+       mutex_unlock(&module_mutex);
+}
+
+/* Under module_mutex, set every module's core and init text as RO */
+void set_all_modules_text_ro(void)
+{
+       struct module *mod;
+
+       mutex_lock(&module_mutex);
+       list_for_each_entry_rcu(mod, &modules, list) {
+               if ((mod->module_core) && (mod->core_text_size)) {
+                       set_page_attributes(mod->module_core,
+                                               mod->module_core + mod->core_text_size,
+                                               set_memory_ro);
+               }
+               if ((mod->module_init) && (mod->init_text_size)) {
+                       set_page_attributes(mod->module_init,
+                                               mod->module_init + mod->init_text_size,
+                                               set_memory_ro);
+               }
+       }
+       mutex_unlock(&module_mutex);
+}
+#else
+static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
+static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
+#endif
+
  /* Free a module, remove from lists, etc. */
  static void free_module(struct module *mod)
  {
@@ -1566,6 +1696,7 @@ static void free_module(struct module *mod)
         destroy_params(mod->kp, mod->num_kp);
  
         /* This may be NULL, but that's OK */
+       unset_section_ro_nx(mod, mod->module_init);
         module_free(mod, mod->module_init);
         kfree(mod->args);
         percpu_modfree(mod);
@@ -1574,6 +1705,7 @@ static void free_module(struct module *mod)
         lockdep_free_key_range(mod->module_core, mod->core_size);
  
         /* Finally, free the core (containing the module structure) */
+       unset_section_ro_nx(mod, mod->module_core);
         module_free(mod, mod->module_core);
  
  #ifdef CONFIG_MPU
@@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
                         s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
                         DEBUGP("\t%s\n", name);
                 }
-               if (m == 0)
+               switch (m) {
+               case 0: /* executable */
+                       mod->core_size = debug_align(mod->core_size);
                         mod->core_text_size = mod->core_size;
+                       break;
+               case 1: /* RO: text and ro-data */
+                       mod->core_size = debug_align(mod->core_size);
+                       mod->core_ro_size = mod->core_size;
+                       break;
+               case 3: /* whole core */
+                       mod->core_size = debug_align(mod->core_size);
+                       break;
+               }
         }
  
         DEBUGP("Init section allocation order:\n");
@@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info)
                                          | INIT_OFFSET_MASK);
                         DEBUGP("\t%s\n", sname);
                 }
-               if (m == 0)
+               switch (m) {
+               case 0: /* executable */
+                       mod->init_size = debug_align(mod->init_size);
                         mod->init_text_size = mod->init_size;
+                       break;
+               case 1: /* RO: text and ro-data */
+                       mod->init_size = debug_align(mod->init_size);
+                       mod->init_ro_size = mod->init_size;
+                       break;
+               case 3: /* whole init */
+                       mod->init_size = debug_align(mod->init_size);
+                       break;
+               }
         }
  }
  
@@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
         blocking_notifier_call_chain(&module_notify_list,
                         MODULE_STATE_COMING, mod);
  
+       /* Set RO and NX regions for core */
+       set_section_ro_nx(mod->module_core,
+                               mod->core_text_size,
+                               mod->core_ro_size,
+                               mod->core_size);
+
+       /* Set RO and NX regions for init */
+       set_section_ro_nx(mod->module_init,
+                               mod->init_text_size,
+                               mod->init_ro_size,
+                               mod->init_size);
+
         do_mod_ctors(mod);
         /* Start the module */
         if (mod->init != NULL)
@@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
         mod->symtab = mod->core_symtab;
         mod->strtab = mod->core_strtab;
  #endif
+       unset_section_ro_nx(mod, mod->module_init);
         module_free(mod, mod->module_init);
         mod->module_init = NULL;
         mod->init_size = 0;
diff --git a/kernel/mutex.c b/kernel/mutex.c

index 200407c1502f509ee3f9d8a665bc4d3b78a27f74..a5889fb28ecff33eaf5fae64c9d2a50ca03cb2f7 100644 (file)
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
                  * memory barriers as we'll eventually observe the right
                  * values at the cost of a few extra spins.
                  */
-               cpu_relax();
+               arch_mutex_cpu_relax();
         }
  #endif
         spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c

index 2870feee81dd7a046703645c9ec50022d4339f39..11847bf1e8cc254db7f2a2a255511fd36eea4a68 100644 (file)
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
  #include <linux/mm.h>
  #include <linux/cpu.h>
  #include <linux/smp.h>
+#include <linux/idr.h>
  #include <linux/file.h>
  #include <linux/poll.h>
  #include <linux/slab.h>
@@ -21,7 +22,9 @@
  #include <linux/dcache.h>
  #include <linux/percpu.h>
  #include <linux/ptrace.h>
+#include <linux/reboot.h>
  #include <linux/vmstat.h>
+#include <linux/device.h>
  #include <linux/vmalloc.h>
  #include <linux/hardirq.h>
  #include <linux/rculist.h>
@@ -133,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
         }
  }
  
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+       /*
+        * only top level events have the pid namespace they were created in
+        */
+       if (event->parent)
+               event = event->parent;
+
+       return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+       /*
+        * only top level events have the pid namespace they were created in
+        */
+       if (event->parent)
+               event = event->parent;
+
+       return task_pid_nr_ns(p, event->ns);
+}
+
  /*
   * If we inherit events we want to return the parent event id
   * to userspace.
@@ -312,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
                 ctx->nr_stat++;
  }
  
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+       int entry = sizeof(u64); /* value */
+       int size = 0;
+       int nr = 1;
+
+       if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+               size += sizeof(u64);
+
+       if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+               size += sizeof(u64);
+
+       if (event->attr.read_format & PERF_FORMAT_ID)
+               entry += sizeof(u64);
+
+       if (event->attr.read_format & PERF_FORMAT_GROUP) {
+               nr += event->group_leader->nr_siblings;
+               size += sizeof(u64);
+       }
+
+       size += entry * nr;
+       event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+       struct perf_sample_data *data;
+       u64 sample_type = event->attr.sample_type;
+       u16 size = 0;
+
+       perf_event__read_size(event);
+
+       if (sample_type & PERF_SAMPLE_IP)
+               size += sizeof(data->ip);
+
+       if (sample_type & PERF_SAMPLE_ADDR)
+               size += sizeof(data->addr);
+
+       if (sample_type & PERF_SAMPLE_PERIOD)
+               size += sizeof(data->period);
+
+       if (sample_type & PERF_SAMPLE_READ)
+               size += event->read_size;
+
+       event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+       struct perf_sample_data *data;
+       u64 sample_type = event->attr.sample_type;
+       u16 size = 0;
+
+       if (sample_type & PERF_SAMPLE_TID)
+               size += sizeof(data->tid_entry);
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               size += sizeof(data->time);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               size += sizeof(data->id);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               size += sizeof(data->stream_id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               size += sizeof(data->cpu_entry);
+
+       event->id_header_size = size;
+}
+
  static void perf_group_attach(struct perf_event *event)
  {
-       struct perf_event *group_leader = event->group_leader;
+       struct perf_event *group_leader = event->group_leader, *pos;
  
         /*
          * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +433,11 @@ static void perf_group_attach(struct perf_event *event)
  
         list_add_tail(&event->group_entry, &group_leader->sibling_list);
         group_leader->nr_siblings++;
+
+       perf_event__header_size(group_leader);
+
+       list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+               perf_event__header_size(pos);
  }
  
  /*
@@ -391,7 +496,7 @@ static void perf_group_detach(struct perf_event *event)
         if (event->group_leader != event) {
                 list_del_init(&event->group_entry);
                 event->group_leader->nr_siblings--;
-               return;
+               goto out;
         }
  
         if (!list_empty(&event->group_entry))
@@ -410,6 +515,12 @@ static void perf_group_detach(struct perf_event *event)
                 /* Inherit group flags from the previous leader */
                 sibling->group_flags = event->group_flags;
         }
+
+out:
+       perf_event__header_size(event->group_leader);
+
+       list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+               perf_event__header_size(tmp);
  }
  
  static inline int
@@ -1073,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
         /*
          * not supported on inherited events
          */
-       if (event->attr.inherit)
+       if (event->attr.inherit || !is_sampling_event(event))
                 return -EINVAL;
  
         atomic_add(refresh, &event->event_limit);
@@ -2289,31 +2400,6 @@ static int perf_release(struct inode *inode, struct file *file)
         return perf_event_release_kernel(event);
  }
  
-static int perf_event_read_size(struct perf_event *event)
-{
-       int entry = sizeof(u64); /* value */
-       int size = 0;
-       int nr = 1;
-
-       if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               size += sizeof(u64);
-
-       if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               size += sizeof(u64);
-
-       if (event->attr.read_format & PERF_FORMAT_ID)
-               entry += sizeof(u64);
-
-       if (event->attr.read_format & PERF_FORMAT_GROUP) {
-               nr += event->group_leader->nr_siblings;
-               size += sizeof(u64);
-       }
-
-       size += entry * nr;
-
-       return size;
-}
-
  u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
  {
         struct perf_event *child;
@@ -2428,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
         if (event->state == PERF_EVENT_STATE_ERROR)
                 return 0;
  
-       if (count < perf_event_read_size(event))
+       if (count < event->read_size)
                 return -ENOSPC;
  
         WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
         int ret = 0;
         u64 value;
  
-       if (!event->attr.sample_period)
+       if (!is_sampling_event(event))
                 return -EINVAL;
  
         if (copy_from_user(&value, arg, sizeof(value)))
@@ -3305,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
         } while (len);
  }
  
+static void __perf_event_header__init_id(struct perf_event_header *header,
+                                        struct perf_sample_data *data,
+                                        struct perf_event *event)
+{
+       u64 sample_type = event->attr.sample_type;
+
+       data->type = sample_type;
+       header->size += event->id_header_size;
+
+       if (sample_type & PERF_SAMPLE_TID) {
+               /* namespace issues */
+               data->tid_entry.pid = perf_event_pid(event, current);
+               data->tid_entry.tid = perf_event_tid(event, current);
+       }
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               data->time = perf_clock();
+
+       if (sample_type & PERF_SAMPLE_ID)
+               data->id = primary_event_id(event);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               data->stream_id = event->id;
+
+       if (sample_type & PERF_SAMPLE_CPU) {
+               data->cpu_entry.cpu      = raw_smp_processor_id();
+               data->cpu_entry.reserved = 0;
+       }
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+                                      struct perf_sample_data *data,
+                                      struct perf_event *event)
+{
+       if (event->attr.sample_id_all)
+               __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+                                          struct perf_sample_data *data)
+{
+       u64 sample_type = data->type;
+
+       if (sample_type & PERF_SAMPLE_TID)
+               perf_output_put(handle, data->tid_entry);
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               perf_output_put(handle, data->time);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               perf_output_put(handle, data->id);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               perf_output_put(handle, data->stream_id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+                                        struct perf_output_handle *handle,
+                                        struct perf_sample_data *sample)
+{
+       if (event->attr.sample_id_all)
+               __perf_event__output_id_sample(handle, sample);
+}
+
  int perf_output_begin(struct perf_output_handle *handle,
                       struct perf_event *event, unsigned int size,
                       int nmi, int sample)
@@ -3312,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle,
         struct perf_buffer *buffer;
         unsigned long tail, offset, head;
         int have_lost;
+       struct perf_sample_data sample_data;
         struct {
                 struct perf_event_header header;
                 u64                      id;
@@ -3338,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle,
                 goto out;
  
         have_lost = local_read(&buffer->lost);
-       if (have_lost)
-               size += sizeof(lost_event);
+       if (have_lost) {
+               lost_event.header.size = sizeof(lost_event);
+               perf_event_header__init_id(&lost_event.header, &sample_data,
+                                          event);
+               size += lost_event.header.size;
+       }
  
         perf_output_get_handle(handle);
  
@@ -3370,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle,
         if (have_lost) {
                 lost_event.header.type = PERF_RECORD_LOST;
                 lost_event.header.misc = 0;
-               lost_event.header.size = sizeof(lost_event);
                 lost_event.id          = event->id;
                 lost_event.lost        = local_xchg(&buffer->lost, 0);
  
                 perf_output_put(handle, lost_event);
+               perf_event__output_id_sample(event, handle, &sample_data);
         }
  
         return 0;
@@ -3407,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle)
         rcu_read_unlock();
  }
  
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
-       /*
-        * only top level events have the pid namespace they were created in
-        */
-       if (event->parent)
-               event = event->parent;
-
-       return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
-       /*
-        * only top level events have the pid namespace they were created in
-        */
-       if (event->parent)
-               event = event->parent;
-
-       return task_pid_nr_ns(p, event->ns);
-}
-
  static void perf_output_read_one(struct perf_output_handle *handle,
                                  struct perf_event *event,
                                  u64 enabled, u64 running)
@@ -3603,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header,
  {
         u64 sample_type = event->attr.sample_type;
  
-       data->type = sample_type;
-
         header->type = PERF_RECORD_SAMPLE;
-       header->size = sizeof(*header);
+       header->size = sizeof(*header) + event->header_size;
  
         header->misc = 0;
         header->misc |= perf_misc_flags(regs);
  
-       if (sample_type & PERF_SAMPLE_IP) {
-               data->ip = perf_instruction_pointer(regs);
-
-               header->size += sizeof(data->ip);
-       }
-
-       if (sample_type & PERF_SAMPLE_TID) {
-               /* namespace issues */
-               data->tid_entry.pid = perf_event_pid(event, current);
-               data->tid_entry.tid = perf_event_tid(event, current);
-
-               header->size += sizeof(data->tid_entry);
-       }
-
-       if (sample_type & PERF_SAMPLE_TIME) {
-               data->time = perf_clock();
-
-               header->size += sizeof(data->time);
-       }
-
-       if (sample_type & PERF_SAMPLE_ADDR)
-               header->size += sizeof(data->addr);
-
-       if (sample_type & PERF_SAMPLE_ID) {
-               data->id = primary_event_id(event);
-
-               header->size += sizeof(data->id);
-       }
-
-       if (sample_type & PERF_SAMPLE_STREAM_ID) {
-               data->stream_id = event->id;
-
-               header->size += sizeof(data->stream_id);
-       }
-
-       if (sample_type & PERF_SAMPLE_CPU) {
-               data->cpu_entry.cpu             = raw_smp_processor_id();
-               data->cpu_entry.reserved        = 0;
-
-               header->size += sizeof(data->cpu_entry);
-       }
-
-       if (sample_type & PERF_SAMPLE_PERIOD)
-               header->size += sizeof(data->period);
+       __perf_event_header__init_id(header, data, event);
  
-       if (sample_type & PERF_SAMPLE_READ)
-               header->size += perf_event_read_size(event);
+       if (sample_type & PERF_SAMPLE_IP)
+               data->ip = perf_instruction_pointer(regs);
  
         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                 int size = 1;
@@ -3722,23 +3813,26 @@ perf_event_read_event(struct perf_event *event,
                         struct task_struct *task)
  {
         struct perf_output_handle handle;
+       struct perf_sample_data sample;
         struct perf_read_event read_event = {
                 .header = {
                         .type = PERF_RECORD_READ,
                         .misc = 0,
-                       .size = sizeof(read_event) + perf_event_read_size(event),
+                       .size = sizeof(read_event) + event->read_size,
                 },
                 .pid = perf_event_pid(event, task),
                 .tid = perf_event_tid(event, task),
         };
         int ret;
  
+       perf_event_header__init_id(&read_event.header, &sample, event);
         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
         if (ret)
                 return;
  
         perf_output_put(&handle, read_event);
         perf_output_read(&handle, event);
+       perf_event__output_id_sample(event, &handle, &sample);
  
         perf_output_end(&handle);
  }
@@ -3768,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event,
                                      struct perf_task_event *task_event)
  {
         struct perf_output_handle handle;
+       struct perf_sample_data sample;
         struct task_struct *task = task_event->task;
-       int size, ret;
+       int ret, size = task_event->event_id.header.size;
  
-       size  = task_event->event_id.header.size;
-       ret = perf_output_begin(&handle, event, size, 0, 0);
+       perf_event_header__init_id(&task_event->event_id.header, &sample, event);
  
+       ret = perf_output_begin(&handle, event,
+                               task_event->event_id.header.size, 0, 0);
         if (ret)
-               return;
+               goto out;
  
         task_event->event_id.pid = perf_event_pid(event, task);
         task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event,
  
         perf_output_put(&handle, task_event->event_id);
  
+       perf_event__output_id_sample(event, &handle, &sample);
+
         perf_output_end(&handle);
+out:
+       task_event->event_id.header.size = size;
  }
  
  static int perf_event_task_match(struct perf_event *event)
@@ -3900,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event,
                                      struct perf_comm_event *comm_event)
  {
         struct perf_output_handle handle;
+       struct perf_sample_data sample;
         int size = comm_event->event_id.header.size;
-       int ret = perf_output_begin(&handle, event, size, 0, 0);
+       int ret;
+
+       perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+       ret = perf_output_begin(&handle, event,
+                               comm_event->event_id.header.size, 0, 0);
  
         if (ret)
-               return;
+               goto out;
  
         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3912,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event,
         perf_output_put(&handle, comm_event->event_id);
         perf_output_copy(&handle, comm_event->comm,
                                    comm_event->comm_size);
+
+       perf_event__output_id_sample(event, &handle, &sample);
+
         perf_output_end(&handle);
+out:
+       comm_event->event_id.header.size = size;
  }
  
  static int perf_event_comm_match(struct perf_event *event)
@@ -3957,7 +4067,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
         comm_event->comm_size = size;
  
         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
         rcu_read_lock();
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
@@ -4038,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event,
                                      struct perf_mmap_event *mmap_event)
  {
         struct perf_output_handle handle;
+       struct perf_sample_data sample;
         int size = mmap_event->event_id.header.size;
-       int ret = perf_output_begin(&handle, event, size, 0, 0);
+       int ret;
  
+       perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+       ret = perf_output_begin(&handle, event,
+                               mmap_event->event_id.header.size, 0, 0);
         if (ret)
-               return;
+               goto out;
  
         mmap_event->event_id.pid = perf_event_pid(event, current);
         mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4050,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event,
         perf_output_put(&handle, mmap_event->event_id);
         perf_output_copy(&handle, mmap_event->file_name,
                                    mmap_event->file_size);
+
+       perf_event__output_id_sample(event, &handle, &sample);
+
         perf_output_end(&handle);
+out:
+       mmap_event->event_id.header.size = size;
  }
  
  static int perf_event_mmap_match(struct perf_event *event,
@@ -4205,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
  static void perf_log_throttle(struct perf_event *event, int enable)
  {
         struct perf_output_handle handle;
+       struct perf_sample_data sample;
         int ret;
  
         struct {
@@ -4226,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
         if (enable)
                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
  
-       ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+       perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+       ret = perf_output_begin(&handle, event,
+                               throttle_event.header.size, 1, 0);
         if (ret)
                 return;
  
         perf_output_put(&handle, throttle_event);
+       perf_event__output_id_sample(event, &handle, &sample);
         perf_output_end(&handle);
  }
  
@@ -4246,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
         struct hw_perf_event *hwc = &event->hw;
         int ret = 0;
  
+       /*
+        * Non-sampling counters might still use the PMI to fold short
+        * hardware counters, ignore those.
+        */
+       if (unlikely(!is_sampling_event(event)))
+               return 0;
+
         if (!throttle) {
                 hwc->interrupts++;
         } else {
@@ -4391,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
         if (!regs)
                 return;
  
-       if (!hwc->sample_period)
+       if (!is_sampling_event(event))
                 return;
  
         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4554,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
         struct hw_perf_event *hwc = &event->hw;
         struct hlist_head *head;
  
-       if (hwc->sample_period) {
+       if (is_sampling_event(event)) {
                 hwc->last_period = hwc->sample_period;
                 perf_swevent_set_period(event);
         }
@@ -4811,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event)
         if (event->attr.type != PERF_TYPE_TRACEPOINT)
                 return -ENOENT;
  
-       /*
-        * Raw tracepoint data is a severe data leak, only allow root to
-        * have these.
-        */
-       if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
-                       perf_paranoid_tracepoint_raw() &&
-                       !capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
         err = perf_trace_init(event);
         if (err)
                 return err;
@@ -4842,7 +4963,7 @@ static struct pmu perf_tracepoint = {
  
  static inline void perf_tp_register(void)
  {
-       perf_pmu_register(&perf_tracepoint);
+       perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
  }
  
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4932,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
  static void perf_swevent_start_hrtimer(struct perf_event *event)
  {
         struct hw_perf_event *hwc = &event->hw;
+       s64 period;
+
+       if (!is_sampling_event(event))
+               return;
  
         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               s64 period = local64_read(&hwc->period_left);
  
-               if (period) {
-                       if (period < 0)
-                               period = 10000;
+       period = local64_read(&hwc->period_left);
+       if (period) {
+               if (period < 0)
+                       period = 10000;
  
-                       local64_set(&hwc->period_left, 0);
-               } else {
-                       period = max_t(u64, 10000, hwc->sample_period);
-               }
-               __hrtimer_start_range_ns(&hwc->hrtimer,
+               local64_set(&hwc->period_left, 0);
+       } else {
+               period = max_t(u64, 10000, hwc->sample_period);
+       }
+       __hrtimer_start_range_ns(&hwc->hrtimer,
                                 ns_to_ktime(period), 0,
                                 HRTIMER_MODE_REL_PINNED, 0);
-       }
  }
  
  static void perf_swevent_cancel_hrtimer(struct perf_event *event)
  {
         struct hw_perf_event *hwc = &event->hw;
  
-       if (hwc->sample_period) {
+       if (is_sampling_event(event)) {
                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
  
@@ -5184,8 +5307,61 @@ static void free_pmu_context(struct pmu *pmu)
  out:
         mutex_unlock(&pmus_lock);
  }
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+       struct pmu *pmu = dev_get_drvdata(dev);
+
+       return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+       __ATTR_RO(type),
+       __ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+       .name           = "event_source",
+       .dev_attrs      = pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+       kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+       int ret = -ENOMEM;
+
+       pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+       if (!pmu->dev)
+               goto out;
+
+       device_initialize(pmu->dev);
+       ret = dev_set_name(pmu->dev, "%s", pmu->name);
+       if (ret)
+               goto free_dev;
+
+       dev_set_drvdata(pmu->dev, pmu);
+       pmu->dev->bus = &pmu_bus;
+       pmu->dev->release = pmu_dev_release;
+       ret = device_add(pmu->dev);
+       if (ret)
+               goto free_dev;
+
+out:
+       return ret;
+
+free_dev:
+       put_device(pmu->dev);
+       goto out;
+}
  
-int perf_pmu_register(struct pmu *pmu)
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
  {
         int cpu, ret;
  
@@ -5195,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu)
         if (!pmu->pmu_disable_count)
                 goto unlock;
  
+       pmu->type = -1;
+       if (!name)
+               goto skip_type;
+       pmu->name = name;
+
+       if (type < 0) {
+               int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+               if (!err)
+                       goto free_pdc;
+
+               err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+               if (err) {
+                       ret = err;
+                       goto free_pdc;
+               }
+       }
+       pmu->type = type;
+
+       if (pmu_bus_running) {
+               ret = pmu_dev_alloc(pmu);
+               if (ret)
+                       goto free_idr;
+       }
+
+skip_type:
         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
         if (pmu->pmu_cpu_context)
                 goto got_cpu_context;
  
         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
         if (!pmu->pmu_cpu_context)
-               goto free_pdc;
+               goto free_dev;
  
         for_each_possible_cpu(cpu) {
                 struct perf_cpu_context *cpuctx;
@@ -5245,6 +5446,14 @@ unlock:
  
         return ret;
  
+free_dev:
+       device_del(pmu->dev);
+       put_device(pmu->dev);
+
+free_idr:
+       if (pmu->type >= PERF_TYPE_MAX)
+               idr_remove(&pmu_idr, pmu->type);
+
  free_pdc:
         free_percpu(pmu->pmu_disable_count);
         goto unlock;
@@ -5264,6 +5473,10 @@ void perf_pmu_unregister(struct pmu *pmu)
         synchronize_rcu();
  
         free_percpu(pmu->pmu_disable_count);
+       if (pmu->type >= PERF_TYPE_MAX)
+               idr_remove(&pmu_idr, pmu->type);
+       device_del(pmu->dev);
+       put_device(pmu->dev);
         free_pmu_context(pmu);
  }
  
@@ -5273,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event)
         int idx;
  
         idx = srcu_read_lock(&pmus_srcu);
+
+       rcu_read_lock();
+       pmu = idr_find(&pmu_idr, event->attr.type);
+       rcu_read_unlock();
+       if (pmu)
+               goto unlock;
+
         list_for_each_entry_rcu(pmu, &pmus, entry) {
                 int ret = pmu->event_init(event);
                 if (!ret)
@@ -5737,6 +5957,12 @@ SYSCALL_DEFINE5(perf_event_open,
         list_add_tail(&event->owner_entry, &current->perf_event_list);
         mutex_unlock(&current->perf_event_mutex);
  
+       /*
+        * Precalculate sample_data sizes
+        */
+       perf_event__header_size(event);
+       perf_event__id_header_size(event);
+
         /*
          * Drop the reference on the group_event after placing the
          * new event on the sibling_list. This ensures destruction
@@ -6089,6 +6315,12 @@ inherit_event(struct perf_event *parent_event,
         child_event->ctx = child_ctx;
         child_event->overflow_handler = parent_event->overflow_handler;
  
+       /*
+        * Precalculate sample_data sizes
+        */
+       perf_event__header_size(child_event);
+       perf_event__id_header_size(child_event);
+
         /*
          * Link it up in the child's context:
          */
@@ -6320,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
         mutex_unlock(&swhash->hlist_mutex);
  }
  
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
  static void perf_pmu_rotate_stop(struct pmu *pmu)
  {
         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6374,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu)
  static inline void perf_event_exit_cpu(int cpu) { }
  #endif
  
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               perf_event_exit_cpu(cpu);
+
+       return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+       .notifier_call = perf_reboot,
+       .priority = INT_MIN,
+};
+
  static int __cpuinit
  perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
  {
@@ -6402,14 +6654,45 @@ void __init perf_event_init(void)
  {
         int ret;
  
+       idr_init(&pmu_idr);
+
         perf_event_init_all_cpus();
         init_srcu_struct(&pmus_srcu);
-       perf_pmu_register(&perf_swevent);
-       perf_pmu_register(&perf_cpu_clock);
-       perf_pmu_register(&perf_task_clock);
+       perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+       perf_pmu_register(&perf_cpu_clock, NULL, -1);
+       perf_pmu_register(&perf_task_clock, NULL, -1);
         perf_tp_register();
         perf_cpu_notifier(perf_cpu_notify);
+       register_reboot_notifier(&perf_reboot_notifier);
  
         ret = init_hw_breakpoint();
         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
  }
+
+static int __init perf_event_sysfs_init(void)
+{
+       struct pmu *pmu;
+       int ret;
+
+       mutex_lock(&pmus_lock);
+
+       ret = bus_register(&pmu_bus);
+       if (ret)
+               goto unlock;
+
+       list_for_each_entry(pmu, &pmus, entry) {
+               if (!pmu->name || pmu->type < 0)
+                       continue;
+
+               ret = pmu_dev_alloc(pmu);
+               WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+       }
+       pmu_bus_running = 1;
+       ret = 0;
+
+unlock:
+       mutex_unlock(&pmus_lock);
+
+       return ret;
+}
+device_initcall(perf_event_sysfs_init);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c

index 9ca4973f736d53b04bf4eea9373ce635cf7098c3..93bd2eb2bc53efe76dd120501b0cbda115b71bfd 100644 (file)
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer);
  
  static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
  
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+
+#define lock_timer(tid, flags)                                            \
+({     struct k_itimer *__timr;                                           \
+       __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags));  \
+       __timr;                                                            \
+})
  
  static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
  {
@@ -619,7 +625,7 @@ out:
   * the find to the timer lock.  To avoid a dead lock, the timer id MUST
   * be release with out holding the timer lock.
   */
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
  {
         struct k_itimer *timr;
         /*
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c

index ecf770509d0d1bb9dce2381716d6e7583ee6c492..031d5e3a61973464eec7e0e72791e423bfb3396f 100644 (file)
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
  #include <linux/mm.h>
  #include <linux/slab.h>
  #include <linux/suspend.h>
+#include <trace/events/power.h>
  
  #include "power.h"
  
@@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state)
         if (!suspend_ops)
                 return -ENOSYS;
  
+       trace_machine_suspend(state);
         if (suspend_ops->begin) {
                 error = suspend_ops->begin(state);
                 if (error)
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state)
   Close:
         if (suspend_ops->end)
                 suspend_ops->end();
+       trace_machine_suspend(PWR_EVENT_EXIT);
         return error;
  
   Recover_platform:
diff --git a/kernel/printk.c b/kernel/printk.c

index a23315dc4498844c113cecc9792eabd063e1d87b..ab3ffc5b3b64613507134573dbb94af132c4adff 100644 (file)
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
  
  void printk_tick(void)
  {
-       if (__get_cpu_var(printk_pending)) {
-               __get_cpu_var(printk_pending) = 0;
+       if (__this_cpu_read(printk_pending)) {
+               __this_cpu_write(printk_pending, 0);
                 wake_up_interruptible(&log_wait);
         }
  }
  
  int printk_needs_cpu(int cpu)
  {
-       if (unlikely(cpu_is_offline(cpu)))
+       if (cpu_is_offline(cpu))
                 printk_tick();
-       return per_cpu(printk_pending, cpu);
+       return __this_cpu_read(printk_pending);
  }
  
  void wake_up_klogd(void)
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c

index d806735342acb10bc3e3ae787e62ade34f1d5955..0344937247495d69b3ef5255ace0d94b2250fac2 100644 (file)
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -36,31 +36,16 @@
  #include <linux/time.h>
  #include <linux/cpu.h>
  
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-       struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
-       struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
-       struct rcu_head **curtail;      /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
-       .donetail       = &rcu_sched_ctrlblk.rcucblist,
-       .curtail        = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-       .donetail       = &rcu_bh_ctrlblk.rcucblist,
-       .curtail        = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
  
  /* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
  static void __call_rcu(struct rcu_head *head,
                        void (*func)(struct rcu_head *rcu),
                        struct rcu_ctrlblk *rcp);
@@ -123,7 +108,7 @@ void rcu_sched_qs(int cpu)
  {
         if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
             rcu_qsctr_help(&rcu_bh_ctrlblk))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
  }
  
  /*
@@ -132,7 +117,7 @@ void rcu_sched_qs(int cpu)
  void rcu_bh_qs(int cpu)
  {
         if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
  }
  
  /*
@@ -152,13 +137,14 @@ void rcu_check_callbacks(int cpu, int user)
  }
  
  /*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
+ * whose grace period has elapsed.
   */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  {
         struct rcu_head *next, *list;
         unsigned long flags;
+       RCU_TRACE(int cb_count = 0);
  
         /* If no RCU callbacks ready to invoke, just return. */
         if (&rcp->rcucblist == rcp->donetail)
@@ -180,19 +166,58 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
                 next = list->next;
                 prefetch(next);
                 debug_rcu_head_unqueue(list);
+               local_bh_disable();
                 list->func(list);
+               local_bh_enable();
                 list = next;
+               RCU_TRACE(cb_count++);
         }
+       RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
  }
  
  /*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed.  It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
   */
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
  {
-       __rcu_process_callbacks(&rcu_sched_ctrlblk);
-       __rcu_process_callbacks(&rcu_bh_ctrlblk);
-       rcu_preempt_process_callbacks();
+       unsigned long work;
+       unsigned long morework;
+       unsigned long flags;
+
+       for (;;) {
+               wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+               morework = rcu_boost();
+               local_irq_save(flags);
+               work = have_rcu_kthread_work;
+               have_rcu_kthread_work = morework;
+               local_irq_restore(flags);
+               if (work) {
+                       rcu_process_callbacks(&rcu_sched_ctrlblk);
+                       rcu_process_callbacks(&rcu_bh_ctrlblk);
+                       rcu_preempt_process_callbacks();
+               }
+               schedule_timeout_interruptible(1); /* Leave CPU for others. */
+       }
+
+       return 0;  /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+       unsigned long flags;
+
+       local_irq_save(flags);
+       have_rcu_kthread_work = 1;
+       wake_up(&rcu_kthread_wq);
+       local_irq_restore(flags);
  }
  
  /*
@@ -230,6 +255,7 @@ static void __call_rcu(struct rcu_head *head,
         local_irq_save(flags);
         *rcp->curtail = head;
         rcp->curtail = &head->next;
+       RCU_TRACE(rcp->qlen++);
         local_irq_restore(flags);
  }
  
@@ -282,7 +308,16 @@ void rcu_barrier_sched(void)
  }
  EXPORT_SYMBOL_GPL(rcu_barrier_sched);
  
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
  {
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+       struct sched_param sp;
+
+       rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+       sp.sched_priority = RCU_BOOST_PRIO;
+       sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+       return 0;
  }
+early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h

index 6ceca4f745ffa1f4535c69467ea59704e2ddbe97..015abaea962ad4087130014506b72dc19b33b43d 100644 (file)
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -22,6 +22,40 @@
   * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
   */
  
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt)        stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+       struct rcu_head *rcucblist;     /* List of pending callbacks (CBs). */
+       struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
+       struct rcu_head **curtail;      /* ->next pointer of last CB. */
+       RCU_TRACE(long qlen);           /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+       .donetail       = &rcu_sched_ctrlblk.rcucblist,
+       .curtail        = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+       .donetail       = &rcu_bh_ctrlblk.rcucblist,
+       .curtail        = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
  #ifdef CONFIG_TINY_PREEMPT_RCU
  
  #include <linux/delay.h>
@@ -46,17 +80,45 @@ struct rcu_preempt_ctrlblk {
         struct list_head *gp_tasks;
                                 /* Pointer to the first task blocking the */
                                 /*  current grace period, or NULL if there */
-                               /*  is not such task. */
+                               /*  is no such task. */
         struct list_head *exp_tasks;
                                 /* Pointer to first task blocking the */
                                 /*  current expedited grace period, or NULL */
                                 /*  if there is no such task.  If there */
                                 /*  is no current expedited grace period, */
                                 /*  then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+       struct list_head *boost_tasks;
+                               /* Pointer to first task that needs to be */
+                               /*  priority-boosted, or NULL if no priority */
+                               /*  boosting is needed.  If there is no */
+                               /*  current or expedited grace period, there */
+                               /*  can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
         u8 gpnum;               /* Current grace period. */
         u8 gpcpu;               /* Last grace period blocked by the CPU. */
         u8 completed;           /* Last grace period completed. */
                                 /*  If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+       s8 boosted_this_gp;     /* Has boosting already happened? */
+       unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+       unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+       unsigned long n_tasks_boosted;
+       unsigned long n_exp_boosts;
+       unsigned long n_normal_boosts;
+       unsigned long n_normal_balk_blkd_tasks;
+       unsigned long n_normal_balk_gp_tasks;
+       unsigned long n_normal_balk_boost_tasks;
+       unsigned long n_normal_balk_boosted;
+       unsigned long n_normal_balk_notyet;
+       unsigned long n_normal_balk_nos;
+       unsigned long n_exp_balk_blkd_tasks;
+       unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
  };
  
  static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
@@ -121,6 +183,210 @@ static int rcu_preempt_gp_in_progress(void)
         return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
  }
  
+/*
+ * Given a task queued on the ->blkd_tasks list, return a pointer to the
+ * list entry that follows the task's own entry, or NULL if the task's
+ * entry is the last one (that is, if its successor is the list header).
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+       struct list_head *next = t->rcu_node_entry.next;
+
+       if (next != &rcu_preempt_ctrlblk.blkd_tasks)
+               return next;
+       return NULL;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistics for TINY_PREEMPT_RCU.
+ *
+ * The first line prints the callback queue length, the grace-period
+ * count, the ->gpnum/->gpcpu/->completed counters, and three flag
+ * characters: 'T' if ->blkd_tasks is non-empty, 'N' if ->gp_tasks is
+ * non-NULL and 'E' if ->exp_tasks is non-NULL ('.' otherwise).
+ *
+ * With CONFIG_RCU_BOOST, further lines print 'B' if ->boost_tasks is
+ * non-NULL, the ->boosted_this_gp state ("exp" for -1, "no" for 0,
+ * "begun" for 1, "done" for 2, otherwise the raw value between
+ * question marks), the boost counters, the low-order 16 bits of
+ * jiffies and of ->boost_time in hex, and the "balk" counters that
+ * record why boosting was not initiated.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+       seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+                  rcu_preempt_ctrlblk.rcb.qlen,
+                  rcu_preempt_ctrlblk.n_grace_periods,
+                  rcu_preempt_ctrlblk.gpnum,
+                  rcu_preempt_ctrlblk.gpcpu,
+                  rcu_preempt_ctrlblk.completed,
+                  "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+                  "N."[!rcu_preempt_ctrlblk.gp_tasks],
+                  "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+       seq_printf(m, "             ttb=%c btg=",
+                  "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+       switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+       case -1:
+               seq_puts(m, "exp");
+               break;
+       case 0:
+               seq_puts(m, "no");
+               break;
+       case 1:
+               seq_puts(m, "begun");
+               break;
+       case 2:
+               seq_puts(m, "done");
+               break;
+       default:
+               seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+       }
+       seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+                  rcu_preempt_ctrlblk.n_tasks_boosted,
+                  rcu_preempt_ctrlblk.n_exp_boosts,
+                  rcu_preempt_ctrlblk.n_normal_boosts,
+                  (int)(jiffies & 0xffff),
+                  (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+       seq_printf(m, "             %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+                  "normal balk",
+                  rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+                  rcu_preempt_ctrlblk.n_normal_balk_boosted,
+                  rcu_preempt_ctrlblk.n_normal_balk_notyet,
+                  rcu_preempt_ctrlblk.n_normal_balk_nos);
+       seq_printf(m, "             exp balk: bt=%lu nos=%lu\n",
+                  rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+                  rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks.
+ *
+ * An on-stack rt_mutex is set up as if already held by the target task,
+ * and this function then blocks on it until that task releases it from
+ * rcu_read_unlock_special() (presumably boosting the task through
+ * rt_mutex priority inheritance in the meantime).  Note that this
+ * function does not itself advance ->boost_tasks: that is done in
+ * rcu_read_unlock_special() when the task removes itself from the
+ * ->blkd_tasks list.
+ *
+ * Returns zero if there was nothing to boost or if no task remains to
+ * be boosted afterwards, and non-zero if ->boost_tasks is still non-NULL.
+ */
+static int rcu_boost(void)
+{
+       unsigned long flags;
+       struct rt_mutex mtx;
+       struct task_struct *t;
+
+       if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+               return 0;  /* Nothing to boost. */
+       raw_local_irq_save(flags);
+
+       /*
+        * Recheck with irqs disabled: the check above ran with irqs
+        * enabled, so the task might have left its RCU read-side
+        * critical section (setting ->boost_tasks to NULL) in the
+        * meantime, in which case container_of() below would operate
+        * on a NULL pointer.
+        */
+       if (rcu_preempt_ctrlblk.boost_tasks == NULL) {
+               raw_local_irq_restore(flags);
+               return 0;
+       }
+       rcu_preempt_ctrlblk.boosted_this_gp++;  /* Boosting has begun. */
+       t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+                        rcu_node_entry);
+       rt_mutex_init_proxy_locked(&mtx, t);
+       t->rcu_boost_mutex = &mtx;
+       t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+       raw_local_irq_restore(flags);
+       rt_mutex_lock(&mtx);  /* Returns once the task has released mtx. */
+       RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+       rcu_preempt_ctrlblk.boosted_this_gp++;  /* Boosting is done. */
+       rt_mutex_unlock(&mtx);
+       return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Decide whether the time has come to priority-boost the RCU readers
+ * that are blocking the current grace period and, if it has, hand the
+ * job to the rcu_kthread_task.  Boosting is started only if there are
+ * tasks blocking the grace period, no boost (normal or expedited) is
+ * already under way or recorded for this grace period, and ->boost_time
+ * has been reached.
+ *
+ * Returns 0 if no blocked readers are blocking the current grace period
+ * and 1 otherwise, regardless of whether or not boosting was initiated.
+ */
+static int rcu_initiate_boost(void)
+{
+       if (!rcu_preempt_blocked_readers_cgp()) {
+               RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+               return 0;
+       }
+
+       /* Any one of these conditions means that we do not boost now. */
+       if (rcu_preempt_ctrlblk.gp_tasks == NULL ||
+           rcu_preempt_ctrlblk.boost_tasks != NULL ||
+           rcu_preempt_ctrlblk.boosted_this_gp != 0 ||
+           !ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+               RCU_TRACE(rcu_initiate_boost_trace());
+               return 1;
+       }
+
+       /* Start boosting at the first task blocking the grace period. */
+       rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+       invoke_rcu_kthread();
+       RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+       return 1;
+}
+
+/*
+ * Start priority boosting on behalf of an expedited grace period.
+ * If any task is on the ->blkd_tasks list, boosting begins with the
+ * first such task, ->boosted_this_gp is set to -1 to mark the boost
+ * as expedited, and the RCU kthread is asked to do the work.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+               /* Nobody to boost, so just account for the balk. */
+               RCU_TRACE(rcu_initiate_exp_boost_trace());
+       } else {
+               rcu_preempt_ctrlblk.boost_tasks =
+                       rcu_preempt_ctrlblk.blkd_tasks.next;
+               rcu_preempt_ctrlblk.boosted_this_gp = -1;
+               invoke_rcu_kthread();
+               RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+       }
+       raw_local_irq_restore(flags);
+}
+
+/*
+ * Delay, in jiffies, added to the grace-period start time to obtain
+ * ->boost_time: CONFIG_RCU_BOOST_DELAY scaled by HZ / 1000, rounded up.
+ * There must be no trailing semicolon here, otherwise the macro cannot
+ * be used as part of a larger expression.
+ */
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Grace-period-start bookkeeping for priority boosting: forget any
+ * boosting recorded as begun or done (positive ->boosted_this_gp),
+ * leaving other values untouched, and compute the time at which
+ * boosting may start for the new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+       if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+               rcu_preempt_ctrlblk.boosted_this_gp = 0;
+       rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+       return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
  /*
   * Record a preemptible-RCU quiescent state for the specified CPU.  Note
   * that this just means that the task currently running on the CPU is
@@ -148,11 +414,14 @@ static void rcu_preempt_cpu_qs(void)
         rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
         current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
  
+       /* If there is no GP then there is nothing more to do.  */
+       if (!rcu_preempt_gp_in_progress())
+               return;
         /*
-        * If there is no GP, or if blocked readers are still blocking GP,
-        * then there is nothing more to do.
+        * Check up on boosting.  If there are no readers blocking the
+        * current grace period, leave.
          */
-       if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+       if (rcu_initiate_boost())
                 return;
  
         /* Advance callbacks. */
@@ -164,9 +433,9 @@ static void rcu_preempt_cpu_qs(void)
         if (!rcu_preempt_blocked_readers_any())
                 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
  
-       /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+       /* If there are done callbacks, cause them to be invoked. */
         if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
  }
  
  /*
@@ -178,12 +447,16 @@ static void rcu_preempt_start_gp(void)
  
                 /* Official start of GP. */
                 rcu_preempt_ctrlblk.gpnum++;
+               RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
  
                 /* Any blocked RCU readers block new GP. */
                 if (rcu_preempt_blocked_readers_any())
                         rcu_preempt_ctrlblk.gp_tasks =
                                 rcu_preempt_ctrlblk.blkd_tasks.next;
  
+               /* Set up for RCU priority boosting. */
+               rcu_preempt_boost_start_gp();
+
                 /* If there is no running reader, CPU is done with GP. */
                 if (!rcu_preempt_running_reader())
                         rcu_preempt_cpu_qs();
@@ -304,14 +577,16 @@ static void rcu_read_unlock_special(struct task_struct *t)
                  */
                 empty = !rcu_preempt_blocked_readers_cgp();
                 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
-               np = t->rcu_node_entry.next;
-               if (np == &rcu_preempt_ctrlblk.blkd_tasks)
-                       np = NULL;
+               np = rcu_next_node_entry(t);
                 list_del(&t->rcu_node_entry);
                 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
                         rcu_preempt_ctrlblk.gp_tasks = np;
                 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
                         rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+               if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+                       rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
                 INIT_LIST_HEAD(&t->rcu_node_entry);
  
                 /*
@@ -331,6 +606,14 @@ static void rcu_read_unlock_special(struct task_struct *t)
                 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
                         rcu_report_exp_done();
         }
+#ifdef CONFIG_RCU_BOOST
+       /* Unboost self if was boosted. */
+       if (special & RCU_READ_UNLOCK_BOOSTED) {
+               t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+               rt_mutex_unlock(t->rcu_boost_mutex);
+               t->rcu_boost_mutex = NULL;
+       }
+#endif /* #ifdef CONFIG_RCU_BOOST */
         local_irq_restore(flags);
  }
  
@@ -374,7 +657,7 @@ static void rcu_preempt_check_callbacks(void)
                 rcu_preempt_cpu_qs();
         if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
             rcu_preempt_ctrlblk.rcb.donetail)
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
         if (rcu_preempt_gp_in_progress() &&
             rcu_cpu_blocking_cur_gp() &&
             rcu_preempt_running_reader())
@@ -383,7 +666,7 @@ static void rcu_preempt_check_callbacks(void)
  
  /*
   * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
   * handle that case.  Of course, it is invoked for all flavors of
   * RCU, but RCU callbacks can appear only on one of the lists, and
   * neither ->nexttail nor ->donetail can possibly be NULL, so there
@@ -400,7 +683,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
   */
  static void rcu_preempt_process_callbacks(void)
  {
-       __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+       rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
  }
  
  /*
@@ -417,6 +700,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
         local_irq_save(flags);
         *rcu_preempt_ctrlblk.nexttail = head;
         rcu_preempt_ctrlblk.nexttail = &head->next;
+       RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
         rcu_preempt_start_gp();  /* checks to see if GP needed. */
         local_irq_restore(flags);
  }
@@ -532,6 +816,7 @@ void synchronize_rcu_expedited(void)
  
         /* Wait for tail of ->blkd_tasks list to drain. */
         if (rcu_preempted_readers_exp())
+               rcu_initiate_expedited_boost();
                 wait_event(sync_rcu_preempt_exp_wq,
                            !rcu_preempted_readers_exp());
  
@@ -572,6 +857,27 @@ void exit_rcu(void)
  
  #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
  
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+       return 0;
+}
+
  /*
   * Because preemptible RCU does not exist, it never has any callbacks
   * to check.
@@ -599,17 +905,116 @@ static void rcu_preempt_process_callbacks(void)
  #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
-
  #include <linux/kernel_stat.h>
  
  /*
   * During boot, we forgive RCU lockdep issues.  After this function is
   * invoked, we start taking RCU lockdep issues seriously.
   */
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
  {
         WARN_ON(nr_context_switches() > 0);
         rcu_scheduler_active = 1;
  }
  
  #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+/*
+ * Record why rcu_initiate_boost() declined to start boosting.  The tests
+ * are made in the same order as in that function's condition, and the
+ * counter for the first one that fails is incremented: no tasks blocking
+ * the grace period, a boost already in progress, boosting already
+ * recorded for this grace period, or ->boost_time not yet reached.
+ * The final "nos" counter catches the case where none of them fails.
+ */
+static void rcu_initiate_boost_trace(void)
+{
+       if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+               rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+       else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+               rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+       else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+               rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+       else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+               rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+       else
+               rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+/*
+ * Record why rcu_initiate_expedited_boost() declined to start boosting:
+ * either the ->blkd_tasks list was empty, or (the "nos" counter) the
+ * list was non-empty when rechecked here.
+ */
+static void rcu_initiate_exp_boost_trace(void)
+{
+       if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+               rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+       else
+               rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Subtract n from the specified control block's ->qlen callback count,
+ * with interrupts disabled around the update.
+ */
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       rcp->qlen -= n;
+       raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+       show_tiny_preempt_stats(m);
+       seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+       seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+       return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+       .owner = THIS_MODULE,
+       .open = show_tiny_stats_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+       struct dentry *retval;
+
+       rcudir = debugfs_create_dir("rcu", NULL);
+       if (!rcudir)
+               goto free_out;
+       retval = debugfs_create_file("rcudata", 0444, rcudir,
+                                    NULL, &show_tiny_stats_fops);
+       if (!retval)
+               goto free_out;
+       return 0;
+free_out:
+       debugfs_remove_recursive(rcudir);
+       return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+       debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c

index 9d8e8fb2515f4e4801c214841a7f8c95b8b45ffe..89613f97ff264e35cac497419bd0a4dac798ce5f 100644 (file)
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,6 +47,7 @@
  #include <linux/srcu.h>
  #include <linux/slab.h>
  #include <asm/byteorder.h>
+#include <linux/sched.h>
  
  MODULE_LICENSE("GPL");
  MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -64,6 +65,9 @@ static int irqreader = 1;     /* RCU readers from irq (timers). */
  static int fqs_duration = 0;   /* Duration of bursts (us), 0 to disable. */
  static int fqs_holdoff = 0;    /* Hold time within burst (us). */
  static int fqs_stutter = 3;    /* Wait time between bursts (s). */
+static int test_boost = 1;     /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
  static char *torture_type = "rcu"; /* What RCU implementation to torture. */
  
  module_param(nreaders, int, 0444);
@@ -88,6 +92,12 @@ module_param(fqs_holdoff, int, 0444);
  MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
  module_param(fqs_stutter, int, 0444);
  MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
  module_param(torture_type, charp, 0444);
  MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
  
@@ -109,6 +119,7 @@ static struct task_struct *stats_task;
  static struct task_struct *shuffler_task;
  static struct task_struct *stutter_task;
  static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
  
  #define RCU_TORTURE_PIPE_LEN 10
  
@@ -134,6 +145,12 @@ static atomic_t n_rcu_torture_alloc_fail;
  static atomic_t n_rcu_torture_free;
  static atomic_t n_rcu_torture_mberror;
  static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
  static long n_rcu_torture_timers;
  static struct list_head rcu_torture_removed;
  static cpumask_var_t shuffle_tmp_mask;
@@ -147,6 +164,16 @@ static int stutter_pause_test;
  #endif
  int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
  
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime;  /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex);             /* protect setting boost_starttime */
+                                       /*  and boost task create/destroy. */
+
  /* Mediate rmmod and system shutdown.  Concurrent rmmod & shutdown illegal! */
  
  #define FULLSTOP_DONTSTOP 0    /* Normal operation. */
@@ -277,6 +304,7 @@ struct rcu_torture_ops {
         void (*fqs)(void);
         int (*stats)(char *page);
         int irq_capable;
+       int can_boost;
         char *name;
  };
  
@@ -366,6 +394,7 @@ static struct rcu_torture_ops rcu_ops = {
         .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
         .name           = "rcu"
  };
  
@@ -408,6 +437,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
         .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
         .name           = "rcu_sync"
  };
  
@@ -424,6 +454,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
         .fqs            = rcu_force_quiescent_state,
         .stats          = NULL,
         .irq_capable    = 1,
+       .can_boost      = rcu_can_boost(),
         .name           = "rcu_expedited"
  };
  
@@ -683,6 +714,110 @@ static struct rcu_torture_ops sched_expedited_ops = {
         .name           = "sched_expedited"
  };
  
+/*
+ * RCU torture priority-boost testing.  Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked.  If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+       struct rcu_head rcu;
+       int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+       struct rcu_boost_inflight *rbip =
+               container_of(head, struct rcu_boost_inflight, rcu);
+
+       smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+       rbip->inflight = 0;
+}
+
+/*
+ * Body of each rcu_torture_boost kthread.  After attempting to switch
+ * itself to SCHED_FIFO priority 1, it repeatedly waits for the shared
+ * boost_starttime to arrive, then for test_boost_duration seconds keeps
+ * one on-stack RCU callback in flight, counting a boost failure whenever
+ * too long has passed since the previous callback was posted.  It then
+ * tries to schedule the next test interval.  The argument is unused and
+ * the return value is always zero.
+ */
+static int rcu_torture_boost(void *arg)
+{
+       unsigned long call_rcu_time;
+       unsigned long endtime;
+       unsigned long oldstarttime;
+       struct rcu_boost_inflight rbi = { .inflight = 0 };
+       struct sched_param sp;
+
+       VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+       /* Set real-time priority. */
+       sp.sched_priority = 1;
+       if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+               VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+               n_rcu_torture_boost_rterror++;
+       }
+
+       /* Each pass through the following loop does one boost-test cycle. */
+       do {
+               /*
+                * Wait for the next test interval.  The unsigned
+                * subtraction exceeds ULONG_MAX / 2 for as long as
+                * jiffies is still before oldstarttime, which keeps
+                * the comparison correct across jiffies wraparound.
+                */
+               oldstarttime = boost_starttime;
+               while (jiffies - oldstarttime > ULONG_MAX / 2) {
+                       schedule_timeout_uninterruptible(1);
+                       rcu_stutter_wait("rcu_torture_boost");
+                       if (kthread_should_stop() ||
+                           fullstop != FULLSTOP_DONTSTOP)
+                               goto checkwait;
+               }
+
+               /*
+                * Do one boost-test interval, which runs until endtime
+                * (same wraparound-safe comparison as above).
+                */
+               endtime = oldstarttime + test_boost_duration * HZ;
+               call_rcu_time = jiffies;
+               while (jiffies - endtime > ULONG_MAX / 2) {
+                       /* If we don't have a callback in flight, post one. */
+                       if (!rbi.inflight) {
+                               smp_mb(); /* RCU core before ->inflight = 1. */
+                               rbi.inflight = 1;
+                               call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+                               /* Too long since the previous call_rcu()? */
+                               if (jiffies - call_rcu_time >
+                                        test_boost_duration * HZ - HZ / 2) {
+                                       VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+                                       n_rcu_torture_boost_failure++;
+                               }
+                               call_rcu_time = jiffies;
+                       }
+                       cond_resched();
+                       rcu_stutter_wait("rcu_torture_boost");
+                       if (kthread_should_stop() ||
+                           fullstop != FULLSTOP_DONTSTOP)
+                               goto checkwait;
+               }
+
+               /*
+                * Set the start time of the next test interval.
+                * Yes, this is vulnerable to long delays, but such
+                * delays simply cause a false negative for the next
+                * interval.  Besides, we are running at RT priority,
+                * so delays should be relatively rare.
+                *
+                * The loop ends either when this thread updates
+                * boost_starttime under boost_mutex or when it sees
+                * that boost_starttime has already been changed.
+                */
+               while (oldstarttime == boost_starttime) {
+                       if (mutex_trylock(&boost_mutex)) {
+                               boost_starttime = jiffies +
+                                                 test_boost_interval * HZ;
+                               n_rcu_torture_boosts++;
+                               mutex_unlock(&boost_mutex);
+                               break;
+                       }
+                       schedule_timeout_uninterruptible(1);
+               }
+
+               /* Go do the stutter. */
+checkwait:     rcu_stutter_wait("rcu_torture_boost");
+       } while (!kthread_should_stop() && fullstop  == FULLSTOP_DONTSTOP);
+
+       /*
+        * Clean up and exit.  Because rbi lives on this stack frame, do
+        * not return until we have been asked to stop and the callback,
+        * if one is still in flight, has cleared ->inflight.
+        */
+       VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+       rcutorture_shutdown_absorb("rcu_torture_boost");
+       while (!kthread_should_stop() || rbi.inflight)
+               schedule_timeout_uninterruptible(1);
+       smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+       return 0;
+}
+
  /*
   * RCU torture force-quiescent-state kthread.  Repeatedly induces
   * bursts of calls to force_quiescent_state(), increasing the probability
@@ -933,7 +1068,8 @@ rcu_torture_printk(char *page)
         cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
         cnt += sprintf(&page[cnt],
                        "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
-                      "rtmbe: %d nt: %ld",
+                      "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+                      "rtbf: %ld rtb: %ld nt: %ld",
                        rcu_torture_current,
                        rcu_torture_current_version,
                        list_empty(&rcu_torture_freelist),
@@ -941,8 +1077,19 @@ rcu_torture_printk(char *page)
                        atomic_read(&n_rcu_torture_alloc_fail),
                        atomic_read(&n_rcu_torture_free),
                        atomic_read(&n_rcu_torture_mberror),
+                      n_rcu_torture_boost_ktrerror,
+                      n_rcu_torture_boost_rterror,
+                      n_rcu_torture_boost_allocerror,
+                      n_rcu_torture_boost_afferror,
+                      n_rcu_torture_boost_failure,
+                      n_rcu_torture_boosts,
                        n_rcu_torture_timers);
-       if (atomic_read(&n_rcu_torture_mberror) != 0)
+       if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+           n_rcu_torture_boost_ktrerror != 0 ||
+           n_rcu_torture_boost_rterror != 0 ||
+           n_rcu_torture_boost_allocerror != 0 ||
+           n_rcu_torture_boost_afferror != 0 ||
+           n_rcu_torture_boost_failure != 0)
                 cnt += sprintf(&page[cnt], " !!!");
         cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
         if (i > 1) {
@@ -1094,22 +1241,91 @@ rcu_torture_stutter(void *arg)
  }
  
  static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
  {
         printk(KERN_ALERT "%s" TORTURE_FLAG
                 "--- %s: nreaders=%d nfakewriters=%d "
                 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
                 "shuffle_interval=%d stutter=%d irqreader=%d "
-               "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+               "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+               "test_boost=%d/%d test_boost_interval=%d "
+               "test_boost_duration=%d\n",
                 torture_type, tag, nrealreaders, nfakewriters,
                 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
-               stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+               stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+               test_boost, cur_ops->can_boost,
+               test_boost_interval, test_boost_duration);
  }
  
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
         .notifier_call = rcutorture_shutdown_notify,
  };
  
+/*
+ * Stop the rcu_torture_boost kthread recorded for the specified CPU,
+ * if there is one, clearing its boost_tasks[] slot under boost_mutex.
+ */
+static void rcutorture_booster_cleanup(int cpu)
+{
+       struct task_struct *t;
+
+       if (boost_tasks[cpu] == NULL)
+               return;
+       mutex_lock(&boost_mutex);
+       VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+       t = boost_tasks[cpu];
+       boost_tasks[cpu] = NULL;
+       mutex_unlock(&boost_mutex);
+
+       /*
+        * This must be outside of the mutex, otherwise deadlock!
+        * rcu_torture_boost() may be looping on mutex_trylock() of
+        * boost_mutex, and it could not get out of that loop to exit
+        * while we held the mutex here.
+        */
+       kthread_stop(t);
+}
+
+/*
+ * Create, bind to the specified CPU, and wake up an rcu_torture_boost
+ * kthread, unless boost_tasks[] already records one for that CPU.
+ * Returns 0 on success (or if the kthread already exists), otherwise
+ * the error code extracted from kthread_create()'s return value, in
+ * which case the failure is counted and the slot is reset to NULL.
+ */
+static int rcutorture_booster_init(int cpu)
+{
+       int retval;
+
+       if (boost_tasks[cpu] != NULL)
+               return 0;  /* Already created, nothing more to do. */
+
+       /* Don't allow time recalculation while creating a new task. */
+       mutex_lock(&boost_mutex);
+       VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+       boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+                                         "rcu_torture_boost");
+       if (IS_ERR(boost_tasks[cpu])) {
+               retval = PTR_ERR(boost_tasks[cpu]);
+               VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+               n_rcu_torture_boost_ktrerror++;
+               boost_tasks[cpu] = NULL;
+               mutex_unlock(&boost_mutex);
+               return retval;
+       }
+       kthread_bind(boost_tasks[cpu], cpu);
+       wake_up_process(boost_tasks[cpu]);
+       mutex_unlock(&boost_mutex);
+       return 0;
+}
+
+/*
+ * CPU-hotplug notifier: create the boost kthread for a CPU on
+ * CPU_ONLINE and CPU_DOWN_FAILED, and stop it on CPU_DOWN_PREPARE.
+ * All other actions are ignored.  Creation failures are deliberately
+ * not reported back (the return value is discarded), so this always
+ * returns NOTIFY_OK.
+ */
+static int rcutorture_cpu_notify(struct notifier_block *self,
+                                unsigned long action, void *hcpu)
+{
+       long cpu = (long)hcpu;
+
+       switch (action) {
+       case CPU_ONLINE:
+       case CPU_DOWN_FAILED:
+               (void)rcutorture_booster_init(cpu);
+               break;
+       case CPU_DOWN_PREPARE:
+               rcutorture_booster_cleanup(cpu);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+       .notifier_call = rcutorture_cpu_notify,
+};
+
  static void
  rcu_torture_cleanup(void)
  {
@@ -1127,7 +1343,7 @@ rcu_torture_cleanup(void)
         }
         fullstop = FULLSTOP_RMMOD;
         mutex_unlock(&fullstop_mutex);
-       unregister_reboot_notifier(&rcutorture_nb);
+       unregister_reboot_notifier(&rcutorture_shutdown_nb);
         if (stutter_task) {
                 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
                 kthread_stop(stutter_task);
@@ -1184,6 +1400,12 @@ rcu_torture_cleanup(void)
                 kthread_stop(fqs_task);
         }
         fqs_task = NULL;
+       if ((test_boost == 1 && cur_ops->can_boost) ||
+           test_boost == 2) {
+               unregister_cpu_notifier(&rcutorture_cpu_nb);
+               for_each_possible_cpu(i)
+                       rcutorture_booster_cleanup(i);
+       }
  
         /* Wait for all RCU callbacks to fire.  */
  
@@ -1195,9 +1417,9 @@ rcu_torture_cleanup(void)
         if (cur_ops->cleanup)
                 cur_ops->cleanup();
         if (atomic_read(&n_rcu_torture_error))
-               rcu_torture_print_module_parms("End of test: FAILURE");
+               rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
         else
-               rcu_torture_print_module_parms("End of test: SUCCESS");
+               rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
  }
  
  static int __init
@@ -1242,7 +1464,7 @@ rcu_torture_init(void)
                 nrealreaders = nreaders;
         else
                 nrealreaders = 2 * num_online_cpus();
-       rcu_torture_print_module_parms("Start of test");
+       rcu_torture_print_module_parms(cur_ops, "Start of test");
         fullstop = FULLSTOP_DONTSTOP;
  
         /* Set up the freelist. */
@@ -1263,6 +1485,12 @@ rcu_torture_init(void)
         atomic_set(&n_rcu_torture_free, 0);
         atomic_set(&n_rcu_torture_mberror, 0);
         atomic_set(&n_rcu_torture_error, 0);
+       n_rcu_torture_boost_ktrerror = 0;
+       n_rcu_torture_boost_rterror = 0;
+       n_rcu_torture_boost_allocerror = 0;
+       n_rcu_torture_boost_afferror = 0;
+       n_rcu_torture_boost_failure = 0;
+       n_rcu_torture_boosts = 0;
         for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
                 atomic_set(&rcu_torture_wcount[i], 0);
         for_each_possible_cpu(cpu) {
@@ -1376,7 +1604,27 @@ rcu_torture_init(void)
                         goto unwind;
                 }
         }
-       register_reboot_notifier(&rcutorture_nb);
+       if (test_boost_interval < 1)
+               test_boost_interval = 1;
+       if (test_boost_duration < 2)
+               test_boost_duration = 2;
+       if ((test_boost == 1 && cur_ops->can_boost) ||
+           test_boost == 2) {
+               int retval;
+
+               boost_starttime = jiffies + test_boost_interval * HZ;
+               register_cpu_notifier(&rcutorture_cpu_nb);
+               for_each_possible_cpu(i) {
+                       if (cpu_is_offline(i))
+                               continue;  /* Heuristic: CPU can go offline. */
+                       retval = rcutorture_booster_init(i);
+                       if (retval < 0) {
+                               firsterr = retval;
+                               goto unwind;
+                       }
+               }
+       }
+       register_reboot_notifier(&rcutorture_shutdown_nb);
         mutex_unlock(&fullstop_mutex);
         return 0;
  
diff --git a/kernel/rcutree.c b/kernel/rcutree.c

index ccdc04c479815addc8dbacea69643174a4636670..d0ddfea6579d027809cfb0bce885289bac0f957e 100644 (file)
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
         .gpnum = -300, \
         .completed = -300, \
         .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
-       .orphan_cbs_list = NULL, \
-       .orphan_cbs_tail = &structname.orphan_cbs_list, \
-       .orphan_qlen = 0, \
         .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
         .n_force_qs = 0, \
         .n_force_qs_ngp = 0, \
@@ -620,9 +617,17 @@ static void __init check_cpu_stall_init(void)
  static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
  {
         if (rdp->gpnum != rnp->gpnum) {
-               rdp->qs_pending = 1;
-               rdp->passed_quiesc = 0;
+               /*
+                * If the current grace period is waiting for this CPU,
+                * set up to detect a quiescent state, otherwise don't
+                * go looking for one.
+                */
                 rdp->gpnum = rnp->gpnum;
+               if (rnp->qsmask & rdp->grpmask) {
+                       rdp->qs_pending = 1;
+                       rdp->passed_quiesc = 0;
+               } else
+                       rdp->qs_pending = 0;
         }
  }
  
@@ -681,6 +686,24 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
  
                 /* Remember that we saw this grace-period completion. */
                 rdp->completed = rnp->completed;
+
+               /*
+                * If we were in an extended quiescent state, we may have
+                * missed some grace periods that other CPUs handled on
+                * our behalf. Catch up with this state to avoid noting
+                * spurious new grace periods.  If another grace period
+                * has started, then rnp->gpnum will have advanced, so
+                * we will detect this later on.
+                */
+               if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+                       rdp->gpnum = rdp->completed;
+
+               /*
+                * If RCU does not need a quiescent state from this CPU,
+                * then make sure that this CPU doesn't go looking for one.
+                */
+               if ((rnp->qsmask & rdp->grpmask) == 0)
+                       rdp->qs_pending = 0;
         }
  }
  
@@ -984,53 +1007,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
  #ifdef CONFIG_HOTPLUG_CPU
  
  /*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU.  The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first.  Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to an online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
   */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
  {
         int i;
+       /* current DYING CPU is cleared in the cpu_online_mask */
+       int receive_cpu = cpumask_any(cpu_online_mask);
         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+       struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
  
         if (rdp->nxtlist == NULL)
                 return;  /* irqs disabled, so comparison is stable. */
-       raw_spin_lock(&rsp->onofflock);  /* irqs already disabled. */
-       *rsp->orphan_cbs_tail = rdp->nxtlist;
-       rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+       *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+       receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+       receive_rdp->qlen += rdp->qlen;
+       receive_rdp->n_cbs_adopted += rdp->qlen;
+       rdp->n_cbs_orphaned += rdp->qlen;
+
         rdp->nxtlist = NULL;
         for (i = 0; i < RCU_NEXT_SIZE; i++)
                 rdp->nxttail[i] = &rdp->nxtlist;
-       rsp->orphan_qlen += rdp->qlen;
-       rdp->n_cbs_orphaned += rdp->qlen;
         rdp->qlen = 0;
-       raw_spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
-       unsigned long flags;
-       struct rcu_data *rdp;
-
-       raw_spin_lock_irqsave(&rsp->onofflock, flags);
-       rdp = this_cpu_ptr(rsp->rda);
-       if (rsp->orphan_cbs_list == NULL) {
-               raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-               return;
-       }
-       *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
-       rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
-       rdp->qlen += rsp->orphan_qlen;
-       rdp->n_cbs_adopted += rsp->orphan_qlen;
-       rsp->orphan_cbs_list = NULL;
-       rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
-       rsp->orphan_qlen = 0;
-       raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
  }
  
  /*
@@ -1081,8 +1082,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
         if (need_report & RCU_OFL_TASKS_EXP_GP)
                 rcu_report_exp_rnp(rsp, rnp);
-
-       rcu_adopt_orphan_cbs(rsp);
  }
  
  /*
@@ -1100,11 +1099,7 @@ static void rcu_offline_cpu(int cpu)
  
  #else /* #ifdef CONFIG_HOTPLUG_CPU */
  
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
  {
  }
  
@@ -1440,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
          */
         local_irq_save(flags);
         rdp = this_cpu_ptr(rsp->rda);
-       rcu_process_gp_end(rsp, rdp);
-       check_for_new_grace_period(rsp, rdp);
  
         /* Add the callback to our list. */
         *rdp->nxttail[RCU_NEXT_TAIL] = head;
         rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
  
-       /* Start a new grace period if one not already started. */
-       if (!rcu_gp_in_progress(rsp)) {
-               unsigned long nestflag;
-               struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-               raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-               rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
-       }
-
         /*
          * Force the grace period if too many callbacks or too long waiting.
          * Enforce hysteresis, and don't invoke force_quiescent_state()
@@ -1464,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
          * is the only one waiting for a grace period to complete.
          */
         if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-               rdp->blimit = LONG_MAX;
-               if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-                   *rdp->nxttail[RCU_DONE_TAIL] != head)
-                       force_quiescent_state(rsp, 0);
-               rdp->n_force_qs_snap = rsp->n_force_qs;
-               rdp->qlen_last_fqs_check = rdp->qlen;
+
+               /* Are we ignoring a completed grace period? */
+               rcu_process_gp_end(rsp, rdp);
+               check_for_new_grace_period(rsp, rdp);
+
+               /* Start a new grace period if one not already started. */
+               if (!rcu_gp_in_progress(rsp)) {
+                       unsigned long nestflag;
+                       struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+                       raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+                       rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+               } else {
+                       /* Give the grace period a kick. */
+                       rdp->blimit = LONG_MAX;
+                       if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                           *rdp->nxttail[RCU_DONE_TAIL] != head)
+                               force_quiescent_state(rsp, 0);
+                       rdp->n_force_qs_snap = rsp->n_force_qs;
+                       rdp->qlen_last_fqs_check = rdp->qlen;
+               }
         } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
                 force_quiescent_state(rsp, 1);
         local_irq_restore(flags);
@@ -1699,13 +1698,12 @@ static void _rcu_barrier(struct rcu_state *rsp,
          * decrement rcu_barrier_cpu_count -- otherwise the first CPU
          * might complete its grace period before all of the other CPUs
          * did their increment, causing this function to return too
-        * early.
+        * early.  Note that on_each_cpu() disables irqs, which prevents
+        * any CPUs from coming online or going offline until each online
+        * CPU has queued its RCU-barrier callback.
          */
         atomic_set(&rcu_barrier_cpu_count, 1);
-       preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
-       rcu_adopt_orphan_cbs(rsp);
         on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
-       preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
         if (atomic_dec_and_test(&rcu_barrier_cpu_count))
                 complete(&rcu_barrier_completion);
         wait_for_completion(&rcu_barrier_completion);
@@ -1831,18 +1829,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
         case CPU_DYING:
         case CPU_DYING_FROZEN:
                 /*
-                * preempt_disable() in _rcu_barrier() prevents stop_machine(),
-                * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
-                * returns, all online cpus have queued rcu_barrier_func().
-                * The dying CPU clears its cpu_online_mask bit and
-                * moves all of its RCU callbacks to ->orphan_cbs_list
-                * in the context of stop_machine(), so subsequent calls
-                * to _rcu_barrier() will adopt these callbacks and only
-                * then queue rcu_barrier_func() on all remaining CPUs.
+                * The whole machine is "stopped" except this CPU, so we can
+                * touch any data without introducing corruption. We send the
+                * dying CPU's callbacks to an arbitrarily chosen online CPU.
                  */
-               rcu_send_cbs_to_orphanage(&rcu_bh_state);
-               rcu_send_cbs_to_orphanage(&rcu_sched_state);
-               rcu_preempt_send_cbs_to_orphanage();
+               rcu_send_cbs_to_online(&rcu_bh_state);
+               rcu_send_cbs_to_online(&rcu_sched_state);
+               rcu_preempt_send_cbs_to_online();
                 break;
         case CPU_DEAD:
         case CPU_DEAD_FROZEN:
@@ -1880,8 +1873,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
  {
         int i;
  
-       for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+       for (i = NUM_RCU_LVLS - 1; i > 0; i--)
                 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+       rsp->levelspread[0] = RCU_FANOUT_LEAF;
  }
  #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
  static void __init rcu_init_levelspread(struct rcu_state *rsp)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h

index 91d4170c5c13afd2e8997bd59b28e7cc2a4385e8..e8f057e44e3ee00466e840593983ca5062302545 100644 (file)
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -31,46 +31,51 @@
  /*
   * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
   * In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
   */
  #define MAX_RCU_LVLS 4
-#define RCU_FANOUT           (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ        (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE              (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH     (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF       16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF       (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1         (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2         (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3         (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4         (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
  #  define NUM_RCU_LVLS       1
  #  define NUM_RCU_LVL_0              1
  #  define NUM_RCU_LVL_1              (NR_CPUS)
  #  define NUM_RCU_LVL_2              0
  #  define NUM_RCU_LVL_3              0
  #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
  #  define NUM_RCU_LVLS       2
  #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
  #  define NUM_RCU_LVL_2              (NR_CPUS)
  #  define NUM_RCU_LVL_3              0
  #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
  #  define NUM_RCU_LVLS       3
  #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_3              NR_CPUS
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_3              (NR_CPUS)
  #  define NUM_RCU_LVL_4              0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
  #  define NUM_RCU_LVLS       4
  #  define NUM_RCU_LVL_0              1
-#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_3              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-#  define NUM_RCU_LVL_4              NR_CPUS
+#  define NUM_RCU_LVL_1              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+#  define NUM_RCU_LVL_2              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+#  define NUM_RCU_LVL_3              DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+#  define NUM_RCU_LVL_4              (NR_CPUS)
  #else
  # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
  
  #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
  #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
@@ -203,8 +208,8 @@ struct rcu_data {
         long            qlen_last_fqs_check;
                                         /* qlen at last check for QS forcing */
         unsigned long   n_cbs_invoked;  /* count of RCU cbs invoked. */
-       unsigned long   n_cbs_orphaned; /* RCU cbs sent to orphanage. */
-       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from orphanage. */
+       unsigned long   n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+       unsigned long   n_cbs_adopted;  /* RCU cbs adopted from dying CPU */
         unsigned long   n_force_qs_snap;
                                         /* did other CPU force QS recently? */
         long            blimit;         /* Upper limit on a processed batch */
@@ -309,15 +314,7 @@ struct rcu_state {
         /* End of fields guarded by root rcu_node's lock. */
  
         raw_spinlock_t onofflock;               /* exclude on/offline and */
-                                               /*  starting new GP.  Also */
-                                               /*  protects the following */
-                                               /*  orphan_cbs fields. */
-       struct rcu_head *orphan_cbs_list;       /* list of rcu_head structs */
-                                               /*  orphaned by all CPUs in */
-                                               /*  a given leaf rcu_node */
-                                               /*  going offline. */
-       struct rcu_head **orphan_cbs_tail;      /* And tail pointer. */
-       long orphan_qlen;                       /* Number of orphaned cbs. */
+                                               /*  starting new GP. */
         raw_spinlock_t fqslock;                 /* Only one task forcing */
                                                 /*  quiescent states. */
         unsigned long jiffies_force_qs;         /* Time at which to invoke */
@@ -390,7 +387,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
  static int rcu_preempt_pending(int cpu);
  static int rcu_preempt_needs_cpu(int cpu);
  static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
  static void __init __rcu_init_preempt(void);
  static void rcu_needs_cpu_flush(void);
  
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h

index 71a4147473f95f51d2b2e88db4c14372dafe375f..a3638710dc67f4627f5cdb88e1cafb43b500d24a 100644 (file)
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
   */
  
  #include <linux/delay.h>
+#include <linux/stop_machine.h>
  
  /*
   * Check the RCU kernel configuration parameters and print informative
@@ -773,11 +774,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
  }
  
  /*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from the dying CPU to another online CPU.
   */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
  {
-       rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+       rcu_send_cbs_to_online(&rcu_preempt_state);
  }
  
  /*
@@ -1001,7 +1002,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
  /*
   * Because there is no preemptable RCU, there are no callbacks to move.
   */
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
  {
  }
  
@@ -1014,6 +1015,132 @@ static void __init __rcu_init_preempt(void)
  
  #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
  
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+       cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+       /*
+        * There must be a full memory barrier on each affected CPU
+        * between the time that try_stop_cpus() is called and the
+        * time that it returns.
+        *
+        * In the current initial implementation of cpu_stop, the
+        * above condition is already met when the control reaches
+        * this point and the following smp_mb() is not strictly
+        * necessary.  Do smp_mb() anyway for documentation and
+        * robustness against future implementation changes.
+        */
+       smp_mb(); /* See above comment block. */
+       return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly.  This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier.  Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word.  Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs.  If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period.  We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done.  If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot.  In this case, our work is
+ * done for us, and we can simply return.  Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+       int firstsnap, s, snap, trycount = 0;
+
+       /* Note that atomic_inc_return() implies full memory barrier. */
+       firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+       get_online_cpus();
+
+       /*
+        * Each pass through the following loop attempts to force a
+        * context switch on each CPU.
+        */
+       while (try_stop_cpus(cpu_online_mask,
+                            synchronize_sched_expedited_cpu_stop,
+                            NULL) == -EAGAIN) {
+               put_online_cpus();  /* retaken below before the next attempt */
+
+               /* No joy, try again later.  Or just synchronize_sched(). */
+               if (trycount++ < 10)
+                       udelay(trycount * num_online_cpus());
+               else {
+                       synchronize_sched();
+                       return;  /* put_online_cpus() already done above */
+               }
+
+               /* Check to see if someone else did our work for us. */
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       return;  /* put_online_cpus() already done above */
+               }
+
+               /*
+                * Refetching sync_sched_expedited_started allows later
+                * callers to piggyback on our grace period.  We subtract
+                * 1 to get the same token that the last incrementer got.
+                * We retry after they started, so our grace period works
+                * for them, and they started after our first try, so their
+                * grace period works for us.
+                */
+               get_online_cpus();
+               snap = atomic_read(&sync_sched_expedited_started) - 1;
+               smp_mb(); /* ensure read is before try_stop_cpus(). */
+       }
+
+       /*
+        * Everyone up to our most recent fetch is covered by our grace
+        * period.  Update the counter, but only if our work is still
+        * relevant -- which it won't be if someone who started later
+        * than we did beat us to the punch.
+        */
+       do {
+               s = atomic_read(&sync_sched_expedited_done);
+               if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+                       smp_mb(); /* ensure test happens before caller kfree */
+                       break;
+               }
+       } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+       put_online_cpus();  /* pairs with the get_online_cpus() held at loop exit */
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
  #if !defined(CONFIG_RCU_FAST_NO_HZ)
  
  /*
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c

index d15430b9d122f4d619e76fb6b5069aa1f494a575..c8e97853b970f71ad662732ef46da011cf46ac1d 100644 (file)
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -166,13 +166,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
  
         gpnum = rsp->gpnum;
         seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
-                     "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+                     "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
                    rsp->completed, gpnum, rsp->signaled,
                    (long)(rsp->jiffies_force_qs - jiffies),
                    (int)(jiffies & 0xffff),
                    rsp->n_force_qs, rsp->n_force_qs_ngp,
                    rsp->n_force_qs - rsp->n_force_qs_ngp,
-                  rsp->n_force_qs_lh, rsp->orphan_qlen);
+                  rsp->n_force_qs_lh);
         for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
                 if (rnp->level != level) {
                         seq_puts(m, "\n");
@@ -300,7 +300,7 @@ static const struct file_operations rcu_pending_fops = {
  
  static struct dentry *rcudir;
  
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
  {
         struct dentry *retval;
  
@@ -337,14 +337,14 @@ free_out:
         return 1;
  }
  
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
  {
         debugfs_remove_recursive(rcudir);
  }
  
  
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
  
  MODULE_AUTHOR("Paul E. McKenney");
  MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
diff --git a/kernel/sched.c b/kernel/sched.c

index 297d1a0eedb0e68d8b9327f530ba477c93b1222e..04949089e7601ccd2a9b82f0f30c5905cbc9777b 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
+#include <asm/mutex.h>
  
  #include "sched_cpupri.h"
  #include "workqueue_sched.h"
+#include "sched_autogroup.h"
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
         unsigned long shares;
+
+       atomic_t load_weight;
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
         struct task_group *parent;
         struct list_head siblings;
         struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+       struct autogroup *autogroup;
+#endif
  };
  
  #define root_task_group init_task_group
  
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
  static DEFINE_SPINLOCK(task_group_lock);
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-       return list_empty(&root_task_group.children);
-}
-#endif
-
  # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
  
  /*
@@ -342,6 +341,7 @@ struct cfs_rq {
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
+       int on_list;
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
  
@@ -360,14 +360,17 @@ struct cfs_rq {
         unsigned long h_load;
  
         /*
-        * this cpu's part of tg->shares
+        * Maintaining per-cpu shares distribution for group scheduling
+        *
+        * load_stamp is the last time we updated the load average
+        * load_last is the last time we updated the load average and saw load
+        * load_unacc_exec_time is currently unaccounted execution time
          */
-       unsigned long shares;
+       u64 load_avg;
+       u64 load_period;
+       u64 load_stamp, load_last, load_unacc_exec_time;
  
-       /*
-        * load.weight at the time we set shares
-        */
-       unsigned long rq_weight;
+       unsigned long load_contribution;
  #endif
  #endif
  };
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
   */
  static inline struct task_group *task_group(struct task_struct *p)
  {
+       struct task_group *tg;
         struct cgroup_subsys_state *css;
  
         css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
                         lockdep_is_held(&task_rq(p)->lock));
-       return container_of(css, struct task_group, css);
+       tg = container_of(css, struct task_group, css);
+
+       return autogroup_task_group(p, tg);
  }
  
  /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -792,20 +798,6 @@ late_initcall(sched_init_debug);
   */
  const_debug unsigned int sysctl_sched_nr_migrate = 32;
  
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
  /*
   * period over which we average the RT time consumption, measured
   * in ms.
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
         lw->inv_weight = 0;
  }
  
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+       lw->weight = w;
+       lw->inv_weight = 0;
+}
+
  /*
   * To aid in avoiding the subversion of "niceness" due to uneven distribution
   * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-                                   unsigned long sd_shares,
-                                   unsigned long sd_rq_weight,
-                                   unsigned long *usd_rq_weight)
-{
-       unsigned long shares, rq_weight;
-       int boost = 0;
-
-       rq_weight = usd_rq_weight[cpu];
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       /*
-        *             \Sum_j shares_j * rq_weight_i
-        * shares_i =  -----------------------------
-        *                  \Sum_j rq_weight_j
-        */
-       shares = (sd_shares * rq_weight) / sd_rq_weight;
-       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-       if (abs(shares - tg->se[cpu]->load.weight) >
-                       sysctl_sched_shares_thresh) {
-               struct rq *rq = cpu_rq(cpu);
-               unsigned long flags;
-
-               raw_spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               __set_se_shares(tg->se[cpu], shares);
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
-       }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-       unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-       unsigned long *usd_rq_weight;
-       struct sched_domain *sd = data;
-       unsigned long flags;
-       int i;
-
-       if (!tg->se[0])
-               return 0;
-
-       local_irq_save(flags);
-       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-       for_each_cpu(i, sched_domain_span(sd)) {
-               weight = tg->cfs_rq[i]->load.weight;
-               usd_rq_weight[i] = weight;
-
-               rq_weight += weight;
-               /*
-                * If there are currently no tasks on the cpu pretend there
-                * is one of average load so that when a new task gets to
-                * run here it will not get delayed by group starvation.
-                */
-               if (!weight)
-                       weight = NICE_0_LOAD;
-
-               sum_weight += weight;
-               shares += tg->cfs_rq[i]->shares;
-       }
-
-       if (!rq_weight)
-               rq_weight = sum_weight;
-
-       if ((!shares && rq_weight) || shares > tg->shares)
-               shares = tg->shares;
-
-       if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-               shares = tg->shares;
-
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-       local_irq_restore(flags);
-
-       return 0;
-}
-
  /*
   * Compute the cpu's hierarchical load factor for each task group.
   * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
                 load = cpu_rq(cpu)->load.weight;
         } else {
                 load = tg->parent->cfs_rq[cpu]->h_load;
-               load *= tg->cfs_rq[cpu]->shares;
+               load *= tg->se[cpu]->load.weight;
                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
         }
  
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
         return 0;
  }
  
-static void update_shares(struct sched_domain *sd)
-{
-       s64 elapsed;
-       u64 now;
-
-       if (root_task_group_empty())
-               return;
-
-       now = local_clock();
-       elapsed = now - sd->last_update;
-
-       if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-               sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, sd);
-       }
-}
-
  static void update_h_load(long cpu)
  {
         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
  #endif
  
  #ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
  
  #endif
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-       cfs_rq->shares = shares;
-#endif
-}
-#endif
-
  static void calc_load_account_idle(struct rq *this_rq);
  static void update_sysctl(void);
  static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_autogroup.c"
  #include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
   * The task's runqueue lock must be held.
   * Returns true if you have to wait for migration thread.
   */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
  {
-       struct rq *rq = task_rq(p);
-
         /*
          * If the task is not on a runqueue (and not running), then
          * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                 return dest_cpu;
  
         /* No more Mr. Nice Guy. */
-       if (unlikely(dest_cpu >= nr_cpu_ids)) {
-               dest_cpu = cpuset_cpus_allowed_fallback(p);
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, cpu);
-               }
+       dest_cpu = cpuset_cpus_allowed_fallback(p);
+       /*
+        * Don't tell them about moving exiting tasks or
+        * kernel threads (both mm NULL), since they never
+        * leave kernel.
+        */
+       if (p->mm && printk_ratelimit()) {
+               printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                               task_pid_nr(p), p->comm, cpu);
         }
  
         return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+#ifdef CONFIG_SMP
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
  
         put_cpu();
  }
@@ -3549,7 +3418,7 @@ void sched_exec(void)
          * select_task_rq() can race against ->cpus_allowed
          */
         if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-           likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+           likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
  
                 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                 if (task_thread_info(rq->curr) != owner || need_resched())
                         return 0;
  
-               cpu_relax();
+               arch_mutex_cpu_relax();
         }
  
         return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
   * This waits for either a completion of a specific task to be signaled or for a
   * specified timeout to expire. It is interruptible. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
  {
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
   * signaled or for a specified timeout to expire. It can be
   * interrupted by a kill signal. The timeout is in jiffies.
   */
-unsigned long __sched
+long __sched
  wait_for_completion_killable_timeout(struct completion *x,
                                      unsigned long timeout)
  {
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
  }
  
  static int __sched_setscheduler(struct task_struct *p, int policy,
-                               struct sched_param *param, bool user)
+                               const struct sched_param *param, bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
   * NOTE that the task may be already dead.
   */
  int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+                      const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, true);
  }
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
   * but our caller might not have that capability.
   */
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-                              struct sched_param *param)
+                              const struct sched_param *param)
  {
         return __sched_setscheduler(p, policy, param, false);
  }
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
         unsigned state;
  
         state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  #if BITS_PER_LONG == 32
         if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
         SET_SYSCTL(sched_min_granularity);
         SET_SYSCTL(sched_latency);
         SET_SYSCTL(sched_wakeup_granularity);
-       SET_SYSCTL(sched_shares_ratelimit);
  #undef SET_SYSCTL
  }
  
@@ -5830,7 +5698,7 @@ again:
                 goto out;
  
         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-       if (migrate_task(p, dest_cpu)) {
+       if (migrate_task(p, rq)) {
                 struct migration_arg arg = { p, dest_cpu };
                 /* Need help from migration thread: drop lock and wait. */
                 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
+
  /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
   */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
  {
-       struct rq *rq = cpu_rq(dead_cpu);
-       int needs_cpu, uninitialized_var(dest_cpu);
-       unsigned long flags;
+       struct mm_struct *mm = current->active_mm;
  
-       local_irq_save(flags);
+       BUG_ON(cpu_online(smp_processor_id()));
  
-       raw_spin_lock(&rq->lock);
-       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-       if (needs_cpu)
-               dest_cpu = select_fallback_rq(dead_cpu, p);
-       raw_spin_unlock(&rq->lock);
-       /*
-        * It can only fail if we race with set_cpus_allowed(),
-        * in the racer should migrate the task anyway.
-        */
-       if (needs_cpu)
-               __migrate_task(p, dead_cpu, dest_cpu);
-       local_irq_restore(flags);
+       if (mm != &init_mm)
+               switch_mm(mm, &init_mm, current);
+       mmdrop(mm);
  }
  
  /*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  static void migrate_nr_uninterruptible(struct rq *rq_src)
  {
         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-       unsigned long flags;
  
-       local_irq_save(flags);
-       double_rq_lock(rq_src, rq_dest);
         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
         rq_src->nr_uninterruptible = 0;
-       double_rq_unlock(rq_src, rq_dest);
-       local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-       struct task_struct *p, *t;
-
-       read_lock(&tasklist_lock);
-
-       do_each_thread(t, p) {
-               if (p == current)
-                       continue;
-
-               if (task_cpu(p) == src_cpu)
-                       move_task_off_dead_cpu(src_cpu, p);
-       } while_each_thread(t, p);
-
-       read_unlock(&tasklist_lock);
  }
  
  /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
   */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
  {
-       int this_cpu = smp_processor_id();
-       struct rq *rq = cpu_rq(this_cpu);
-       struct task_struct *p = rq->idle;
-       unsigned long flags;
-
-       /* cpu has to be offline */
-       BUG_ON(cpu_online(this_cpu));
-
-       /*
-        * Strictly not necessary since rest of the CPUs are stopped by now
-        * and interrupts disabled on the current cpu.
-        */
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-       activate_task(rq, p, 0);
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+       rq->calc_load_active = 0;
  }
  
  /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
   */
-void idle_task_exit(void)
-{
-       struct mm_struct *mm = current->active_mm;
-
-       BUG_ON(cpu_online(smp_processor_id()));
-
-       if (mm != &init_mm)
-               switch_mm(mm, &init_mm, current);
-       mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
  {
         struct rq *rq = cpu_rq(dead_cpu);
-
-       /* Must be exiting, otherwise would be on tasklist. */
-       BUG_ON(!p->exit_state);
-
-       /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->state == TASK_DEAD);
-
-       get_task_struct(p);
+       struct task_struct *next, *stop = rq->stop;
+       int dest_cpu;
  
         /*
-        * Drop lock around migration; if someone else moves it,
-        * that's OK. No task can be added to this CPU, so iteration is
-        * fine.
+        * Fudge the rq selection such that the below task selection loop
+        * doesn't get stuck on the currently eligible stop task.
+        *
+        * We're currently inside stop_machine() and the rq is either stuck
+        * in the stop_machine_cpu_stop() loop, or we're executing this code,
+        * either way we should never end up calling schedule() until we're
+        * done here.
          */
-       raw_spin_unlock_irq(&rq->lock);
-       move_task_off_dead_cpu(dead_cpu, p);
-       raw_spin_lock_irq(&rq->lock);
-
-       put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-       struct rq *rq = cpu_rq(dead_cpu);
-       struct task_struct *next;
+       rq->stop = NULL;
  
         for ( ; ; ) {
-               if (!rq->nr_running)
+               /*
+                * There's this thread running, bail when that's the only
+                * remaining thread.
+                */
+               if (rq->nr_running == 1)
                         break;
+
                 next = pick_next_task(rq);
-               if (!next)
-                       break;
+               BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
-               migrate_dead(dead_cpu, next);
  
+               /* Find suitable destination for @next, with force if needed. */
+               dest_cpu = select_fallback_rq(dead_cpu, next);
+               raw_spin_unlock(&rq->lock);
+
+               __migrate_task(next, dead_cpu, dest_cpu);
+
+               raw_spin_lock(&rq->lock);
         }
-}
  
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-       rq->calc_load_active = 0;
+       rq->stop = stop;
  }
+
  #endif /* CONFIG_HOTPLUG_CPU */
  
  #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
  
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
  
         case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                 rq->calc_load_update = calc_load_update;
                 break;
  
         case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  
  #ifdef CONFIG_HOTPLUG_CPU
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               migrate_live_tasks(cpu);
-               /* Idle task back to normal (off runqueue, low prio) */
-               raw_spin_lock_irq(&rq->lock);
-               deactivate_task(rq, rq->idle, 0);
-               __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-               rq->idle->sched_class = &idle_sched_class;
-               migrate_dead_tasks(cpu);
-               raw_spin_unlock_irq(&rq->lock);
-               migrate_nr_uninterruptible(rq);
-               BUG_ON(rq->nr_running != 0);
-               calc_global_load_remove(rq);
-               break;
-
         case CPU_DYING:
-       case CPU_DYING_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
+               migrate_tasks(cpu);
+               BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+               migrate_nr_uninterruptible(rq);
+               calc_global_load_remove(rq);
                 break;
  #endif
         }
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                               struct sched_entity *se, int cpu, int add,
+                               struct sched_entity *se, int cpu,
                                 struct sched_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
-       if (add)
-               list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
  
         tg->se[cpu] = se;
         /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                 se->cfs_rq = parent->my_q;
  
         se->my_q = cfs_rq;
-       se->load.weight = tg->shares;
-       se->load.inv_weight = 0;
+       update_load_set(&se->load, 0);
         se->parent = parent;
  }
  #endif
  
  #ifdef CONFIG_RT_GROUP_SCHED
  static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-               struct sched_rt_entity *rt_se, int cpu, int add,
+               struct sched_rt_entity *rt_se, int cpu,
                 struct sched_rt_entity *parent)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-       if (add)
-               list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
  
         tg->rt_se[cpu] = rt_se;
         if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
  #ifdef CONFIG_CGROUP_SCHED
         list_add(&init_task_group.list, &task_groups);
         INIT_LIST_HEAD(&init_task_group.children);
-
+       autogroup_init(&init_task);
  #endif /* CONFIG_CGROUP_SCHED */
  
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-                                           __alignof__(unsigned long));
-#endif
         for_each_possible_cpu(i) {
                 struct rq *rq;
  
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.shares = init_task_group_load;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
                 /*
                  * How much cpu bandwidth does init_task_group get?
                  *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                  */
-               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+               init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
  #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+               init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8293,8 +8067,6 @@ void __init sched_init(void)
                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
  #endif /* SMP */
  
-       perf_event_init();
-
         scheduler_running = 1;
  }
  
@@ -8488,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!se)
                         goto err_free_rq;
  
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }
  
         return 1;
@@ -8499,15 +8271,21 @@ err:
         return 0;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                       &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
-       list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /*
+       * Only empty task groups can be destroyed; so we can speculatively
+       * check on_list without danger of it being re-added.
+       */
+       if (!tg->cfs_rq[cpu]->on_list)
+               return;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  #else /* !CONFG_FAIR_GROUP_SCHED */
  static inline void free_fair_sched_group(struct task_group *tg)
@@ -8520,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
  }
  
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
  static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
  {
  }
@@ -8578,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!rt_se)
                         goto err_free_rq;
  
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }
  
         return 1;
@@ -8588,17 +8362,6 @@ err_free_rq:
  err:
         return 0;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                       &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-       list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
  #else /* !CONFIG_RT_GROUP_SCHED */
  static inline void free_rt_sched_group(struct task_group *tg)
  {
@@ -8609,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  {
         return 1;
  }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
  #endif /* CONFIG_RT_GROUP_SCHED */
  
  #ifdef CONFIG_CGROUP_SCHED
@@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
  {
         struct task_group *tg;
         unsigned long flags;
-       int i;
  
         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                 goto err;
  
         spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
-               register_fair_sched_group(tg, i);
-               register_rt_sched_group(tg, i);
-       }
         list_add_rcu(&tg->list, &task_groups);
  
         WARN_ON(!parent); /* root should already exist */
@@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
         unsigned long flags;
         int i;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i) {
+       /* end participation in shares distribution */
+       for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
-               unregister_rt_sched_group(tg, i);
-       }
+
+       spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
  #endif /* CONFIG_CGROUP_SCHED */
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       int on_rq;
-
-       on_rq = se->on_rq;
-       if (on_rq)
-               dequeue_entity(cfs_rq, se, 0);
-
-       se->load.weight = shares;
-       se->load.inv_weight = 0;
-
-       if (on_rq)
-               enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-       struct cfs_rq *cfs_rq = se->cfs_rq;
-       struct rq *rq = cfs_rq->rq;
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       __set_se_shares(se, shares);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
  static DEFINE_MUTEX(shares_mutex);
  
  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         if (tg->shares == shares)
                 goto done;
  
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               unregister_fair_sched_group(tg, i);
-       list_del_rcu(&tg->siblings);
-       spin_unlock_irqrestore(&task_group_lock, flags);
-
-       /* wait for any ongoing reference to this group to finish */
-       synchronize_sched();
-
-       /*
-        * Now we are free to modify the group's share on each cpu
-        * w/o tripping rebalance_share or load_balance_fair.
-        */
         tg->shares = shares;
         for_each_possible_cpu(i) {
-               /*
-                * force a rebalance
-                */
-               cfs_rq_set_shares(tg->cfs_rq[i], 0);
-               set_se_shares(tg->se[i], shares);
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se;
+
+               se = tg->se[i];
+               /* Propagate contribution to hierarchy */
+               raw_spin_lock_irqsave(&rq->lock, flags);
+               for_each_sched_entity(se)
+                       update_cfs_shares(group_cfs_rq(se), 0);
+               raw_spin_unlock_irqrestore(&rq->lock, flags);
         }
  
-       /*
-        * Enable load balance activity on this group, by inserting it back on
-        * each cpu's rq->leaf_cfs_rq_list.
-        */
-       spin_lock_irqsave(&task_group_lock, flags);
-       for_each_possible_cpu(i)
-               register_fair_sched_group(tg, i);
-       list_add_rcu(&tg->siblings, &tg->parent->children);
-       spin_unlock_irqrestore(&task_group_lock, flags);
  done:
         mutex_unlock(&shares_mutex);
         return 0;
@@ -9534,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
  };
  #endif /* CONFIG_CGROUP_CPUACCT */
  
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-       barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-       /*
-        * There must be a full memory barrier on each affected CPU
-        * between the time that try_stop_cpus() is called and the
-        * time that it returns.
-        *
-        * In the current initial implementation of cpu_stop, the
-        * above condition is already met when the control reaches
-        * this point and the following smp_mb() is not strictly
-        * necessary.  Do smp_mb() anyway for documentation and
-        * robustness against future implementation changes.
-        */
-       smp_mb(); /* See above comment block. */
-       return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly.  This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier.  Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-       int snap, trycount = 0;
-
-       smp_mb();  /* ensure prior mod happens before capturing snap. */
-       snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-       get_online_cpus();
-       while (try_stop_cpus(cpu_online_mask,
-                            synchronize_sched_expedited_cpu_stop,
-                            NULL) == -EAGAIN) {
-               put_online_cpus();
-               if (trycount++ < 10)
-                       udelay(trycount * num_online_cpus());
-               else {
-                       synchronize_sched();
-                       return;
-               }
-               if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-                       smp_mb(); /* ensure test happens before caller kfree */
-                       return;
-               }
-               get_online_cpus();
-       }
-       atomic_inc(&synchronize_sched_expedited_count);
-       smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-       put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c

new file mode 100644 (file)

index 0000000..c80fedc
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+       autogroup_default.tg = &init_task_group;
+       init_task_group.autogroup = &autogroup_default;
+       kref_init(&autogroup_default.kref);
+       init_rwsem(&autogroup_default.lock);
+       init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+       kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+       struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+       sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+       kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+       kref_get(&ag->kref);
+       return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+       struct autogroup *ag;
+       unsigned long flags;
+
+       if (!lock_task_sighand(p, &flags))
+               return autogroup_kref_get(&autogroup_default);
+
+       ag = autogroup_kref_get(p->signal->autogroup);
+       unlock_task_sighand(p, &flags);
+
+       return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+       struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+       struct task_group *tg;
+
+       if (!ag)
+               goto out_fail;
+
+       tg = sched_create_group(&init_task_group);
+
+       if (IS_ERR(tg))
+               goto out_free;
+
+       kref_init(&ag->kref);
+       init_rwsem(&ag->lock);
+       ag->id = atomic_inc_return(&autogroup_seq_nr);
+       ag->tg = tg;
+       tg->autogroup = ag;
+
+       return ag;
+
+out_free:
+       kfree(ag);
+out_fail:
+       if (printk_ratelimit()) {
+               printk(KERN_WARNING "autogroup_create: %s failure.\n",
+                       ag ? "sched_create_group()" : "kmalloc()");
+       }
+
+       return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+       if (tg != &root_task_group)
+               return false;
+
+       if (p->sched_class != &fair_sched_class)
+               return false;
+
+       /*
+        * We can only assume the task group can't go away on us if
+        * autogroup_move_group() can see us on ->thread_group list.
+        */
+       if (p->flags & PF_EXITING)
+               return false;
+
+       return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+       if (enabled && task_wants_autogroup(p, tg))
+               return p->signal->autogroup->tg;
+
+       return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+       struct autogroup *prev;
+       struct task_struct *t;
+       unsigned long flags;
+
+       BUG_ON(!lock_task_sighand(p, &flags));
+
+       prev = p->signal->autogroup;
+       if (prev == ag) {
+               unlock_task_sighand(p, &flags);
+               return;
+       }
+
+       p->signal->autogroup = autogroup_kref_get(ag);
+
+       t = p;
+       do {
+               sched_move_task(t);
+       } while_each_thread(p, t);
+
+       unlock_task_sighand(p, &flags);
+       autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+       struct autogroup *ag = autogroup_create();
+
+       autogroup_move_group(p, ag);
+       /* drop extra reference added by autogroup_create() */
+       autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+       autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+       sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+       autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+       sysctl_sched_autogroup_enabled = 0;
+
+       return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+       static unsigned long next = INITIAL_JIFFIES;
+       struct autogroup *ag;
+       int err;
+
+       if (*nice < -20 || *nice > 19)
+               return -EINVAL;
+
+       err = security_task_setnice(current, *nice);
+       if (err)
+               return err;
+
+       if (*nice < 0 && !can_nice(current, *nice))
+               return -EPERM;
+
+       /* this is a heavy operation taking global locks.. */
+       if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+               return -EAGAIN;
+
+       next = HZ / 10 + jiffies;
+       ag = autogroup_task_get(p);
+
+       down_write(&ag->lock);
+       err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+       if (!err)
+               ag->nice = *nice;
+       up_write(&ag->lock);
+
+       autogroup_kref_put(ag);
+
+       return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+       struct autogroup *ag = autogroup_task_get(p);
+
+       down_read(&ag->lock);
+       seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+       up_read(&ag->lock);
+
+       autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h

new file mode 100644 (file)

index 0000000..5358e24
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+       struct kref             kref;
+       struct task_group       *tg;
+       struct rw_semaphore     lock;
+       unsigned long           id;
+       int                     nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) {  }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+       return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+       return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c

index 52f1a149bfb15a871a362255498fadf90e357c57..9d8af0b3fb64544d9ca7076f3478d2239b46540e 100644 (file)
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
  }
  EXPORT_SYMBOL_GPL(sched_clock);
  
-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;
  
  #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
  __read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c

index 2e1b0d17dd9b6a8b4ac48891a988c025a8b07ed5..1dfae3d014b5934eba4b3be25bfbad746196b4b7 100644 (file)
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
  #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
-               struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
  {
         struct sched_entity *se = tg->se[cpu];
         if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
                 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
  #endif
  
-#ifdef CONFIG_CGROUP_SCHED
-       {
-               char path[64];
-
-               rcu_read_lock();
-               cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
-               rcu_read_unlock();
-               SEQ_printf(m, " %s", path);
-       }
-#endif
         SEQ_printf(m, "\n");
  }
  
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
         read_unlock_irqrestore(&tasklist_lock, flags);
  }
  
-#if defined(CONFIG_CGROUP_SCHED) && \
-       (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
-       /* may be NULL if the underlying cgroup isn't fully-created yet */
-       if (!tg->css.cgroup) {
-               buf[0] = '\0';
-               return;
-       }
-       cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
  void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
  {
         s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
         struct sched_entity *last;
         unsigned long flags;
  
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = cfs_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
         SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                         SPLIT_NS(cfs_rq->exec_clock));
  
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
         spread0 = min_vruntime - rq0_min_vruntime;
         SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "spread0",
                         SPLIT_NS(spread0));
-       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
-       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
         SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
                         cfs_rq->nr_spread_over);
+       SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+       SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
  #ifdef CONFIG_FAIR_GROUP_SCHED
  #ifdef CONFIG_SMP
-       SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg",
+                       SPLIT_NS(cfs_rq->load_avg));
+       SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period",
+                       SPLIT_NS(cfs_rq->load_period));
+       SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
+                       cfs_rq->load_contribution);
+       SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
+                       atomic_read(&cfs_rq->tg->load_weight));
  #endif
+
         print_cfs_group_stats(m, cpu, cfs_rq->tg);
  #endif
  }
  
  void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
  {
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-       char path[128];
-       struct task_group *tg = rt_rq->tg;
-
-       task_group_path(tg, path, sizeof(path));
-
-       SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
         SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
  
  #define P(x) \
         SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
  #undef P
  }
  
+extern __read_mostly int sched_clock_running;
+
  static void print_cpu(struct seq_file *m, int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
  
  static int sched_debug_show(struct seq_file *m, void *v)
  {
-       u64 now = ktime_to_ns(ktime_get());
+       u64 ktime, sched_clk, cpu_clk;
+       unsigned long flags;
         int cpu;
  
-       SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+       local_irq_save(flags);
+       ktime = ktime_to_ns(ktime_get());
+       sched_clk = sched_clock();
+       cpu_clk = local_clock();
+       local_irq_restore(flags);
+
+       SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
                 init_utsname()->release,
                 (int)strcspn(init_utsname()->version, " "),
                 init_utsname()->version);
  
-       SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+       SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+       SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+       PN(ktime);
+       PN(sched_clk);
+       PN(cpu_clk);
+       P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+       P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+       SEQ_printf(m, "\n");
+       SEQ_printf(m, "sysctl_sched\n");
  
  #define P(x) \
         SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
  #define PN(x) \
         SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
-       P(jiffies);
         PN(sysctl_sched_latency);
         PN(sysctl_sched_min_granularity);
         PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 00ebd7686676bd87a6e5b3be513d27028e518796..c62ebae65cf0c5e5d1628b0368692a94cda37568 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  
  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
  static const struct sched_class fair_sched_class;
  
  /**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return cfs_rq->tg->cfs_rq[this_cpu];
  }
  
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (!cfs_rq->on_list) {
+               /*
+                * Ensure we either appear before our parent (if already
+                * enqueued) or force our parent to appear after us when it is
+                * enqueued.  The fact that we always enqueue bottom-up
+                * reduces this to two cases.
+                */
+               if (cfs_rq->tg->parent &&
+                   cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+                       list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               } else {
+                       list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+                               &rq_of(cfs_rq)->leaf_cfs_rq_list);
+               }
+
+               cfs_rq->on_list = 1;
+       }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->on_list) {
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+               cfs_rq->on_list = 0;
+       }
+}
+
  /* Iterate thr' all leaf cfs_rq's on a runqueue */
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return &cpu_rq(this_cpu)->cfs;
  }
  
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
  #define for_each_leaf_cfs_rq(rq, cfs_rq) \
                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
         WRT_SYSCTL(sched_min_granularity);
         WRT_SYSCTL(sched_latency);
         WRT_SYSCTL(sched_wakeup_granularity);
-       WRT_SYSCTL(sched_shares_ratelimit);
  #undef WRT_SYSCTL
  
         return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
         return calc_delta_fair(sched_slice(cfs_rq, se), se);
  }
  
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
  /*
   * Update the current task's runtime statistics. Skip current tasks that
   * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  
         curr->vruntime += delta_exec_weighted;
         update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+       cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
  }
  
  static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_add(&se->group_node, &cfs_rq->tasks);
         }
         cfs_rq->nr_running++;
-       se->on_rq = 1;
  }
  
  static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
                 list_del_init(&se->group_node);
         }
         cfs_rq->nr_running--;
-       se->on_rq = 0;
  }
  
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+                                           int global_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long load_avg;
+
+       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+       load_avg -= cfs_rq->load_contribution;
+
+       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+               atomic_add(load_avg, &tg->load_weight);
+               cfs_rq->load_contribution += load_avg;
+       }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+       u64 period = sysctl_sched_shares_window;
+       u64 now, delta;
+       unsigned long load = cfs_rq->load.weight;
+
+       if (!cfs_rq)
+               return;
+
+       now = rq_of(cfs_rq)->clock;
+       delta = now - cfs_rq->load_stamp;
+
+       /* truncate load history at 4 idle periods */
+       if (cfs_rq->load_stamp > cfs_rq->load_last &&
+           now - cfs_rq->load_last > 4 * period) {
+               cfs_rq->load_period = 0;
+               cfs_rq->load_avg = 0;
+       }
+
+       cfs_rq->load_stamp = now;
+       cfs_rq->load_unacc_exec_time = 0;
+       cfs_rq->load_period += delta;
+       if (load) {
+               cfs_rq->load_last = now;
+               cfs_rq->load_avg += delta * load;
+       }
+
+       /* consider updating load contribution on each fold or truncate */
+       if (global_update || cfs_rq->load_period > period
+           || !cfs_rq->load_period)
+               update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+       while (cfs_rq->load_period > period) {
+               /*
+                * Inline assembly required to prevent the compiler
+                * optimising this loop into a divmod call.
+                * See __iter_div_u64_rem() for another example of this.
+                */
+               asm("" : "+rm" (cfs_rq->load_period));
+               cfs_rq->load_period /= 2;
+               cfs_rq->load_avg /= 2;
+       }
+
+       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+               list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+                           unsigned long weight)
+{
+       if (se->on_rq) {
+               /* commit outstanding execution time */
+               if (cfs_rq->curr == se)
+                       update_curr(cfs_rq);
+               account_entity_dequeue(cfs_rq, se);
+       }
+
+       update_load_set(&se->load, weight);
+
+       if (se->on_rq)
+               account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+       struct task_group *tg;
+       struct sched_entity *se;
+       long load_weight, load, shares;
+
+       if (!cfs_rq)
+               return;
+
+       tg = cfs_rq->tg;
+       se = tg->se[cpu_of(rq_of(cfs_rq))];
+       if (!se)
+               return;
+
+       load = cfs_rq->load.weight + weight_delta;
+
+       load_weight = atomic_read(&tg->load_weight);
+       load_weight -= cfs_rq->load_contribution;
+       load_weight += load;
+
+       shares = (tg->shares * load);
+       if (load_weight)
+               shares /= load_weight;
+
+       if (shares < MIN_SHARES)
+               shares = MIN_SHARES;
+       if (shares > tg->shares)
+               shares = tg->shares;
+
+       reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
  #ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
+       update_cfs_load(cfs_rq, 0);
+       update_cfs_shares(cfs_rq, se->load.weight);
         account_entity_enqueue(cfs_rq, se);
  
         if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         check_spread(cfs_rq, se);
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
+       se->on_rq = 1;
+
+       if (cfs_rq->nr_running == 1)
+               list_add_leaf_cfs_rq(cfs_rq);
  }
  
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  
         if (se != cfs_rq->curr)
                 __dequeue_entity(cfs_rq, se);
+       se->on_rq = 0;
+       update_cfs_load(cfs_rq, 0);
         account_entity_dequeue(cfs_rq, se);
         update_min_vruntime(cfs_rq);
+       update_cfs_shares(cfs_rq, 0);
  
         /*
          * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
          */
         update_curr(cfs_rq);
  
+       /*
+        * Update share accounting for long-running entities.
+        */
+       update_entity_shares_tick(cfs_rq);
+
  #ifdef CONFIG_SCHED_HRTICK
         /*
          * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 flags = ENQUEUE_WAKEUP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 cfs_rq = cfs_rq_of(se);
                 dequeue_entity(cfs_rq, se, flags);
+
                 /* Don't dequeue parent if it has other entities besides us */
                 if (cfs_rq->load.weight)
                         break;
                 flags |= DEQUEUE_SLEEP;
         }
  
+       for_each_sched_entity(se) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+               update_cfs_load(cfs_rq, 0);
+               update_cfs_shares(cfs_rq, 0);
+       }
+
         hrtick_update(rq);
  }
  
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
   * Adding load to a group doesn't make a group heavier, but can cause movement
   * of group shares between cpus. Assuming the shares were perfectly aligned one
   * can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
   */
-static long effective_load(struct task_group *tg, int cpu,
-               long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  {
         struct sched_entity *se = tg->se[cpu];
  
         if (!tg->parent)
                 return wl;
  
-       /*
-        * By not taking the decrease of shares on the other cpu into
-        * account our error leans towards reducing the affine wakeups.
-        */
-       if (!wl && sched_feat(ASYM_EFF_LOAD))
-               return wl;
-
         for_each_sched_entity(se) {
                 long S, rw, s, a, b;
-               long more_w;
-
-               /*
-                * Instead of using this increment, also add the difference
-                * between when the shares were last updated and now.
-                */
-               more_w = se->my_q->load.weight - se->my_q->rq_weight;
-               wl += more_w;
-               wg += more_w;
  
                 S = se->my_q->tg->shares;
-               s = se->my_q->shares;
-               rw = se->my_q->rq_weight;
+               s = se->load.weight;
+               rw = se->my_q->load.weight;
  
                 a = S*(rw + wl);
                 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
                         sd = tmp;
         }
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-       if (sched_feat(LB_SHARES_UPDATE)) {
-               /*
-                * Pick the largest domain to update shares over
-                */
-               tmp = sd;
-               if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
-                       tmp = affine_sd;
-
-               if (tmp) {
-                       raw_spin_unlock(&rq->lock);
-                       update_shares(tmp);
-                       raw_spin_lock(&rq->lock);
-               }
-       }
-#endif
-
         if (affine_sd) {
                 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
                         return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
+       struct rq *rq;
+
+       if (!tg->se[cpu])
+               return 0;
+
+       rq = cpu_rq(cpu);
+       cfs_rq = tg->cfs_rq[cpu];
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       update_rq_clock(rq);
+       update_cfs_load(cfs_rq, 1);
+
+       /*
+        * We need to update shares after updating tg->load_weight in
+        * order to adjust the weight of groups with long running tasks.
+        */
+       update_cfs_shares(cfs_rq, 0);
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       return 0;
+}
+
+static void update_shares(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(cpu);
+
+       rcu_read_lock();
+       for_each_leaf_cfs_rq(rq, cfs_rq)
+               update_shares_cpu(cfs_rq->tg, cpu);
+       rcu_read_unlock();
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return max_load_move - rem_load_move;
  }
  #else
+static inline void update_shares(int cpu)
+{
+}
+
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         schedstat_inc(sd, lb_count[idle]);
  
  redo:
-       update_shares(sd);
         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                    cpus, balance);
  
@@ -3174,8 +3381,6 @@ out_one_pinned:
         else
                 ld_moved = 0;
  out:
-       if (ld_moved)
-               update_shares(sd);
         return ld_moved;
  }
  
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
          */
         raw_spin_unlock(&this_rq->lock);
  
+       update_shares(this_cpu);
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
                 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
         int update_next_balance = 0;
         int need_serialize;
  
+       update_shares(cpu);
+
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
                         continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h

index 185f920ec1a2e923b0d966f787c610ff26a7b6cb..68e69acc29b9570b10ecf8a5892ea62c48700c5c 100644 (file)
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
  SCHED_FEAT(HRTICK, 0)
  SCHED_FEAT(DOUBLE_TICK, 0)
  SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)
  
  /*
   * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index bea7d79f7e9ca958bba514cbd8eb48ceab47bab3..c914ec747ca6709e25a177eb3c4152c75cb40aee 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
  }
  
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_add_rcu(&rt_rq->leaf_rt_rq_list,
+                       &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+       list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
  #define for_each_leaf_rt_rq(rt_rq, rq) \
         list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
         return ktime_to_ns(def_rt_bandwidth.rt_period);
  }
  
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
  #define for_each_leaf_rt_rq(rt_rq, rq) \
         for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
  
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                 return;
  
+       if (!rt_rq->rt_nr_running)
+               list_add_leaf_rt_rq(rt_rq);
+
         if (head)
                 list_add(&rt_se->run_list, queue);
         else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
  
         dec_rt_tasks(rt_se, rt_rq);
+       if (!rt_rq->rt_nr_running)
+               list_del_leaf_rt_rq(rt_rq);
  }
  
  /*
diff --git a/kernel/softirq.c b/kernel/softirq.c

index 18f4be0d5fe0bbf853935972d9b441e95bc61c5a..d4d918a91881407acd8abbde6691f77197cd013c 100644 (file)
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
                              cpumask_any(cpu_online_mask));
         case CPU_DEAD:
         case CPU_DEAD_FROZEN: {
-               struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+               static struct sched_param param = {
+                       .sched_priority = MAX_RT_PRIO-1
+               };
  
                 p = per_cpu(ksoftirqd, hotcpu);
                 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/srcu.c b/kernel/srcu.c

index c71e075005368eceff3aab4340f94beca4aee249..98d8c1e80edbcb106ba8e87c34777459aa4eff55 100644 (file)
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -31,6 +31,7 @@
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/smp.h>
+#include <linux/delay.h>
  #include <linux/srcu.h>
  
  static int init_srcu_struct_fields(struct srcu_struct *sp)
@@ -203,9 +204,14 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
          * all srcu_read_lock() calls using the old counters have completed.
          * Their corresponding critical sections might well be still
          * executing, but the srcu_read_lock() primitives themselves
-        * will have finished executing.
+        * will have finished executing.  We initially give readers
+        * an arbitrarily chosen 10 microseconds to get out of their
+        * SRCU read-side critical sections, then loop waiting 1/HZ
+        * seconds per iteration.
          */
  
+       if (srcu_readers_active_idx(sp, idx))
+               udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
         while (srcu_readers_active_idx(sp, idx))
                 schedule_timeout_interruptible(1);
  
diff --git a/kernel/sys.c b/kernel/sys.c

index 7f5a0cd296a96ca44e43f0db028026094dbbb57a..2745dcdb6c6c5756a7bafd19e6497c31cc077d60 100644 (file)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
         err = session;
  out:
         write_unlock_irq(&tasklist_lock);
-       if (err > 0)
+       if (err > 0) {
                 proc_sid_connector(group_leader);
+               sched_autogroup_create_attach(group_leader);
+       }
         return err;
  }
  
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 5abfa151855493735a91fd45a255a45727c8ba97..ae5cbb1e3ced15b8cc2e00b052496953709cf4c2 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns;                       /* 0 usecs */
  static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
  static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
  static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
  #endif
  
  #ifdef CONFIG_COMPACTION
@@ -304,15 +302,6 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &min_wakeup_granularity_ns,
                 .extra2         = &max_wakeup_granularity_ns,
         },
-       {
-               .procname       = "sched_shares_ratelimit",
-               .data           = &sysctl_sched_shares_ratelimit,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = sched_proc_update_handler,
-               .extra1         = &min_sched_shares_ratelimit,
-               .extra2         = &max_sched_shares_ratelimit,
-       },
         {
                 .procname       = "sched_tunable_scaling",
                 .data           = &sysctl_sched_tunable_scaling,
@@ -322,14 +311,6 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &min_sched_tunable_scaling,
                 .extra2         = &max_sched_tunable_scaling,
         },
-       {
-               .procname       = "sched_shares_thresh",
-               .data           = &sysctl_sched_shares_thresh,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-       },
         {
                 .procname       = "sched_migration_cost",
                 .data           = &sysctl_sched_migration_cost,
@@ -351,6 +332,13 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+       {
+               .procname       = "sched_shares_window",
+               .data           = &sysctl_sched_shares_window,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
         {
                 .procname       = "timer_migration",
                 .data           = &sysctl_timer_migration,
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
+#ifdef CONFIG_SCHED_AUTOGROUP
+       {
+               .procname       = "sched_autogroup_enabled",
+               .data           = &sysctl_sched_autogroup_enabled,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax, /* enforces extra1/extra2 */
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif /* CONFIG_SCHED_AUTOGROUP */
  #ifdef CONFIG_PROVE_LOCKING
         {
                 .procname       = "prove_locking",
@@ -745,21 +744,21 @@ static struct ctl_table kern_table[] = {
                 .extra1         = &zero,
                 .extra2         = &one,
         },
-#endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
         {
-               .procname       = "unknown_nmi_panic",
-               .data           = &unknown_nmi_panic,
+               .procname       = "nmi_watchdog",
+               .data           = &watchdog_enabled,
                 .maxlen         = sizeof (int),
                 .mode           = 0644,
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_dowatchdog_enabled,
         },
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
         {
-               .procname       = "nmi_watchdog",
-               .data           = &nmi_watchdog_enabled,
+               .procname       = "unknown_nmi_panic",
+               .data           = &unknown_nmi_panic,
                 .maxlen         = sizeof (int),
                 .mode           = 0644,
-               .proc_handler   = proc_nmi_enabled,
+               .proc_handler   = proc_dointvec,
         },
  #endif
  #if defined(CONFIG_X86)
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c

index 1357c5786064e6c8f030defbbb7f76f690dc3c15..4b2545a136ffcec72d5073b579cf3f41c49fcbd3 100644 (file)
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = {
         { CTL_INT,      KERN_IA64_UNALIGNED,            "ignore-unaligned-usertrap" },
         { CTL_INT,      KERN_COMPAT_LOG,                "compat-log" },
         { CTL_INT,      KERN_MAX_LOCK_DEPTH,            "max_lock_depth" },
-       { CTL_INT,      KERN_NMI_WATCHDOG,              "nmi_watchdog" },
         { CTL_INT,      KERN_PANIC_ON_NMI,              "panic_on_unrecovered_nmi" },
         {}
  };
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c

index ac38fbb176ccd0bb598b1eaaa7f2a703b17ec565..a9ae369925ce14fa4cf7ca1674d9f4903a6f4a24 100644 (file)
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -21,6 +21,7 @@
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/math64.h>
+#include <linux/kernel.h>
  
  /*
   * fixed point arithmetic scale factor for skew
@@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync,
         int index;
         int num_samples = sync->num_samples;
  
-       if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+       if (num_samples > ARRAY_SIZE(buffer)) {
                 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
                 if (!samples) {
                         samples = buffer;
-                       num_samples = sizeof(buffer)/sizeof(buffer[0]);
+                       num_samples = ARRAY_SIZE(buffer);
                 }
         } else {
                 samples = buffer;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c

index 49010d822f725b47726742fa7e1b45aad076ef90..5bb86da8200373a2e6cd64fdcbe0355f43f5a27f 100644 (file)
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
         cycle_t cycle_interval;
         /* Number of clock shifted nano seconds in one NTP interval. */
         u64     xtime_interval;
+       /* shifted nano seconds left over when rounding cycle_interval */
+       s64     xtime_remainder;
         /* Raw nano seconds accumulated per NTP interval. */
         u32     raw_interval;
  
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
  static void timekeeper_setup_internals(struct clocksource *clock)
  {
         cycle_t interval;
-       u64 tmp;
+       u64 tmp, ntpinterval;
  
         timekeeper.clock = clock;
         clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
         /* Do the ns -> cycle conversion first, using original mult */
         tmp = NTP_INTERVAL_LENGTH;
         tmp <<= clock->shift;
+       ntpinterval = tmp;
         tmp += clock->mult/2;
         do_div(tmp, clock->mult);
         if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
  
         /* Go back from cycles -> shifted ns */
         timekeeper.xtime_interval = (u64) interval * clock->mult;
+       timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
         timekeeper.raw_interval =
                 ((u64) interval * clock->mult) >> clock->shift;
  
@@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
  
         /* Accumulate error between NTP and clock interval */
         timekeeper.ntp_error += tick_length << shift;
-       timekeeper.ntp_error -= timekeeper.xtime_interval <<
+       timekeeper.ntp_error -=
+           (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
                                 (timekeeper.ntp_error_shift + shift);
  
         return offset;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c

index ab8f5e33fa92c76db813d1419e6a339f3a7aca52..32a19f9397fc347c3144a01e142308c026f49c70 100644 (file)
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
  {
         struct hrtimer *timer, tmp;
         unsigned long next = 0, i;
-       struct rb_node *curr;
+       struct timerqueue_node *curr;
         unsigned long flags;
  
  next_one:
         i = 0;
         raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
  
-       curr = base->first;
+       curr = timerqueue_getnext(&base->active);
         /*
          * Crude but we have to do this O(N*N) thing, because
          * we have to unlock the base when printing:
          */
         while (curr && i < next) {
-               curr = rb_next(curr);
+               curr = timerqueue_iterate_next(curr);
                 i++;
         }
  
         if (curr) {
  
-               timer = rb_entry(curr, struct hrtimer, node);
+               timer = container_of(curr, struct hrtimer, node);
                 tmp = *timer;
                 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
  
diff --git a/kernel/timer.c b/kernel/timer.c

index 353b9227c2ecfe11793a17b0a41f534ebdbd14f8..43ca9936f2d06a2cba572f1c877406f9281e68a3 100644 (file)
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases;
  EXPORT_SYMBOL(boot_tvec_bases);
  static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
  
-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG          (0x1)
-
  /* Functions below help us manage 'deferrable' flag */
  static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
  {
@@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
  
  static inline void timer_set_deferrable(struct timer_list *timer)
  {
-       timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
-                                      TBASE_DEFERRABLE_FLAG));
+       timer->base = TBASE_MAKE_DEFERRED(timer->base);
  }
  
  static inline void
@@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
  }
  EXPORT_SYMBOL_GPL(set_timer_slack);
  
-
-static inline void set_running_timer(struct tvec_base *base,
-                                       struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
-       base->running_timer = timer;
-#endif
-}
-
  static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
  {
         unsigned long expires = timer->expires;
@@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer)
  }
  EXPORT_SYMBOL(del_timer);
  
-#ifdef CONFIG_SMP
  /**
   * try_to_del_timer_sync - Try to deactivate a timer
   * @timer: timer do del
   *
   * This function tries to deactivate a timer. Upon successful (ret >= 0)
   * exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
   */
  int try_to_del_timer_sync(struct timer_list *timer)
  {
@@ -973,6 +948,7 @@ out:
  }
  EXPORT_SYMBOL(try_to_del_timer_sync);
  
+#ifdef CONFIG_SMP
  /**
   * del_timer_sync - deactivate a timer and wait for the handler to finish.
   * @timer: the timer to be deactivated
@@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
   *
   * Synchronization rules: Callers must prevent restarting of the timer,
   * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
+ * hardirq contexts. The caller must not hold locks which would prevent
   * completion of the timer's handler. The timer's handler must not call
   * add_timer_on(). Upon exit the timer is not queued and the handler is
   * not running on any CPU.
@@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  int del_timer_sync(struct timer_list *timer)
  {
  #ifdef CONFIG_LOCKDEP
-       unsigned long flags;
-
-       local_irq_save(flags);
+       local_bh_disable();
         lock_map_acquire(&timer->lockdep_map);
         lock_map_release(&timer->lockdep_map);
-       local_irq_restore(flags);
+       local_bh_enable();
  #endif
-
+       /*
+        * don't use it in hardirq context, because it
+        * could lead to deadlock.
+        */
+       WARN_ON(in_irq());
         for (;;) {
                 int ret = try_to_del_timer_sync(timer);
                 if (ret >= 0)
@@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base)
  
                         timer_stats_account_timer(timer);
  
-                       set_running_timer(base, timer);
+                       base->running_timer = timer;
                         detach_timer(timer, 1);
  
                         spin_unlock_irq(&base->lock);
@@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base)
                         spin_lock_irq(&base->lock);
                 }
         }
-       set_running_timer(base, NULL);
+       base->running_timer = NULL;
         spin_unlock_irq(&base->lock);
  }
  
@@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
   */
  unsigned long get_next_timer_interrupt(unsigned long now)
  {
-       struct tvec_base *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __this_cpu_read(tvec_bases);
         unsigned long expires;
  
         /*
@@ -1298,7 +1276,7 @@ void update_process_times(int user_tick)
   */
  static void run_timer_softirq(struct softirq_action *h)
  {
-       struct tvec_base *base = __get_cpu_var(tvec_bases);
+       struct tvec_base *base = __this_cpu_read(tvec_bases);
  
         hrtimer_run_pending();
  
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig

index ea37e2ff416429d04cea3829e835ced7f0f35fc0..14674dce77a6c5a9cb8f17ffe2056f8e3ece1cc2 100644 (file)
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -69,6 +69,21 @@ config EVENT_TRACING
         select CONTEXT_SWITCH_TRACER
         bool
  
+config EVENT_POWER_TRACING_DEPRECATED
+       depends on EVENT_TRACING
+       bool "Deprecated power event trace API, to be removed"
+       default y
+       help
+         Provides old power event types:
+         C-state/idle accounting events:
+         power:power_start
+         power:power_end
+         and old cpufreq accounting event:
+         power:power_frequency
+         This is for userspace compatibility
+         and will vanish after 5 kernel iterations,
+         namely 2.6.41.
+
  config CONTEXT_SWITCH_TRACER
         bool
  
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c

index a22582a061618cea52acee544d9e783ef1d9868e..f55fcf61b223d87e547568512ef2ae89e17107c5 100644 (file)
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,5 +13,8 @@
  #define CREATE_TRACE_POINTS
  #include <trace/events/power.h>
  
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
  
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c

index 39c059ca670e64156e6681782ffa708c6b8d720f..19a359d5e6d58573cc1c74326e488a419b01b342 100644 (file)
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
  /* Count the events in use (per event id, not per instance) */
  static int     total_ref_count;
  
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+                                struct perf_event *p_event)
+{
+       /* No tracing, just counting, so no obvious leak */
+       if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+               return 0;
+
+       /* Some events are ok to be traced by non-root users... */
+       if (p_event->attach_state == PERF_ATTACH_TASK) {
+               if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+                       return 0;
+       }
+
+       /*
+        * ...otherwise raw tracepoint data can be a severe data leak,
+        * only allow root to have these.
+        */
+       if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       return 0;
+}
+
  static int perf_trace_event_init(struct ftrace_event_call *tp_event,
                                  struct perf_event *p_event)
  {
         struct hlist_head __percpu *list;
-       int ret = -ENOMEM;
+       int ret;
         int cpu;
  
+       ret = perf_trace_event_perm(tp_event, p_event);
+       if (ret)
+               return ret;
+
         p_event->tp_event = tp_event;
         if (tp_event->perf_refcount++ > 0)
                 return 0;
  
+       ret = -ENOMEM;
+
         list = alloc_percpu(struct hlist_head);
         if (!list)
                 goto fail;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c

index 0725eeab1937ef24a301f2c0b0404f64ce95e026..35fde09b81dee7f386c111766b1fdbefac2d7414 100644 (file)
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,6 +27,12 @@
  
  DEFINE_MUTEX(event_mutex);
  
+DEFINE_MUTEX(event_storage_mutex);
+EXPORT_SYMBOL_GPL(event_storage_mutex);
+
+char event_storage[EVENT_STORAGE_SIZE];
+EXPORT_SYMBOL_GPL(event_storage);
+
  LIST_HEAD(ftrace_events);
  LIST_HEAD(ftrace_common_fields);
  
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c

index 4ba44deaac259d05fb67d5d31e8ce5371844a414..4b74d71705c0d2be2a9adf67246823584bd34fcd 100644 (file)
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void)   \
  
  #undef __array
  #define __array(type, item, len)                                       \
-       BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
-       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+       do {                                                            \
+               BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                 \
+               mutex_lock(&event_storage_mutex);                       \
+               snprintf(event_storage, sizeof(event_storage),          \
+                        "%s[%d]", #type, len);                         \
+               ret = trace_define_field(event_call, event_storage, #item, \
                                  offsetof(typeof(field), item),         \
                                  sizeof(field.item),                    \
                                  is_signed_type(type), FILTER_OTHER);   \
-       if (ret)                                                        \
-               return ret;
+               mutex_unlock(&event_storage_mutex);                     \
+               if (ret)                                                \
+                       return ret;                                     \
+       } while (0);
  
  #undef __array_desc
  #define __array_desc(type, container, item, len)                       \
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c

index 155a415b3209c0c4e65936be1b593678aea42d27..562c56e048fdbc34b18dded38cd21ccb3cd87f08 100644 (file)
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
  static int trace_wakeup_test_thread(void *data)
  {
         /* Make this a RT thread, doesn't need to be too high */
-       struct sched_param param = { .sched_priority = 5 };
+       static struct sched_param param = { .sched_priority = 5 };
         struct completion *x = data;
  
         sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/user.c b/kernel/user.c

index 2c7d8d5914b188be65c36a686a81fba7eed07d8c..5c598ca781df4bf6f907043ed1d43f98b3b7ef58 100644 (file)
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
                 spin_lock_irq(&uidhash_lock);
                 up = uid_hash_find(uid, hashent);
                 if (up) {
+                       put_user_ns(ns);
                         key_put(new->uid_keyring);
                         key_put(new->session_keyring);
                         kmem_cache_free(uid_cachep, new);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c

index 6e3c41a4024c1cc66be01218e2c37498498f2469..6e7b575ac33cf2dcba3f9dc749f7039e6805a3f0 100644 (file)
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str)
  {
         if (!strncmp(str, "panic", 5))
                 hardlockup_panic = 1;
+       else if (!strncmp(str, "0", 1))
+               no_watchdog = 1;
         return 1;
  }
  __setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
   */
  static int watchdog(void *unused)
  {
-       struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+       static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
         struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
  
         sched_setscheduler(current, SCHED_FIFO, &param);
@@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu)
                 goto out_save;
         }
  
-       printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+       printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
+              cpu, PTR_ERR(event));
         return PTR_ERR(event);
  
         /* success path */
@@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
         .notifier_call = cpu_callback
  };
  
-static int __init spawn_watchdog_task(void)
+void __init lockup_detector_init(void)
  {
         void *cpu = (void *)(long)smp_processor_id();
         int err;
  
         if (no_watchdog)
-               return 0;
+               return;
  
         err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
         WARN_ON(notifier_to_errno(err));
@@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void)
         cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
         register_cpu_notifier(&cpu_nfb);
  
-       return 0;
+       return;
  }
-early_initcall(spawn_watchdog_task);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug

index 28b42b9274d0b5fe47522d9df8158498be12319a..2d05adb984018776610f573de126f14bda4c9d2e 100644 (file)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -173,7 +173,8 @@ config LOCKUP_DETECTOR
           An NMI is generated every 60 seconds or so to check for hardlockups.
  
  config HARDLOCKUP_DETECTOR
-       def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+       def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \
+                !ARCH_HAS_NMI_WATCHDOG
  
  config BOOTPARAM_SOFTLOCKUP_PANIC
         bool "Panic (Reboot) On Soft Lockups"
diff --git a/lib/Makefile b/lib/Makefile

index e6a3763b82126729ecad6636caec9686cd7dec5f..9e2db72d128e6f22ec560ff7f04584777c293dd7 100644 (file)
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
  endif
  
  lib-y := ctype.o string.o vsprintf.o cmdline.o \
-        rbtree.o radix-tree.o dump_stack.o \
+        rbtree.o radix-tree.o dump_stack.o timerqueue.o\
          idr.o int_sqrt.o extable.o prio_tree.o \
          sha1.o irq_regs.o reciprocal_div.o argv_split.o \
          proportions.o prio_heap.o ratelimit.o show_mem.o \
diff --git a/lib/timerqueue.c b/lib/timerqueue.c

new file mode 100644 (file)

index 0000000..e3a1050
--- /dev/null
+++ b/lib/timerqueue.c
@@ -0,0 +1,107 @@
+/*
+ *  Generic Timer-queue
+ *
+ *  Manages a simple queue of timers, ordered by expiration time.
+ *  Uses rbtrees for quick list adds and expiration.
+ *
+ *  NOTE: All of the following functions need to be serialized
+ *  to avoid races. No locking is done by this library code.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/timerqueue.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+/**
+ * timerqueue_add - Adds timer to timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be added
+ *
+ * Adds the timer node to the timerqueue, sorted by the
+ * node's expires value.
+ */
+void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+       struct rb_node **p = &head->head.rb_node;
+       struct rb_node *parent = NULL;
+       struct timerqueue_node  *ptr;
+
+       /* Make sure we don't add nodes that are already added */
+       WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+
+       while (*p) {
+               parent = *p;
+               ptr = rb_entry(parent, struct timerqueue_node, node);
+               if (node->expires.tv64 < ptr->expires.tv64)
+                       p = &(*p)->rb_left;
+               else
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&node->node, parent, p);
+       rb_insert_color(&node->node, &head->head);
+
+       if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+               head->next = node;
+}
+EXPORT_SYMBOL_GPL(timerqueue_add);
+
+/**
+ * timerqueue_del - Removes a timer from the timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be removed
+ *
+ * Removes the timer node from the timerqueue.
+ */
+void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+       WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+
+       /* update next pointer */
+       if (head->next == node) {
+               struct rb_node *rbn = rb_next(&node->node);
+
+               head->next = rbn ?
+                       rb_entry(rbn, struct timerqueue_node, node) : NULL;
+       }
+       rb_erase(&node->node, &head->head);
+       RB_CLEAR_NODE(&node->node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_del);
+
+/**
+ * timerqueue_iterate_next - Returns the timer after the provided timer
+ *
+ * @node: Pointer to a timer.
+ *
+ * Provides the timer that is after the given node. This is used, when
+ * necessary, to iterate through the list of timers in a timer list
+ * without modifying the list.
+ */
+struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
+{
+       struct rb_node *next;
+
+       if (!node)
+               return NULL;
+       next = rb_next(&node->node);
+       if (!next)
+               return NULL;
+       return container_of(next, struct timerqueue_node, node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 7a22b41292115f78dc039ebf5e271090dc5a8f9d..00bb8a64d028f945d50346c2332c6d4b54373621 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1925,19 +1925,18 @@ again:
  
                 rcu_read_lock();
                 p = rcu_dereference(mm->owner);
-               VM_BUG_ON(!p);
                 /*
-                * because we don't have task_lock(), "p" can exit while
-                * we're here. In that case, "mem" can point to root
-                * cgroup but never be NULL. (and task_struct itself is freed
-                * by RCU, cgroup itself is RCU safe.) Then, we have small
-                * risk here to get wrong cgroup. But such kind of mis-account
-                * by race always happens because we don't have cgroup_mutex().
-                * It's overkill and we allow that small race, here.
+                * Because we don't have task_lock(), "p" can exit.
+                * In that case, "mem" can point to root or p can be NULL with
+                * race with swapoff. Then we have small risk of mis-accounting.
+                * But such kind of mis-account by race always happens because
+                * we don't have cgroup_mutex(). It's overkill and we allow that
+                * small race, here.
+                * (*) swapoff et al. will charge against mm-struct not against
+                * task-struct. So, mm->owner can be NULL.
                  */
                 mem = mem_cgroup_from_task(p);
-               VM_BUG_ON(!mem);
-               if (mem_cgroup_is_root(mem)) {
+               if (!mem || mem_cgroup_is_root(mem)) {
                         rcu_read_unlock();
                         goto done;
                 }
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c

index f19e347f56f6f41543b33f174af38c997739294e..543b3262d002cdca0213348b4878c364622394b1 100644 (file)
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1430,7 +1430,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
                                  struct net_bridge_port *port,
                                  struct sk_buff *skb)
  {
-       struct sk_buff *skb2 = skb;
+       struct sk_buff *skb2;
         struct ipv6hdr *ip6h;
         struct icmp6hdr *icmp6h;
         u8 nexthdr;
@@ -1469,15 +1469,15 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
         if (!skb2)
                 return -ENOMEM;
  
+       err = -EINVAL;
+       if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
+               goto out;
+
         len -= offset - skb_network_offset(skb2);
  
         __skb_pull(skb2, offset);
         skb_reset_transport_header(skb2);
  
-       err = -EINVAL;
-       if (!pskb_may_pull(skb2, sizeof(*icmp6h)))
-               goto out;
-
         icmp6h = icmp6_hdr(skb2);
  
         switch (icmp6h->icmp6_type) {
@@ -1516,7 +1516,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
         switch (icmp6h->icmp6_type) {
         case ICMPV6_MGM_REPORT:
             {
-               struct mld_msg *mld = (struct mld_msg *)icmp6h;
+               struct mld_msg *mld;
+               if (!pskb_may_pull(skb2, sizeof(*mld))) {
+                       err = -EINVAL;
+                       goto out;
+               }
+               mld = (struct mld_msg *)skb_transport_header(skb2);
                 BR_INPUT_SKB_CB(skb2)->mrouters_only = 1;
                 err = br_ip6_multicast_add_group(br, port, &mld->mld_mca);
                 break;
@@ -1529,15 +1534,18 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
                 break;
         case ICMPV6_MGM_REDUCTION:
             {
-               struct mld_msg *mld = (struct mld_msg *)icmp6h;
+               struct mld_msg *mld;
+               if (!pskb_may_pull(skb2, sizeof(*mld))) {
+                       err = -EINVAL;
+                       goto out;
+               }
+               mld = (struct mld_msg *)skb_transport_header(skb2);
                 br_ip6_multicast_leave_group(br, port, &mld->mld_mca);
             }
         }
  
  out:
-       __skb_push(skb2, offset);
-       if (skb2 != skb)
-               kfree_skb(skb2);
+       kfree_skb(skb2);
         return err;
  }
  #endif
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c

index 35cf27087b561d6e9955fd75b4b03213a6e9e8d8..e3d7aefa91811d8945d9283d5de7ede9b168da34 100644 (file)
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -50,6 +50,8 @@ static void br_send_bpdu(struct net_bridge_port *p,
  
         llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr);
  
+       skb_reset_mac_header(skb);
+
         NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
                 dev_queue_xmit);
  }
diff --git a/net/can/bcm.c b/net/can/bcm.c

index 6faa8256e10ca22d6fb0b2005d74c5c5580a404d..9d5e8accfab1d73f25e8a00ae45523a400b820ce 100644 (file)
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -125,7 +125,7 @@ struct bcm_sock {
         struct list_head tx_ops;
         unsigned long dropped_usr_msgs;
         struct proc_dir_entry *bcm_proc_read;
-       char procname [20]; /* pointer printed in ASCII with \0 */
+       char procname [32]; /* inode number in decimal with \0 */
  };
  
  static inline struct bcm_sock *bcm_sk(const struct sock *sk)
@@ -1521,7 +1521,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
  
         if (proc_dir) {
                 /* unique socket address as filename */
-               sprintf(bo->procname, "%p", sock);
+               sprintf(bo->procname, "%lu", sock_i_ino(sk));
                 bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
                                                      proc_dir,
                                                      &bcm_proc_fops, sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c

index df948b0f1ac97c0e1d436690c6bb49f4c9fd056d..93bfd95584f4656605eb963919de2d74b7c729f0 100644 (file)
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2649,8 +2649,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
         }
  
         if (res.type == RTN_LOCAL) {
-               if (!fl.fl4_src)
-                       fl.fl4_src = fl.fl4_dst;
+               if (!fl.fl4_src) {
+                       if (res.fi->fib_prefsrc)
+                               fl.fl4_src = res.fi->fib_prefsrc;
+                       else
+                               fl.fl4_src = fl.fl4_dst;
+               }
                 dev_out = net->loopback_dev;
                 fl.oif = dev_out->ifindex;
                 res.fi = NULL;
diff --git a/scripts/Makefile.build b/scripts/Makefile.build

index 5ad25e17b6cb2782a2101b59ad7cfd442a8af2ea..4eb99ab34053769f5b2b644594427b2bdc108c82 100644 (file)
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -214,17 +214,22 @@ ifdef BUILD_C_RECORDMCOUNT
  # The empty.o file is created in the make process in order to determine
  #  the target endianness and word size. It is made before all other C
  #  files, including recordmcount.
-cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then                 \
-                       $(objtree)/scripts/recordmcount "$(@)";                 \
-                   fi;
+sub_cmd_record_mcount =                                        \
+       if [ $(@) != "scripts/mod/empty.o" ]; then      \
+               $(objtree)/scripts/recordmcount "$(@)"; \
+       fi;
  else
-cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
+sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
         "$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \
         "$(if $(CONFIG_64BIT),64,32)" \
         "$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \
         "$(LD)" "$(NM)" "$(RM)" "$(MV)" \
         "$(if $(part-of-module),1,0)" "$(@)";
  endif
+cmd_record_mcount =                                            \
+       if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then    \
+               $(sub_cmd_record_mcount)                        \
+       fi;
  endif
  
  define rule_cc_o_c
diff --git a/scripts/kconfig/menu.c b/scripts/kconfig/menu.c

index b9d9aa18e6d62bbaa40c34cf7062c00c8bb15e35..5f77dcb8977e0632b5875167d5a652ef62f4be1c 100644 (file)
--- a/scripts/kconfig/menu.c
+++ b/scripts/kconfig/menu.c
@@ -140,6 +140,20 @@ struct property *menu_add_prop(enum prop_type type, char *prompt, struct expr *e
                 }
                 if (current_entry->prompt && current_entry != &rootmenu)
                         prop_warn(prop, "prompt redefined");
+
+               /* Apply all upper menus' visibilities to actual prompts. */
+               if(type == P_PROMPT) {
+                       struct menu *menu = current_entry;
+
+                       while ((menu = menu->parent) != NULL) {
+                               if (!menu->visibility)
+                                       continue;
+                               prop->visible.expr
+                                       = expr_alloc_and(prop->visible.expr,
+                                                        menu->visibility);
+                       }
+               }
+
                 current_entry->prompt = prop;
         }
         prop->text = prompt;
diff --git a/scripts/kernel-doc b/scripts/kernel-doc

index 39580a5dc5df6083a5766ae8853c6610e417474f..9f85012acf0d2fb749a4ded8bb962d514d524c03 100755 (executable)
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -155,6 +155,8 @@ use strict;
  # '@parameter' - name of a parameter
  # '%CONST' - name of a constant.
  
+## init lots of data
+
  my $errors = 0;
  my $warnings = 0;
  my $anon_struct_union = 0;
@@ -218,21 +220,14 @@ my %highlights_list = ( $type_constant, "\$1",
                         $type_param, "\$1" );
  my $blankline_list = "";
  
-sub usage {
-    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
-    print "         [ -no-doc-sections ]\n";
-    print "         [ -function funcname [ -function funcname ...] ]\n";
-    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
-    print "         c source file(s) > outputfile\n";
-    print "         -v : verbose output, more warnings & other info listed\n";
-    exit 1;
-}
-
  # read arguments
  if ($#ARGV == -1) {
      usage();
  }
  
+my $kernelversion;
+my $dohighlight = "";
+
  my $verbose = 0;
  my $output_mode = "man";
  my $no_doc_sections = 0;
@@ -245,7 +240,7 @@ my $man_date = ('January', 'February', 'March', 'April', 'May', 'June',
                 'November', 'December')[(localtime)[4]] .
    " " . ((localtime)[5]+1900);
  
-# Essentially these are globals
+# Essentially these are globals.
  # They probably want to be tidied up, made more localised or something.
  # CAVEAT EMPTOR!  Some of the others I localised may not want to be, which
  # could cause "use of undefined value" or other bugs.
@@ -353,6 +348,18 @@ while ($ARGV[0] =~ m/^-(.*)/) {
      }
  }
  
+# continue execution near EOF;
+
+sub usage {
+    print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+    print "         [ -no-doc-sections ]\n";
+    print "         [ -function funcname [ -function funcname ...] ]\n";
+    print "         [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+    print "         c source file(s) > outputfile\n";
+    print "         -v : verbose output, more warnings & other info listed\n";
+    exit 1;
+}
+
  # get kernel version from env
  sub get_kernel_version() {
      my $version = 'unknown kernel version';
@@ -362,15 +369,6 @@ sub get_kernel_version() {
      }
      return $version;
  }
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
-    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
  
  ##
  # dumps section contents to arrays/hashes intended for that purpose.
@@ -1851,34 +1849,6 @@ sub dump_function($$) {
                        });
  }
  
-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
-       my ($relname, $absname);
-       while(<SOURCE_MAP>) {
-               chop();
-               ($relname, $absname) = (split())[0..1];
-               $relname =~ s:^/+::;
-               $source_map{$relname} = $absname;
-       }
-       close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
-    chomp;
-    process_file($_);
-}
-if ($verbose && $errors) {
-  print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
-  print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
  sub reset_state {
      $function = "";
      %constants = ();
@@ -2285,3 +2255,39 @@ sub process_file($) {
         }
      }
  }
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+#   print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+    $dohighlight .=  "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+       my ($relname, $absname);
+       while(<SOURCE_MAP>) {
+               chop();
+               ($relname, $absname) = (split())[0..1];
+               $relname =~ s:^/+::;
+               $source_map{$relname} = $absname;
+       }
+       close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+    chomp;
+    process_file($_);
+}
+if ($verbose && $errors) {
+  print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+  print STDERR "$warnings warnings\n";
+}
+
+exit($errors);
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c

index aef8c0a923ab8e2276f97e1fe2aa61b7748e25e4..d661afbe474c2fce8f74907869be0a644c5789c9 100644 (file)
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -253,6 +253,8 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry,
         result = security_filter_rule_init(entry->lsm[lsm_rule].type,
                                            Audit_equal, args,
                                            &entry->lsm[lsm_rule].rule);
+       if (!entry->lsm[lsm_rule].rule)
+               return -EINVAL;
         return result;
  }
  
diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c

index 46c0d03dbecced68ca318dbbe20df52d1da4ab0b..fcb14a09982262d473842aeb55f365b031ebeb70 100644 (file)
--- a/sound/oss/soundcard.c
+++ b/sound/oss/soundcard.c
@@ -87,7 +87,7 @@ int *load_mixer_volumes(char *name, int *levels, int present)
         int             i, n;
  
         for (i = 0; i < num_mixer_volumes; i++) {
-               if (strcmp(name, mixer_vols[i].name) == 0) {
+               if (strncmp(name, mixer_vols[i].name, 32) == 0) {
                         if (present)
                                 mixer_vols[i].num = i;
                         return mixer_vols[i].levels;
@@ -99,7 +99,7 @@ int *load_mixer_volumes(char *name, int *levels, int present)
         }
         n = num_mixer_volumes++;
  
-       strcpy(mixer_vols[n].name, name);
+       strncpy(mixer_vols[n].name, name, 32);
  
         if (present)
                 mixer_vols[n].num = n;
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c

index b030c8eba21fdc618fe0b5bfb32cf5b156c39bba..a1c4008af89185449567331db04014a31b87007c 100644 (file)
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2300,6 +2300,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = {
         SND_PCI_QUIRK(0x1028, 0x01cc, "Dell D820", POS_FIX_LPIB),
         SND_PCI_QUIRK(0x1028, 0x01de, "Dell Precision 390", POS_FIX_LPIB),
         SND_PCI_QUIRK(0x1028, 0x01f6, "Dell Latitude 131L", POS_FIX_LPIB),
+       SND_PCI_QUIRK(0x1028, 0x0470, "Dell Inspiron 1120", POS_FIX_LPIB),
         SND_PCI_QUIRK(0x103c, 0x306d, "HP dv3", POS_FIX_LPIB),
         SND_PCI_QUIRK(0x1043, 0x813d, "ASUS P5AD2", POS_FIX_LPIB),
         SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB),
diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c

index d63e28773eb1841ac7bdd06af0d37a0d5c03013b..6447dbb2f1238f1288c6ed6a797757a08ddd69fa 100644 (file)
--- a/sound/soc/codecs/max98088.c
+++ b/sound/soc/codecs/max98088.c
@@ -40,7 +40,6 @@ struct max98088_cdata {
  };
  
  struct max98088_priv {
-       u8 reg_cache[M98088_REG_CNT];
         enum max98088_type devtype;
         void *control_data;
         struct max98088_pdata *pdata;
@@ -1588,7 +1587,7 @@ static int max98088_dai2_set_fmt(struct snd_soc_dai *codec_dai,
  
  static void max98088_sync_cache(struct snd_soc_codec *codec)
  {
-       struct max98088_priv *max98088 = snd_soc_codec_get_drvdata(codec);
+       u8 *reg_cache = codec->reg_cache;
         int i;
  
         if (!codec->cache_sync)
@@ -1599,14 +1598,14 @@ static void max98088_sync_cache(struct snd_soc_codec *codec)
         /* write back cached values if they're writeable and
          * different from the hardware default.
          */
-       for (i = 1; i < ARRAY_SIZE(max98088->reg_cache); i++) {
+       for (i = 1; i < codec->driver->reg_cache_size; i++) {
                 if (!max98088_access[i].writable)
                         continue;
  
-               if (max98088->reg_cache[i] == max98088_reg[i])
+               if (reg_cache[i] == max98088_reg[i])
                         continue;
  
-               snd_soc_write(codec, i, max98088->reg_cache[i]);
+               snd_soc_write(codec, i, reg_cache[i]);
         }
  
         codec->cache_sync = 0;
@@ -1951,7 +1950,6 @@ static int max98088_probe(struct snd_soc_codec *codec)
         int ret = 0;
  
         codec->cache_sync = 1;
-       memcpy(codec->reg_cache, max98088_reg, sizeof(max98088_reg));
  
         ret = snd_soc_codec_set_cache_io(codec, 8, 8, SND_SOC_I2C);
         if (ret != 0) {
diff --git a/sound/soc/codecs/wm8523.c b/sound/soc/codecs/wm8523.c

index 9a433a5396cb781727e39a4c1c6005f1bbc9a421..deca79ea2b4b98ab6d3232c9cb3c1b4c3e6858dd 100644 (file)
--- a/sound/soc/codecs/wm8523.c
+++ b/sound/soc/codecs/wm8523.c
@@ -41,7 +41,6 @@ static const char *wm8523_supply_names[WM8523_NUM_SUPPLIES] = {
  /* codec private data */
  struct wm8523_priv {
         enum snd_soc_control_type control_type;
-       u16 reg_cache[WM8523_REGISTER_COUNT];
         struct regulator_bulk_data supplies[WM8523_NUM_SUPPLIES];
         unsigned int sysclk;
         unsigned int rate_constraint_list[WM8523_NUM_RATES];
@@ -314,6 +313,7 @@ static int wm8523_set_bias_level(struct snd_soc_codec *codec,
                                  enum snd_soc_bias_level level)
  {
         struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int ret, i;
  
         switch (level) {
@@ -344,7 +344,7 @@ static int wm8523_set_bias_level(struct snd_soc_codec *codec,
                         /* Sync back default/cached values */
                         for (i = WM8523_AIF_CTRL1;
                              i < WM8523_MAX_REGISTER; i++)
-                               snd_soc_write(codec, i, wm8523->reg_cache[i]);
+                               snd_soc_write(codec, i, reg_cache[i]);
  
  
                         msleep(100);
@@ -414,6 +414,7 @@ static int wm8523_resume(struct snd_soc_codec *codec)
  static int wm8523_probe(struct snd_soc_codec *codec)
  {
         struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int ret, i;
  
         codec->hw_write = (hw_write_t)i2c_master_send;
@@ -470,8 +471,8 @@ static int wm8523_probe(struct snd_soc_codec *codec)
         }
  
         /* Change some default settings - latch VU and enable ZC */
-       wm8523->reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU;
-       wm8523->reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC;
+       reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU;
+       reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC;
  
         wm8523_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
  
diff --git a/sound/soc/codecs/wm8741.c b/sound/soc/codecs/wm8741.c

index 90e31e9aa6f7c66346c8360a3c1ab900519e4a98..aea60ef8aba73da14e7cd17f62b72955adfd645a 100644 (file)
--- a/sound/soc/codecs/wm8741.c
+++ b/sound/soc/codecs/wm8741.c
@@ -41,7 +41,6 @@ static const char *wm8741_supply_names[WM8741_NUM_SUPPLIES] = {
  /* codec private data */
  struct wm8741_priv {
         enum snd_soc_control_type control_type;
-       u16 reg_cache[WM8741_REGISTER_COUNT];
         struct regulator_bulk_data supplies[WM8741_NUM_SUPPLIES];
         unsigned int sysclk;
         struct snd_pcm_hw_constraint_list *sysclk_constraints;
@@ -422,6 +421,7 @@ static int wm8741_resume(struct snd_soc_codec *codec)
  static int wm8741_probe(struct snd_soc_codec *codec)
  {
         struct wm8741_priv *wm8741 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int ret = 0;
  
         ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8741->control_type);
@@ -437,10 +437,10 @@ static int wm8741_probe(struct snd_soc_codec *codec)
         }
  
         /* Change some default settings - latch VU */
-       wm8741->reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL;
-       wm8741->reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM;
-       wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL;
-       wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERM;
+       reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL;
+       reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM;
+       reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL;
+       reg_cache[WM8741_DACRMSB_ATTENUATION] |= WM8741_UPDATERM;
  
         snd_soc_add_controls(codec, wm8741_snd_controls,
                              ARRAY_SIZE(wm8741_snd_controls));
diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c

index 8f679a13f2bcaae23653adaffafff6fedcd8e66f..87caae59e939c78465750f470d5d4ce35ee505d3 100644 (file)
--- a/sound/soc/codecs/wm8753.c
+++ b/sound/soc/codecs/wm8753.c
@@ -65,22 +65,22 @@ static void wm8753_set_dai_mode(struct snd_soc_codec *codec,
   * are using 2 wire for device control, so we cache them instead.
   */
  static const u16 wm8753_reg[] = {
-       0x0008, 0x0000, 0x000a, 0x000a,
-       0x0033, 0x0000, 0x0007, 0x00ff,
-       0x00ff, 0x000f, 0x000f, 0x007b,
-       0x0000, 0x0032, 0x0000, 0x00c3,
-       0x00c3, 0x00c0, 0x0000, 0x0000,
+       0x0000, 0x0008, 0x0000, 0x000a,
+       0x000a, 0x0033, 0x0000, 0x0007,
+       0x00ff, 0x00ff, 0x000f, 0x000f,
+       0x007b, 0x0000, 0x0032, 0x0000,
+       0x00c3, 0x00c3, 0x00c0, 0x0000,
         0x0000, 0x0000, 0x0000, 0x0000,
         0x0000, 0x0000, 0x0000, 0x0000,
-       0x0000, 0x0000, 0x0000, 0x0055,
-       0x0005, 0x0050, 0x0055, 0x0050,
-       0x0055, 0x0050, 0x0055, 0x0079,
-       0x0079, 0x0079, 0x0079, 0x0079,
         0x0000, 0x0000, 0x0000, 0x0000,
-       0x0097, 0x0097, 0x0000, 0x0004,
-       0x0000, 0x0083, 0x0024, 0x01ba,
-       0x0000, 0x0083, 0x0024, 0x01ba,
-       0x0000, 0x0000, 0x0000
+       0x0055, 0x0005, 0x0050, 0x0055,
+       0x0050, 0x0055, 0x0050, 0x0055,
+       0x0079, 0x0079, 0x0079, 0x0079,
+       0x0079, 0x0000, 0x0000, 0x0000,
+       0x0000, 0x0097, 0x0097, 0x0000,
+       0x0004, 0x0000, 0x0083, 0x0024,
+       0x01ba, 0x0000, 0x0083, 0x0024,
+       0x01ba, 0x0000, 0x0000, 0x0000
  };
  
  /* codec private data */
@@ -88,57 +88,10 @@ struct wm8753_priv {
         enum snd_soc_control_type control_type;
         unsigned int sysclk;
         unsigned int pcmclk;
-       u16 reg_cache[ARRAY_SIZE(wm8753_reg)];
         int dai_func;
  };
  
-/*
- * read wm8753 register cache
- */
-static inline unsigned int wm8753_read_reg_cache(struct snd_soc_codec *codec,
-       unsigned int reg)
-{
-       u16 *cache = codec->reg_cache;
-       if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1))
-               return -1;
-       return cache[reg - 1];
-}
-
-/*
- * write wm8753 register cache
- */
-static inline void wm8753_write_reg_cache(struct snd_soc_codec *codec,
-       unsigned int reg, unsigned int value)
-{
-       u16 *cache = codec->reg_cache;
-       if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1))
-               return;
-       cache[reg - 1] = value;
-}
-
-/*
- * write to the WM8753 register space
- */
-static int wm8753_write(struct snd_soc_codec *codec, unsigned int reg,
-       unsigned int value)
-{
-       u8 data[2];
-
-       /* data is
-        *   D15..D9 WM8753 register offset
-        *   D8...D0 register data
-        */
-       data[0] = (reg << 1) | ((value >> 8) & 0x0001);
-       data[1] = value & 0x00ff;
-
-       wm8753_write_reg_cache(codec, reg, value);
-       if (codec->hw_write(codec->control_data, data, 2) == 2)
-               return 0;
-       else
-               return -EIO;
-}
-
-#define wm8753_reset(c) wm8753_write(c, WM8753_RESET, 0)
+#define wm8753_reset(c) snd_soc_write(c, WM8753_RESET, 0)
  
  /*
   * WM8753 Controls
@@ -218,7 +171,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
         struct snd_soc_codec *codec =  snd_kcontrol_chip(kcontrol);
-       int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL);
+       int mode = snd_soc_read(codec, WM8753_IOCTL);
  
         ucontrol->value.integer.value[0] = (mode & 0xc) >> 2;
         return 0;
@@ -228,7 +181,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol,
         struct snd_ctl_elem_value *ucontrol)
  {
         struct snd_soc_codec *codec =  snd_kcontrol_chip(kcontrol);
-       int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL);
+       int mode = snd_soc_read(codec, WM8753_IOCTL);
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
  
         if (((mode & 0xc) >> 2) == ucontrol->value.integer.value[0])
@@ -738,17 +691,17 @@ static int wm8753_set_dai_pll(struct snd_soc_dai *codec_dai, int pll_id,
         if (pll_id == WM8753_PLL1) {
                 offset = 0;
                 enable = 0x10;
-               reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xffef;
+               reg = snd_soc_read(codec, WM8753_CLOCK) & 0xffef;
         } else {
                 offset = 4;
                 enable = 0x8;
-               reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfff7;
+               reg = snd_soc_read(codec, WM8753_CLOCK) & 0xfff7;
         }
  
         if (!freq_in || !freq_out) {
                 /* disable PLL  */
-               wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0026);
-               wm8753_write(codec, WM8753_CLOCK, reg);
+               snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0026);
+               snd_soc_write(codec, WM8753_CLOCK, reg);
                 return 0;
         } else {
                 u16 value = 0;
@@ -759,20 +712,20 @@ static int wm8753_set_dai_pll(struct snd_soc_dai *codec_dai, int pll_id,
                 /* set up N and K PLL divisor ratios */
                 /* bits 8:5 = PLL_N, bits 3:0 = PLL_K[21:18] */
                 value = (pll_div.n << 5) + ((pll_div.k & 0x3c0000) >> 18);
-               wm8753_write(codec, WM8753_PLL1CTL2 + offset, value);
+               snd_soc_write(codec, WM8753_PLL1CTL2 + offset, value);
  
                 /* bits 8:0 = PLL_K[17:9] */
                 value = (pll_div.k & 0x03fe00) >> 9;
-               wm8753_write(codec, WM8753_PLL1CTL3 + offset, value);
+               snd_soc_write(codec, WM8753_PLL1CTL3 + offset, value);
  
                 /* bits 8:0 = PLL_K[8:0] */
                 value = pll_div.k & 0x0001ff;
-               wm8753_write(codec, WM8753_PLL1CTL4 + offset, value);
+               snd_soc_write(codec, WM8753_PLL1CTL4 + offset, value);
  
                 /* set PLL as input and enable */
-               wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 |
+               snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 |
                         (pll_div.div2 << 3));
-               wm8753_write(codec, WM8753_CLOCK, reg | enable);
+               snd_soc_write(codec, WM8753_CLOCK, reg | enable);
         }
         return 0;
  }
@@ -879,7 +832,7 @@ static int wm8753_vdac_adc_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 unsigned int fmt)
  {
         struct snd_soc_codec *codec = codec_dai->codec;
-       u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01ec;
+       u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01ec;
  
         /* interface format */
         switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
@@ -901,7 +854,7 @@ static int wm8753_vdac_adc_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 return -EINVAL;
         }
  
-       wm8753_write(codec, WM8753_PCM, voice);
+       snd_soc_write(codec, WM8753_PCM, voice);
         return 0;
  }
  
@@ -922,8 +875,8 @@ static int wm8753_pcm_hw_params(struct snd_pcm_substream *substream,
         struct snd_soc_pcm_runtime *rtd = substream->private_data;
         struct snd_soc_codec *codec = rtd->codec;
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
-       u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01f3;
-       u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x017f;
+       u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01f3;
+       u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x017f;
  
         /* bit size */
         switch (params_format(params)) {
@@ -943,9 +896,9 @@ static int wm8753_pcm_hw_params(struct snd_pcm_substream *substream,
         /* sample rate */
         if (params_rate(params) * 384 == wm8753->pcmclk)
                 srate |= 0x80;
-       wm8753_write(codec, WM8753_SRATE1, srate);
+       snd_soc_write(codec, WM8753_SRATE1, srate);
  
-       wm8753_write(codec, WM8753_PCM, voice);
+       snd_soc_write(codec, WM8753_PCM, voice);
         return 0;
  }
  
@@ -958,8 +911,8 @@ static int wm8753_pcm_set_dai_fmt(struct snd_soc_dai *codec_dai,
         struct snd_soc_codec *codec = codec_dai->codec;
         u16 voice, ioctl;
  
-       voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x011f;
-       ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x015d;
+       voice = snd_soc_read(codec, WM8753_PCM) & 0x011f;
+       ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x015d;
  
         /* set master/slave audio interface */
         switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
@@ -1013,8 +966,8 @@ static int wm8753_pcm_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 return -EINVAL;
         }
  
-       wm8753_write(codec, WM8753_PCM, voice);
-       wm8753_write(codec, WM8753_IOCTL, ioctl);
+       snd_soc_write(codec, WM8753_PCM, voice);
+       snd_soc_write(codec, WM8753_IOCTL, ioctl);
         return 0;
  }
  
@@ -1026,16 +979,16 @@ static int wm8753_set_dai_clkdiv(struct snd_soc_dai *codec_dai,
  
         switch (div_id) {
         case WM8753_PCMDIV:
-               reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0x003f;
-               wm8753_write(codec, WM8753_CLOCK, reg | div);
+               reg = snd_soc_read(codec, WM8753_CLOCK) & 0x003f;
+               snd_soc_write(codec, WM8753_CLOCK, reg | div);
                 break;
         case WM8753_BCLKDIV:
-               reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x01c7;
-               wm8753_write(codec, WM8753_SRATE2, reg | div);
+               reg = snd_soc_read(codec, WM8753_SRATE2) & 0x01c7;
+               snd_soc_write(codec, WM8753_SRATE2, reg | div);
                 break;
         case WM8753_VXCLKDIV:
-               reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x003f;
-               wm8753_write(codec, WM8753_SRATE2, reg | div);
+               reg = snd_soc_read(codec, WM8753_SRATE2) & 0x003f;
+               snd_soc_write(codec, WM8753_SRATE2, reg | div);
                 break;
         default:
                 return -EINVAL;
@@ -1050,7 +1003,7 @@ static int wm8753_hdac_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 unsigned int fmt)
  {
         struct snd_soc_codec *codec = codec_dai->codec;
-       u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01e0;
+       u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01e0;
  
         /* interface format */
         switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
@@ -1072,7 +1025,7 @@ static int wm8753_hdac_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 return -EINVAL;
         }
  
-       wm8753_write(codec, WM8753_HIFI, hifi);
+       snd_soc_write(codec, WM8753_HIFI, hifi);
         return 0;
  }
  
@@ -1085,8 +1038,8 @@ static int wm8753_i2s_set_dai_fmt(struct snd_soc_dai *codec_dai,
         struct snd_soc_codec *codec = codec_dai->codec;
         u16 ioctl, hifi;
  
-       hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x011f;
-       ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x00ae;
+       hifi = snd_soc_read(codec, WM8753_HIFI) & 0x011f;
+       ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x00ae;
  
         /* set master/slave audio interface */
         switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
@@ -1140,8 +1093,8 @@ static int wm8753_i2s_set_dai_fmt(struct snd_soc_dai *codec_dai,
                 return -EINVAL;
         }
  
-       wm8753_write(codec, WM8753_HIFI, hifi);
-       wm8753_write(codec, WM8753_IOCTL, ioctl);
+       snd_soc_write(codec, WM8753_HIFI, hifi);
+       snd_soc_write(codec, WM8753_IOCTL, ioctl);
         return 0;
  }
  
@@ -1162,8 +1115,8 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream,
         struct snd_soc_pcm_runtime *rtd = substream->private_data;
         struct snd_soc_codec *codec = rtd->codec;
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
-       u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x01c0;
-       u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01f3;
+       u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x01c0;
+       u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01f3;
         int coeff;
  
         /* is digital filter coefficient valid ? */
@@ -1172,7 +1125,7 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream,
                 printk(KERN_ERR "wm8753 invalid MCLK or rate\n");
                 return coeff;
         }
-       wm8753_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) |
+       snd_soc_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) |
                 coeff_div[coeff].usb);
  
         /* bit size */
@@ -1190,7 +1143,7 @@ static int wm8753_i2s_hw_params(struct snd_pcm_substream *substream,
                 break;
         }
  
-       wm8753_write(codec, WM8753_HIFI, hifi);
+       snd_soc_write(codec, WM8753_HIFI, hifi);
         return 0;
  }
  
@@ -1201,8 +1154,8 @@ static int wm8753_mode1v_set_dai_fmt(struct snd_soc_dai *codec_dai,
         u16 clock;
  
         /* set clk source as pcmclk */
-       clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
-       wm8753_write(codec, WM8753_CLOCK, clock);
+       clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+       snd_soc_write(codec, WM8753_CLOCK, clock);
  
         if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0)
                 return -EINVAL;
@@ -1224,8 +1177,8 @@ static int wm8753_mode2_set_dai_fmt(struct snd_soc_dai *codec_dai,
         u16 clock;
  
         /* set clk source as pcmclk */
-       clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
-       wm8753_write(codec, WM8753_CLOCK, clock);
+       clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+       snd_soc_write(codec, WM8753_CLOCK, clock);
  
         if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0)
                 return -EINVAL;
@@ -1239,8 +1192,8 @@ static int wm8753_mode3_4_set_dai_fmt(struct snd_soc_dai *codec_dai,
         u16 clock;
  
         /* set clk source as mclk */
-       clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
-       wm8753_write(codec, WM8753_CLOCK, clock | 0x4);
+       clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+       snd_soc_write(codec, WM8753_CLOCK, clock | 0x4);
  
         if (wm8753_hdac_set_dai_fmt(codec_dai, fmt) < 0)
                 return -EINVAL;
@@ -1252,19 +1205,19 @@ static int wm8753_mode3_4_set_dai_fmt(struct snd_soc_dai *codec_dai,
  static int wm8753_mute(struct snd_soc_dai *dai, int mute)
  {
         struct snd_soc_codec *codec = dai->codec;
-       u16 mute_reg = wm8753_read_reg_cache(codec, WM8753_DAC) & 0xfff7;
+       u16 mute_reg = snd_soc_read(codec, WM8753_DAC) & 0xfff7;
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
  
         /* the digital mute covers the HiFi and Voice DAC's on the WM8753.
          * make sure we check if they are not both active when we mute */
         if (mute && wm8753->dai_func == 1) {
                 if (!codec->active)
-                       wm8753_write(codec, WM8753_DAC, mute_reg | 0x8);
+                       snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8);
         } else {
                 if (mute)
-                       wm8753_write(codec, WM8753_DAC, mute_reg | 0x8);
+                       snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8);
                 else
-                       wm8753_write(codec, WM8753_DAC, mute_reg);
+                       snd_soc_write(codec, WM8753_DAC, mute_reg);
         }
  
         return 0;
@@ -1273,23 +1226,23 @@ static int wm8753_mute(struct snd_soc_dai *dai, int mute)
  static int wm8753_set_bias_level(struct snd_soc_codec *codec,
                                  enum snd_soc_bias_level level)
  {
-       u16 pwr_reg = wm8753_read_reg_cache(codec, WM8753_PWR1) & 0xfe3e;
+       u16 pwr_reg = snd_soc_read(codec, WM8753_PWR1) & 0xfe3e;
  
         switch (level) {
         case SND_SOC_BIAS_ON:
                 /* set vmid to 50k and unmute dac */
-               wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x00c0);
+               snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x00c0);
                 break;
         case SND_SOC_BIAS_PREPARE:
                 /* set vmid to 5k for quick power up */
-               wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x01c1);
+               snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x01c1);
                 break;
         case SND_SOC_BIAS_STANDBY:
                 /* mute dac and set vmid to 500k, enable VREF */
-               wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x0141);
+               snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x0141);
                 break;
         case SND_SOC_BIAS_OFF:
-               wm8753_write(codec, WM8753_PWR1, 0x0001);
+               snd_soc_write(codec, WM8753_PWR1, 0x0001);
                 break;
         }
         codec->bias_level = level;
@@ -1477,7 +1430,7 @@ static void wm8753_set_dai_mode(struct snd_soc_codec *codec,
                 else
                         dai->driver = &wm8753_all_dai[(wm8753->dai_func << 1) + 1];
         }
-       wm8753_write(codec, WM8753_IOCTL, wm8753->dai_func);
+       snd_soc_write(codec, WM8753_IOCTL, wm8753->dai_func);
  }
  
  static void wm8753_work(struct work_struct *work)
@@ -1495,22 +1448,19 @@ static int wm8753_suspend(struct snd_soc_codec *codec, pm_message_t state)
  
  static int wm8753_resume(struct snd_soc_codec *codec)
  {
+       u16 *reg_cache = codec->reg_cache;
         int i;
-       u8 data[2];
-       u16 *cache = codec->reg_cache;
  
         /* Sync reg_cache with the hardware */
-       for (i = 0; i < ARRAY_SIZE(wm8753_reg); i++) {
-               if (i + 1 == WM8753_RESET)
+       for (i = 1; i < ARRAY_SIZE(wm8753_reg); i++) {
+               if (i == WM8753_RESET)
                         continue;
  
                 /* No point in writing hardware default values back */
-               if (cache[i] == wm8753_reg[i])
+               if (reg_cache[i] == wm8753_reg[i])
                         continue;
  
-               data[0] = ((i + 1) << 1) | ((cache[i] >> 8) & 0x0001);
-               data[1] = cache[i] & 0x00ff;
-               codec->hw_write(codec->control_data, data, 2);
+               snd_soc_write(codec, i, reg_cache[i]);
         }
  
         wm8753_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
@@ -1548,7 +1498,7 @@ static int run_delayed_work(struct delayed_work *dwork)
  static int wm8753_probe(struct snd_soc_codec *codec)
  {
         struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
-       int ret = 0, reg;
+       int ret;
  
         INIT_DELAYED_WORK(&codec->delayed_work, wm8753_work);
  
@@ -1573,26 +1523,16 @@ static int wm8753_probe(struct snd_soc_codec *codec)
                               msecs_to_jiffies(caps_charge));
  
         /* set the update bits */
-       reg = wm8753_read_reg_cache(codec, WM8753_LDAC);
-       wm8753_write(codec, WM8753_LDAC, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_RDAC);
-       wm8753_write(codec, WM8753_RDAC, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_LADC);
-       wm8753_write(codec, WM8753_LADC, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_RADC);
-       wm8753_write(codec, WM8753_RADC, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_LOUT1V);
-       wm8753_write(codec, WM8753_LOUT1V, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_ROUT1V);
-       wm8753_write(codec, WM8753_ROUT1V, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_LOUT2V);
-       wm8753_write(codec, WM8753_LOUT2V, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_ROUT2V);
-       wm8753_write(codec, WM8753_ROUT2V, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_LINVOL);
-       wm8753_write(codec, WM8753_LINVOL, reg | 0x0100);
-       reg = wm8753_read_reg_cache(codec, WM8753_RINVOL);
-       wm8753_write(codec, WM8753_RINVOL, reg | 0x0100);
+       snd_soc_update_bits(codec, WM8753_LDAC, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_RDAC, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_LADC, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_RADC, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_LOUT1V, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_ROUT1V, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_LOUT2V, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_ROUT2V, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_LINVOL, 0x0100, 0x0100);
+       snd_soc_update_bits(codec, WM8753_RINVOL, 0x0100, 0x0100);
  
         snd_soc_add_controls(codec, wm8753_snd_controls,
                              ARRAY_SIZE(wm8753_snd_controls));
diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c

index 9001cc48ba1371596c890ce12465e314d9022a81..1ec12eff06205f9023b57c5da7b836d6e81a1f4b 100644 (file)
--- a/sound/soc/codecs/wm8904.c
+++ b/sound/soc/codecs/wm8904.c
@@ -50,8 +50,6 @@ static const char *wm8904_supply_names[WM8904_NUM_SUPPLIES] = {
  /* codec private data */
  struct wm8904_priv {
  
-       u16 reg_cache[WM8904_MAX_REGISTER + 1];
-
         enum wm8904_type devtype;
         void *control_data;
  
@@ -2094,7 +2092,7 @@ static int wm8904_digital_mute(struct snd_soc_dai *codec_dai, int mute)
  
  static void wm8904_sync_cache(struct snd_soc_codec *codec)
  {
-       struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int i;
  
         if (!codec->cache_sync)
@@ -2105,14 +2103,14 @@ static void wm8904_sync_cache(struct snd_soc_codec *codec)
         /* Sync back cached values if they're different from the
          * hardware default.
          */
-       for (i = 1; i < ARRAY_SIZE(wm8904->reg_cache); i++) {
+       for (i = 1; i < codec->driver->reg_cache_size; i++) {
                 if (!wm8904_access[i].writable)
                         continue;
  
-               if (wm8904->reg_cache[i] == wm8904_reg[i])
+               if (reg_cache[i] == wm8904_reg[i])
                         continue;
  
-               snd_soc_write(codec, i, wm8904->reg_cache[i]);
+               snd_soc_write(codec, i, reg_cache[i]);
         }
  
         codec->cache_sync = 0;
@@ -2371,6 +2369,7 @@ static int wm8904_probe(struct snd_soc_codec *codec)
  {
         struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
         struct wm8904_pdata *pdata = wm8904->pdata;
+       u16 *reg_cache = codec->reg_cache;
         int ret, i;
  
         codec->cache_sync = 1;
@@ -2437,19 +2436,19 @@ static int wm8904_probe(struct snd_soc_codec *codec)
         }
  
         /* Change some default settings - latch VU and enable ZC */
-       wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU;
-       wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU;
-       wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU;
-       wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU;
-       wm8904->reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU |
+       reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU;
+       reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU;
+       reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU;
+       reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU;
+       reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU |
                 WM8904_HPOUTLZC;
-       wm8904->reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU |
+       reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU |
                 WM8904_HPOUTRZC;
-       wm8904->reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU |
+       reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU |
                 WM8904_LINEOUTLZC;
-       wm8904->reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU |
+       reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU |
                 WM8904_LINEOUTRZC;
-       wm8904->reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE;
+       reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE;
  
         /* Apply configuration from the platform data. */
         if (wm8904->pdata) {
@@ -2457,23 +2456,23 @@ static int wm8904_probe(struct snd_soc_codec *codec)
                         if (!pdata->gpio_cfg[i])
                                 continue;
  
-                       wm8904->reg_cache[WM8904_GPIO_CONTROL_1 + i]
+                       reg_cache[WM8904_GPIO_CONTROL_1 + i]
                                 = pdata->gpio_cfg[i] & 0xffff;
                 }
  
                 /* Zero is the default value for these anyway */
                 for (i = 0; i < WM8904_MIC_REGS; i++)
-                       wm8904->reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i]
+                       reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i]
                                 = pdata->mic_cfg[i];
         }
  
         /* Set Class W by default - this will be managed by the Class
          * G widget at runtime where bypass paths are available.
          */
-       wm8904->reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR;
+       reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR;
  
         /* Use normal bias source */
-       wm8904->reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL;
+       reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL;
  
         wm8904_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
  
diff --git a/sound/soc/codecs/wm8940.c b/sound/soc/codecs/wm8940.c

index 2cb16f895c4607c9d323ea3cbb9d8e31c4741cb1..23086e2c976abf686c62624da2fd00ff59417594 100644 (file)
--- a/sound/soc/codecs/wm8940.c
+++ b/sound/soc/codecs/wm8940.c
@@ -768,6 +768,7 @@ static __devinit int wm8940_i2c_probe(struct i2c_client *i2c,
  
         i2c_set_clientdata(i2c, wm8940);
         wm8940->control_data = i2c;
+       wm8940->control_type = SND_SOC_I2C;
  
         ret = snd_soc_register_codec(&i2c->dev,
                         &soc_codec_dev_wm8940, &wm8940_dai, 1);
diff --git a/sound/soc/codecs/wm8955.c b/sound/soc/codecs/wm8955.c

index 9cbab8e1de0149cd8b2063406ecbae29ca0c78e5..2ac35b0be86acb37faaabdb1c0f81b321a2bb741 100644 (file)
--- a/sound/soc/codecs/wm8955.c
+++ b/sound/soc/codecs/wm8955.c
@@ -42,8 +42,6 @@ static const char *wm8955_supply_names[WM8955_NUM_SUPPLIES] = {
  struct wm8955_priv {
         enum snd_soc_control_type control_type;
  
-       u16 reg_cache[WM8955_MAX_REGISTER + 1];
-
         unsigned int mclk_rate;
  
         int deemph;
@@ -768,6 +766,7 @@ static int wm8955_set_bias_level(struct snd_soc_codec *codec,
                                  enum snd_soc_bias_level level)
  {
         struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int ret, i;
  
         switch (level) {
@@ -800,14 +799,14 @@ static int wm8955_set_bias_level(struct snd_soc_codec *codec,
                         /* Sync back cached values if they're
                          * different from the hardware default.
                          */
-                       for (i = 0; i < ARRAY_SIZE(wm8955->reg_cache); i++) {
+                       for (i = 0; i < codec->driver->reg_cache_size; i++) {
                                 if (i == WM8955_RESET)
                                         continue;
  
-                               if (wm8955->reg_cache[i] == wm8955_reg[i])
+                               if (reg_cache[i] == wm8955_reg[i])
                                         continue;
  
-                               snd_soc_write(codec, i, wm8955->reg_cache[i]);
+                               snd_soc_write(codec, i, reg_cache[i]);
                         }
  
                         /* Enable VREF and VMID */
@@ -902,6 +901,7 @@ static int wm8955_probe(struct snd_soc_codec *codec)
  {
         struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec);
         struct wm8955_pdata *pdata = dev_get_platdata(codec->dev);
+       u16 *reg_cache = codec->reg_cache;
         int ret, i;
  
         ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8955->control_type);
@@ -934,25 +934,25 @@ static int wm8955_probe(struct snd_soc_codec *codec)
         }
  
         /* Change some default settings - latch VU and enable ZC */
-       wm8955->reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU;
-       wm8955->reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU;
-       wm8955->reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC;
-       wm8955->reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC;
-       wm8955->reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC;
-       wm8955->reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC;
-       wm8955->reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC;
+       reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU;
+       reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU;
+       reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC;
+       reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC;
+       reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC;
+       reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC;
+       reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC;
  
         /* Also enable adaptive bass boost by default */
-       wm8955->reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB;
+       reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB;
  
         /* Set platform data values */
         if (pdata) {
                 if (pdata->out2_speaker)
-                       wm8955->reg_cache[WM8955_ADDITIONAL_CONTROL_2]
+                       reg_cache[WM8955_ADDITIONAL_CONTROL_2]
                                 |= WM8955_ROUT2INV;
  
                 if (pdata->monoin_diff)
-                       wm8955->reg_cache[WM8955_MONO_OUT_MIX_1]
+                       reg_cache[WM8955_MONO_OUT_MIX_1]
                                 |= WM8955_DMEN;
         }
  
@@ -1003,6 +1003,7 @@ static __devinit int wm8955_i2c_probe(struct i2c_client *i2c,
                 return -ENOMEM;
  
         i2c_set_clientdata(i2c, wm8955);
+       wm8955->control_type = SND_SOC_I2C;
  
         ret = snd_soc_register_codec(&i2c->dev,
                         &soc_codec_dev_wm8955, &wm8955_dai, 1);
diff --git a/sound/soc/codecs/wm8960.c b/sound/soc/codecs/wm8960.c

index 21986c42272f07ac693cd21d15f48cff20c43352..ff6ff2f529d2a97f9e2522ceff5df18355cecd5c 100644 (file)
--- a/sound/soc/codecs/wm8960.c
+++ b/sound/soc/codecs/wm8960.c
@@ -1013,6 +1013,7 @@ static __devinit int wm8960_i2c_probe(struct i2c_client *i2c,
                 return -ENOMEM;
  
         i2c_set_clientdata(i2c, wm8960);
+       wm8960->control_type = SND_SOC_I2C;
         wm8960->control_data = i2c;
  
         ret = snd_soc_register_codec(&i2c->dev,
diff --git a/sound/soc/codecs/wm8962.c b/sound/soc/codecs/wm8962.c

index 1304ca91a11c708c567f69c45052c95f1f7be17e..7c421cc837bd3d697969b6f0c0f94f0ecb7abc5f 100644 (file)
--- a/sound/soc/codecs/wm8962.c
+++ b/sound/soc/codecs/wm8962.c
@@ -52,8 +52,6 @@ static const char *wm8962_supply_names[WM8962_NUM_SUPPLIES] = {
  struct wm8962_priv {
         struct snd_soc_codec *codec;
  
-       u16 reg_cache[WM8962_MAX_REGISTER + 1];
-
         int sysclk;
         int sysclk_rate;
  
@@ -1991,8 +1989,7 @@ static int wm8962_put_hp_sw(struct snd_kcontrol *kcontrol,
                             struct snd_ctl_elem_value *ucontrol)
  {
         struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
-       struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
-       u16 *reg_cache = wm8962->reg_cache;
+       u16 *reg_cache = codec->reg_cache;
         int ret;
  
         /* Apply the update (if any) */
@@ -2020,8 +2017,7 @@ static int wm8962_put_spk_sw(struct snd_kcontrol *kcontrol,
                             struct snd_ctl_elem_value *ucontrol)
  {
         struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
-       struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
-       u16 *reg_cache = wm8962->reg_cache;
+       u16 *reg_cache = codec->reg_cache;
         int ret;
  
         /* Apply the update (if any) */
@@ -2329,8 +2325,7 @@ static int out_pga_event(struct snd_soc_dapm_widget *w,
                          struct snd_kcontrol *kcontrol, int event)
  {
         struct snd_soc_codec *codec = w->codec;
-       struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
-       u16 *reg_cache = wm8962->reg_cache;
+       u16 *reg_cache = codec->reg_cache;
         int reg;
  
         switch (w->shift) {
@@ -2719,7 +2714,7 @@ static int wm8962_add_widgets(struct snd_soc_codec *codec)
  
  static void wm8962_sync_cache(struct snd_soc_codec *codec)
  {
-       struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int i;
  
         if (!codec->cache_sync)
@@ -2732,13 +2727,13 @@ static void wm8962_sync_cache(struct snd_soc_codec *codec)
         /* Sync back cached values if they're different from the
          * hardware default.
          */
-       for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) {
+       for (i = 1; i < codec->driver->reg_cache_size; i++) {
                 if (i == WM8962_SOFTWARE_RESET)
                         continue;
-               if (wm8962->reg_cache[i] == wm8962_reg[i])
+               if (reg_cache[i] == wm8962_reg[i])
                         continue;
  
-               snd_soc_write(codec, i, wm8962->reg_cache[i]);
+               snd_soc_write(codec, i, reg_cache[i]);
         }
  
         codec->cache_sync = 0;
@@ -3406,12 +3401,11 @@ EXPORT_SYMBOL_GPL(wm8962_mic_detect);
  #ifdef CONFIG_PM
  static int wm8962_resume(struct snd_soc_codec *codec)
  {
-       struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
         u16 *reg_cache = codec->reg_cache;
         int i;
  
         /* Restore the registers */
-       for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) {
+       for (i = 1; i < codec->driver->reg_cache_size; i++) {
                 switch (i) {
                 case WM8962_SOFTWARE_RESET:
                         continue;
@@ -3705,6 +3699,7 @@ static int wm8962_probe(struct snd_soc_codec *codec)
         struct wm8962_pdata *pdata = dev_get_platdata(codec->dev);
         struct i2c_client *i2c = container_of(codec->dev, struct i2c_client,
                                               dev);
+       u16 *reg_cache = codec->reg_cache;
         int i, trigger, irq_pol;
  
         wm8962->codec = codec;
@@ -3804,7 +3799,7 @@ static int wm8962_probe(struct snd_soc_codec *codec)
  
                 /* Put the speakers into mono mode? */
                 if (pdata->spk_mono)
-                       wm8962->reg_cache[WM8962_CLASS_D_CONTROL_2]
+                       reg_cache[WM8962_CLASS_D_CONTROL_2]
                                 |= WM8962_SPK_MONO;
  
                 /* Micbias setup, detection enable and detection
@@ -3819,16 +3814,16 @@ static int wm8962_probe(struct snd_soc_codec *codec)
         }
  
         /* Latch volume update bits */
-       wm8962->reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU;
-       wm8962->reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU;
-       wm8962->reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU;
-       wm8962->reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU;    
-       wm8962->reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU;
-       wm8962->reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU;
-       wm8962->reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU;
-       wm8962->reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU;
-       wm8962->reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU;
-       wm8962->reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU;
+       reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU;
+       reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU;
+       reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU;
+       reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU;
+       reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU;
+       reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU;
+       reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU;
+       reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU;
+       reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU;
+       reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU;
  
         wm8962_add_widgets(codec);
  
diff --git a/sound/soc/codecs/wm8971.c b/sound/soc/codecs/wm8971.c

index 63f6dbf5d07021887084e57541591bf1fa570fb3..9f18db6e167c0c86924b08e434d671bea98c8ff1 100644 (file)
--- a/sound/soc/codecs/wm8971.c
+++ b/sound/soc/codecs/wm8971.c
@@ -718,6 +718,7 @@ static __devinit int wm8971_i2c_probe(struct i2c_client *i2c,
         if (wm8971 == NULL)
                 return -ENOMEM;
  
+       wm8971->control_type = SND_SOC_I2C;
         i2c_set_clientdata(i2c, wm8971);
  
         ret = snd_soc_register_codec(&i2c->dev,
diff --git a/sound/soc/codecs/wm9081.c b/sound/soc/codecs/wm9081.c

index ecc7c37180c7ad2158f11b9de1579bde332cb44d..a486670966bd7e6a44470ece6918fab24c638c2b 100644 (file)
--- a/sound/soc/codecs/wm9081.c
+++ b/sound/soc/codecs/wm9081.c
@@ -1335,6 +1335,7 @@ static __devinit int wm9081_i2c_probe(struct i2c_client *i2c,
                 return -ENOMEM;
  
         i2c_set_clientdata(i2c, wm9081);
+       wm9081->control_type = SND_SOC_I2C;
         wm9081->control_data = i2c;
  
         ret = snd_soc_register_codec(&i2c->dev,
diff --git a/sound/soc/codecs/wm9090.c b/sound/soc/codecs/wm9090.c

index 99c046ba46bb6ed637d356579a2f4af9d1841bc3..6e5f64f627cb82e6c7379ab389ec0a10e45d9b0a 100644 (file)
--- a/sound/soc/codecs/wm9090.c
+++ b/sound/soc/codecs/wm9090.c
@@ -141,7 +141,6 @@ static const u16 wm9090_reg_defaults[] = {
  /* This struct is used to save the context */
  struct wm9090_priv {
         struct mutex mutex;
-       u16 reg_cache[WM9090_MAX_REGISTER + 1];
         struct wm9090_platform_data pdata;
         void *control_data;
  };
@@ -552,6 +551,7 @@ static int wm9090_set_bias_level(struct snd_soc_codec *codec,
  static int wm9090_probe(struct snd_soc_codec *codec)
  {
         struct wm9090_priv *wm9090 = snd_soc_codec_get_drvdata(codec);
+       u16 *reg_cache = codec->reg_cache;
         int ret;
  
         codec->control_data = wm9090->control_data;
@@ -576,22 +576,22 @@ static int wm9090_probe(struct snd_soc_codec *codec)
         /* Configure some defaults; they will be written out when we
          * bring the bias up.
          */
-       wm9090->reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU
+       reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU
                 | WM9090_IN1A_ZC;
-       wm9090->reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU
+       reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU
                 | WM9090_IN1B_ZC;
-       wm9090->reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU
+       reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU
                 | WM9090_IN2A_ZC;
-       wm9090->reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU
+       reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU
                 | WM9090_IN2B_ZC;
-       wm9090->reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |=
+       reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |=
                 WM9090_SPKOUT_VU | WM9090_SPKOUTL_ZC;
-       wm9090->reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |=
+       reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |=
                 WM9090_HPOUT1_VU | WM9090_HPOUT1L_ZC;
-       wm9090->reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |=
+       reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |=
                 WM9090_HPOUT1_VU | WM9090_HPOUT1R_ZC;
  
-       wm9090->reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA;
+       reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA;
  
         wm9090_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
  
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt

index b2c63309a65165b471822e99268c828bbdb07777..6f5a498608b292241e93dc9c498e8ce5f6a683cc 100644 (file)
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -24,12 +24,47 @@ OPTIONS
  --input=::
          Input file name. (default: perf.data)
  
+-d::
+--dsos=<dso[,dso...]>::
+        Only consider symbols in these dsos.
+-s::
+--symbol=<symbol>::
+        Symbol to annotate.
+
+-f::
+--force::
+        Don't complain, do it.
+
+-v::
+--verbose::
+        Be more verbose. (Show symbol address, etc)
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+-k::
+--vmlinux=<file>::
+        vmlinux pathname.
+
+-m::
+--modules::
+        Load module symbols. WARNING: use only with -k and LIVE kernel.
+
+-l::
+--print-line::
+        Print matching source lines (may be slow).
+
+-P::
+--full-paths::
+        Don't shorten the displayed pathnames.
+
  --stdio:: Use the stdio interface.
  
  --tui:: Use the TUI interface Use of --tui requires a tty, if one is not
         present, as when piping to other commands, the stdio interface is
         used. This interfaces starts by centering on the line with more
-       samples, TAB/UNTAB cycles thru the lines with more samples.
+       samples, TAB/UNTAB cycles through the lines with more samples.
  
  SEE ALSO
  --------
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt

index 01b642c0bf8f974aedf2560df4e91cc5dd514a37..5eaac6f26d51e861236cfa48ab7c0509107b472c 100644 (file)
--- a/tools/perf/Documentation/perf-buildid-list.txt
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -18,6 +18,9 @@ perf report.
  
  OPTIONS
  -------
+-H::
+--with-hits::
+        Show only DSOs with hits.
  -i::
  --input=::
          Input file name. (default: perf.data)
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt

index 20d97d84ea1c37164005a4f38b8af5e23104ef93..74d7481ed7a6916f8797ed67b91b5c5d80382edc 100644 (file)
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -19,6 +19,18 @@ If no parameters are passed it will assume perf.data.old and perf.data.
  
  OPTIONS
  -------
+-M::
+--displacement::
+        Show position displacement relative to baseline.
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
+-m::
+--modules::
+        Load module symbols. WARNING: use only with -k and LIVE kernel.
+
  -d::
  --dsos=::
         Only consider symbols in these dsos. CSV that understands
@@ -42,7 +54,7 @@ OPTIONS
  --field-separator=::
  
         Use a special separator character and don't pad with spaces, replacing
-       all occurances of this separator in symbol names (and other output)
+       all occurrences of this separator in symbol names (and other output)
         with a '.' character, that thus it's the only non valid separator.
  
  -v::
@@ -50,6 +62,13 @@ OPTIONS
         Be verbose, for instance, show the raw counts in addition to the
         diff.
  
+-f::
+--force::
+       Don't complain, do it.
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
  SEE ALSO
  --------
  linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt

index d004e19fe6d6ffcc9c9d8e25822d251d8dda9c6e..dd84cb2f0a8861dd8656b4058ae8ed906f17169c 100644 (file)
--- a/tools/perf/Documentation/perf-kvm.txt
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -22,7 +22,7 @@ There are a couple of variants of perf kvm:
    a performance counter profile of guest os in realtime
    of an arbitrary workload.
  
-  'perf kvm record <command>' to record the performance couinter profile
+  'perf kvm record <command>' to record the performance counter profile
    of an arbitrary workload and save it into a perf data file. If both
    --host and --guest are input, the perf data file name is perf.data.kvm.
    If there is  no --host but --guest, the file name is perf.data.guest.
@@ -40,6 +40,12 @@ There are a couple of variants of perf kvm:
  
  OPTIONS
  -------
+-i::
+--input=::
+        Input file name.
+-o::
+--output::
+        Output file name.
  --host=::
          Collect host side performance profile.
  --guest=::
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt

index b317102138c82f2a78a58ada898888603f8cc802..921de259ea1086f36ce5b7e6a3426cec5618deba 100644 (file)
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -24,6 +24,21 @@ and statistics with this 'perf lock' command.
  
    'perf lock report' reports statistical data.
  
+OPTIONS
+-------
+
+-i::
+--input=<file>::
+        Input file name.
+
+-v::
+--verbose::
+        Be more verbose (show symbol address, etc).
+
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
  SEE ALSO
  --------
  linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt

index 62de1b7f4e760367337042c52760e7a49ded50c7..86b797a35aa6acae540b652345937991257daa4d 100644 (file)
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -115,9 +115,9 @@ Each probe argument follows below syntax.
  
  LINE SYNTAX
  -----------
-Line range is descripted by following syntax.
+Line range is described by following syntax.
  
- "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
+ "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
  
  FUNC specifies the function name of showing lines. 'RLN' is the start line
  number from function entry line, and 'RLN2' is the end line number. As same as
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt

index a91f9f9e6e5c27f96623fd10061f12a3041926b6..52462ae26455c264aa83130c7bc50c9ca97807cc 100644 (file)
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -39,15 +39,24 @@ OPTIONS
            be passed as follows: '\mem:addr[:[r][w][x]]'.
            If you want to profile read-write accesses in 0x1000, just set
            'mem:0x1000:rw'.
+
+--filter=<filter>::
+        Event filter.
+
  -a::
-        System-wide collection.
+--all-cpus::
+        System-wide collection from all CPUs.
  
  -l::
          Scale counter values.
  
  -p::
  --pid=::
-       Record events on existing pid.
+       Record events on existing process ID.
+
+-t::
+--tid=::
+        Record events on existing thread ID.
  
  -r::
  --realtime=::
@@ -99,6 +108,11 @@ OPTIONS
  --data::
         Sample addresses.
  
+-T::
+--timestamp::
+       Sample timestamps. Use it with 'perf report -D' to see the timestamps,
+       for instance.
+
  -n::
  --no-samples::
         Don't sample.
@@ -109,8 +123,8 @@ Collect raw sample records from all opened counters (default for tracepoint coun
  
  -C::
  --cpu::
-Collect samples only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
  In per-thread mode with inheritance mode on (default), samples are captured only when
  the thread executes on the designated CPUs. Default is to monitor all CPUs.
  
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt

index 12052c9ed0babfc3a1c93cc01758ec3b7747ee10..8ba03d6e5398d8387b11f9caf183bed81a0eb5a2 100644 (file)
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -20,6 +20,11 @@ OPTIONS
  -i::
  --input=::
          Input file name. (default: perf.data)
+
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
  -d::
  --dsos=::
         Only consider symbols in these dsos. CSV that understands
@@ -27,6 +32,10 @@ OPTIONS
  -n::
  --show-nr-samples::
         Show the number of samples for each symbol
+
+--showcpuutilization::
+        Show sample percentage for different cpu modes.
+
  -T::
  --threads::
         Show per-thread event counters
@@ -39,12 +48,24 @@ OPTIONS
         Only consider these symbols. CSV that understands
         file://filename entries.
  
+-U::
+--hide-unresolved::
+        Only display entries resolved to a symbol.
+
  -s::
  --sort=::
         Sort by key(s): pid, comm, dso, symbol, parent.
  
+-p::
+--parent=<regex>::
+        regex filter to identify parent, see: '--sort parent'
+
+-x::
+--exclude-other::
+        Only display entries with parent-match.
+
  -w::
---field-width=::
+--column-widths=<width[,width...]>::
         Force each column width to the provided list, for large terminal
         readability.
  
@@ -52,19 +73,26 @@ OPTIONS
  --field-separator=::
  
         Use a special separator character and don't pad with spaces, replacing
-       all occurances of this separator in symbol names (and other output)
+       all occurrences of this separator in symbol names (and other output)
         with a '.' character, that thus it's the only non valid separator.
  
+-D::
+--dump-raw-trace::
+        Dump raw trace in ASCII.
+
  -g [type,min]::
  --call-graph::
-        Display callchains using type and min percent threshold.
+        Display call chains using type and min percent threshold.
         type can be either:
-       - flat: single column, linear exposure of callchains.
+       - flat: single column, linear exposure of call chains.
         - graph: use a graph tree, displaying absolute overhead rates.
         - fractal: like graph, but displays relative rates. Each branch of
                  the tree is considered as a new profiled object. +
         Default: fractal,0.5.
  
+--pretty=<key>::
+        Pretty printing style.  key: normal, raw
+
  --stdio:: Use the stdio interface.
  
  --tui:: Use the TUI interface, that is integrated with annotate and allows
@@ -72,6 +100,25 @@ OPTIONS
         requires a tty, if one is not present, as when piping to other
         commands, the stdio interface is used.
  
+-k::
+--vmlinux=<file>::
+        vmlinux pathname
+
+--kallsyms=<file>::
+        kallsyms pathname
+
+-m::
+--modules::
+        Load module symbols. WARNING: This should only be used with -k and
+        a LIVE kernel.
+
+-f::
+--force::
+        Don't complain, do it.
+
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
+
  SEE ALSO
  --------
  linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt

index 8417644a6166b9fcd071b88eebb9dd46f00b8a1f..46822d5fde1c0328ca8af0f7687265adeeca7f0e 100644 (file)
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
  SYNOPSIS
  --------
  [verse]
-'perf sched' {record|latency|replay|trace}
+'perf sched' {record|latency|map|replay|trace}
  
  DESCRIPTION
  -----------
-There are four variants of perf sched:
+There are five variants of perf sched:
  
    'perf sched record <command>' to record the scheduling events
    of an arbitrary workload.
@@ -30,8 +30,22 @@ There are four variants of perf sched:
    of the workload as it occurred when it was recorded - and can repeat
    it a number of times, measuring its performance.)
  
+  'perf sched map' to print a textual context-switching outline of
+  workload captured via perf sched record.  Columns stand for
+  individual CPUs, and the two-letter shortcuts stand for tasks that
+  are running on a CPU. A '*' denotes the CPU that had the event, and
+  a dot signals an idle CPU.
+
  OPTIONS
  -------
+-i::
+--input=<file>::
+        Input file name. (default: perf.data)
+
+-v::
+--verbose::
+        Be more verbose. (show symbol address, etc)
+
  -D::
  --dump-raw-trace=::
          Display verbose dump of the sched data.
diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt

new file mode 100644 (file)

index 0000000..5bb41e5
--- /dev/null
+++ b/tools/perf/Documentation/perf-script-perl.txt
@@ -0,0 +1,217 @@
+perf-script-perl(1)
+==================
+
+NAME
+----
+perf-script-perl - Process trace data with a Perl script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Perl]:script[.pl] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Perl interpreter.  It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Perl script, if any.
+
+STARTER SCRIPTS
+---------------
+
+You can avoid reading the rest of this document by running 'perf script
+-g perl' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/perl for typical examples showing how to
+do basic things like aggregate event data, print results, etc.  Also,
+the check-perf-script.pl script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace.  If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_unhandled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakeup event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+        field:unsigned short common_type;
+        field:unsigned char common_flags;
+        field:unsigned char common_preempt_count;
+        field:int common_pid;
+        field:int common_lock_depth;
+
+        field:char comm[TASK_COMM_LEN];
+        field:pid_t pid;
+        field:int prio;
+        field:int success;
+        field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+sub sched::sched_wakeup
+{
+   my ($event_name, $context, $common_cpu, $common_secs,
+       $common_nsecs, $common_pid, $common_comm,
+       $comm, $pid, $prio, $success, $target_cpu) = @_;
+}
+----
+
+The handler function takes the form subsystem::event_name.
+
+The $common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ $event_name               the name of the event as text
+ $context                  an opaque 'cookie' used in calls back into perf
+ $common_cpu               the cpu the event occurred on
+ $common_secs              the secs portion of the event timestamp
+ $common_nsecs             the nsecs portion of the event timestamp
+ $common_pid               the pid of the current task
+ $common_comm              the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script.  The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Perl script should start by setting up a Perl module
+search path and 'use'ing a few support modules (see module
+descriptions below):
+
+----
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib";
+ use lib "./perf-script-Util/lib";
+ use Perf::Trace::Core;
+ use Perf::Trace::Context;
+ use Perf::Trace::Util;
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+ sub trace_begin
+ {
+ }
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+sub trace_end
+{
+}
+----
+
+*trace_unhandled*, if defined, is called for any event that
+ doesn't have a handler explicitly defined for it.  The standard set
+ of common arguments are passed into it:
+
+----
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs,
+        $common_nsecs, $common_pid, $common_comm) = @_;
+}
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Perl modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various Perf::Trace::* Perl modules.  To use the functions and
+variables from the given module, add the corresponding 'use
+Perf::Trace::XXX' line to your perf script script.
+
+Perf::Trace::Core Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields.  These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+  flag_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the flag field $field_name of event $event_name
+  symbol_str($event_name, $field_name, $field_value) - returns the string representation corresponding to $field_value for the symbolic field $field_name of event $event_name
+
+Perf::Trace::Context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+Perf::Trace::Context defines a set of functions that can be used to
+access this data in the context of the current event.  Each of these
+functions expects a $context variable, which is the same as the
+$context variable passed into every event handler as the second
+argument.
+
+ common_pc($context) - returns common_preempt_count for the current event
+ common_flags($context) - returns common_flags for the current event
+ common_lock_depth($context) - returns common_lock_depth for the current event
+
+Perf::Trace::Util Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+  nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
+  nsecs_secs($nsecs) - returns whole secs portion given nsecs
+  nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
+  nsecs_str($nsecs) - returns printable string in the form secs.nsecs
+  avg($total, $n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt

new file mode 100644 (file)

index 0000000..36b3827
--- /dev/null
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -0,0 +1,623 @@
+perf-script-python(1)
+====================
+
+NAME
+----
+perf-script-python - Process trace data with a Python script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Python]:script[.py] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Python interpreter.  It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Python script, if any.
+
+A QUICK EXAMPLE
+---------------
+
+This section shows the process, start to finish, of creating a working
+Python script that aggregates and extracts useful information from a
+raw perf script stream.  You can avoid reading the rest of this
+document if an example is enough for you; the rest of the document
+provides more details on each step and lists the library functions
+available to script writers.
+
+This example actually details the steps that were used to create the
+'syscall-counts' script you see when you list the available perf script
+scripts via 'perf script -l'.  As such, this script also shows how to
+integrate your script into the list of general-purpose 'perf script'
+scripts listed by that command.
+
+The syscall-counts script is a simple script, but demonstrates all the
+basic ideas necessary to create a useful script.  Here's an example
+of its output (syscall names are not yet supported, they will appear
+as numbers):
+
+----
+syscall events:
+
+event                                          count
+----------------------------------------  -----------
+sys_write                                     455067
+sys_getdents                                    4072
+sys_close                                       3037
+sys_swapoff                                     1769
+sys_read                                         923
+sys_sched_setparam                               826
+sys_open                                         331
+sys_newfstat                                     326
+sys_mmap                                         217
+sys_munmap                                       216
+sys_futex                                        141
+sys_select                                       102
+sys_poll                                          84
+sys_setitimer                                     12
+sys_writev                                         8
+15                                                 8
+sys_lseek                                          7
+sys_rt_sigprocmask                                 6
+sys_wait4                                          3
+sys_ioctl                                          3
+sys_set_robust_list                                1
+sys_exit                                           1
+56                                                 1
+sys_access                                         1
+----
+
+Basically our task is to keep a per-syscall tally that gets updated
+every time a system call occurs in the system.  Our script will do
+that, but first we need to record the data that will be processed by
+that script.  Theoretically, there are a couple of ways we could do
+that:
+
+- we could enable every event under the tracing/events/syscalls
+  directory, but this is over 600 syscalls, well beyond the number
+  allowable by perf.  These individual syscall events will however be
+  useful if we want to later use the guidance we get from the
+  general-purpose scripts to drill down and get more detail about
+  individual syscalls of interest.
+
+- we can enable the sys_enter and/or sys_exit syscalls found under
+  tracing/events/raw_syscalls.  These are called for all syscalls; the
+  'id' field can be used to distinguish between individual syscall
+  numbers.
+
+For this script, we only need to know that a syscall was entered; we
+don't care how it exited, so we'll use 'perf record' to record only
+the sys_enter events:
+
+----
+# perf record -a -e raw_syscalls:sys_enter
+
+^C[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
+----
+
+The options basically say to collect data for every syscall event
+system-wide and multiplex the per-cpu output into a single stream.
+That single stream will be recorded in a file in the current directory
+called perf.data.
+
+Once we have a perf.data file containing our data, we can use the -g
+'perf script' option to generate a Python script that will contain a
+callback handler for each event type found in the perf.data trace
+stream (for more details, see the STARTER SCRIPTS section).
+
+----
+# perf script -g python
+generated Python script: perf-script.py
+
+The output file, also created in the current directory, is named
+perf-script.py.  Here's the file in its entirety:
+
+# perf script event handlers, generated by perf script -g python
+# Licensed under the terms of the GNU GPL License version 2
+
+# The common_* event handler fields are the most useful fields common to
+# all events.  They don't necessarily correspond to the 'common_*' fields
+# in the format files.  Those fields not available as handler params can
+# be retrieved using Python functions of the form common_*(context).
+# See the perf-script-python Documentation for the list of available functions.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_begin():
+       print "in trace_begin"
+
+def trace_end():
+       print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+       common_secs, common_nsecs, common_pid, common_comm,
+       id, args):
+               print_header(event_name, common_cpu, common_secs, common_nsecs,
+                       common_pid, common_comm)
+
+               print "id=%d, args=%s\n" % \
+               (id, args),
+
+def trace_unhandled(event_name, context, common_cpu, common_secs, common_nsecs,
+               common_pid, common_comm):
+               print_header(event_name, common_cpu, common_secs, common_nsecs,
+               common_pid, common_comm)
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+       print "%-20s %5u %05u.%09u %8u %-20s " % \
+       (event_name, cpu, secs, nsecs, pid, comm),
+----
+
+At the top is a comment block followed by some import statements and a
+path append which every perf script script should include.
+
+Following that are a couple generated functions, trace_begin() and
+trace_end(), which are called at the beginning and the end of the
+script respectively (for more details, see the SCRIPT LAYOUT section
+below).
+
+Following those are the 'event handler' functions generated one for
+every event in the 'perf record' output.  The handler functions take
+the form subsystem__event_name, and contain named parameters, one for
+each field in the event; in this case, there's only one event,
+raw_syscalls__sys_enter().  (see the EVENT HANDLERS section below for
+more info on event handlers).
+
+The final couple of functions are, like the begin and end functions,
+generated for every script.  The first, trace_unhandled(), is called
+every time the script finds an event in the perf.data file that
+doesn't correspond to any event handler in the script.  This could
+mean either that the record step recorded event types that it wasn't
+really interested in, or the script was run against a trace file that
+doesn't correspond to the script.
+
+The script generated by -g option simply prints a line for each
+event found in the trace stream i.e. it basically just dumps the event
+and its parameter values to stdout.  The print_header() function is
+simply a utility function used for that purpose.  Let's rename the
+script and run it to see the default output:
+
+----
+# mv perf-script.py syscall-counts.py
+# perf script -s syscall-counts.py
+
+raw_syscalls__sys_enter     1 00840.847582083     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847595764     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847620860     7506 perf                  id=1, args=
+raw_syscalls__sys_enter     1 00840.847710478     6533 npviewer.bin          id=78, args=
+raw_syscalls__sys_enter     1 00840.847719204     6533 npviewer.bin          id=142, args=
+raw_syscalls__sys_enter     1 00840.847755445     6533 npviewer.bin          id=3, args=
+raw_syscalls__sys_enter     1 00840.847775601     6533 npviewer.bin          id=3, args=
+raw_syscalls__sys_enter     1 00840.847781820     6533 npviewer.bin          id=3, args=
+.
+.
+.
+----
+
+Of course, for this script, we're not interested in printing every
+trace event, but rather aggregating it in a useful way.  So we'll get
+rid of everything to do with printing as well as the trace_begin() and
+trace_unhandled() functions, which we won't be using.  That leaves us
+with this minimalistic skeleton:
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_end():
+       print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+       common_secs, common_nsecs, common_pid, common_comm,
+       id, args):
+----
+
+In trace_end(), we'll simply print the results, but first we need to
+generate some results to print.  To do that we need to have our
+sys_enter() handler do the necessary tallying until all events have
+been counted.  A hash table indexed by syscall id is a good way to
+store that information; every time the sys_enter() handler is called,
+we simply increment a count associated with that hash entry indexed by
+that syscall id:
+
+----
+  syscalls = autodict()
+
+  try:
+    syscalls[id] += 1
+  except TypeError:
+    syscalls[id] = 1
+----
+
+The syscalls 'autodict' object is a special kind of Python dictionary
+(implemented in Core.py) that implements Perl's 'autovivifying' hashes
+in Python i.e. with autovivifying hashes, you can assign nested hash
+values without having to go to the trouble of creating intermediate
+levels if they don't exist e.g. syscalls[comm][pid][id] = 1 will create
+the intermediate hash levels and finally assign the value 1 to the
+hash entry for 'id' (because the value being assigned isn't a hash
+object itself, the initial value is assigned in the TypeError
+exception.  Well, there may be a better way to do this in Python but
+that's what works for now).
+
+Putting that code into the raw_syscalls__sys_enter() handler, we
+effectively end up with a single-level dictionary keyed on syscall id
+and having the counts we've tallied as values.
+
+The print_syscall_totals() function iterates over the entries in the
+dictionary and displays a line for each entry containing the syscall
+name (the dictionary keys contain the syscall ids, which are passed to
+the Util function syscall_name(), which translates the raw syscall
+numbers to the corresponding syscall name strings).  The output is
+displayed after all the events in the trace have been processed, by
+calling the print_syscall_totals() function from the trace_end()
+handler called at the end of script processing.
+
+The final script producing the output shown above is shown in its
+entirety below (syscall_name() helper is not yet available, you can
+only deal with id's for now):
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+       '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+syscalls = autodict()
+
+def trace_end():
+       print_syscall_totals()
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+       common_secs, common_nsecs, common_pid, common_comm,
+       id, args):
+       try:
+               syscalls[id] += 1
+       except TypeError:
+               syscalls[id] = 1
+
+def print_syscall_totals():
+    if for_comm is not None:
+           print "\nsyscall events for %s:\n\n" % (for_comm),
+    else:
+           print "\nsyscall events:\n\n",
+
+    print "%-40s  %10s\n" % ("event", "count"),
+    print "%-40s  %10s\n" % ("----------------------------------------", \
+                                 "-----------"),
+
+    for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+                                 reverse = True):
+           print "%-40s  %10d\n" % (syscall_name(id), val),
+----
+
+The script can be run just as before:
+
+  # perf script -s syscall-counts.py
+
+So those are the essential steps in writing and running a script.  The
+process can be generalized to any tracepoint or set of tracepoints
+you're interested in - basically find the tracepoint(s) you're
+interested in by looking at the list of available events shown by
+'perf list' and/or look in /sys/kernel/debug/tracing/events for
+detailed event and field info, record the corresponding trace data
+using 'perf record', passing it the list of interesting events,
+generate a skeleton script using 'perf script -g python' and modify the
+code to aggregate and display it for your particular needs.
+
+After you've done that you may end up with a general-purpose script
+that you want to keep around and have available for future use.  By
+writing a couple of very simple shell scripts and putting them in the
+right place, you can have your script listed alongside the other
+scripts listed by the 'perf script -l' command e.g.:
+
+----
+root@tropicana:~# perf script -l
+List of available trace scripts:
+  workqueue-stats                      workqueue stats (ins/exe/create/destroy)
+  wakeup-latency                       system-wide min/max/avg wakeup latency
+  rw-by-file <comm>                    r/w activity for a program, by file
+  rw-by-pid                            system-wide r/w activity
+----
+
+A nice side effect of doing this is that you also then capture the
+probably lengthy 'perf record' command needed to record the events for
+the script.
+
+To have the script appear as a 'built-in' script, you write two simple
+scripts, one for recording and one for 'reporting'.
+
+The 'record' script is a shell script with the same base name as your
+script, but with -record appended.  The shell script should be put
+into the perf/scripts/python/bin directory in the kernel source tree.
+In that script, you write the 'perf record' command-line needed for
+your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
+
+#!/bin/bash
+perf record -a -e raw_syscalls:sys_enter
+----
+
+The 'report' script is also a shell script with the same base name as
+your script, but with -report appended.  It should also be located in
+the perf/scripts/python/bin directory.  In that script, you write the
+'perf script -s' command-line needed for running your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
+
+#!/bin/bash
+# description: system-wide syscall counts
+perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
+----
+
+Note that the location of the Python script given in the shell script
+is in the libexec/perf-core/scripts/python directory - this is where
+the script will be copied by 'make install' when you install perf.
+For the installation to install your script there, your script needs
+to be located in the perf/scripts/python directory in the kernel
+source tree:
+
+----
+# ls -al kernel-source/tools/perf/scripts/python
+
+root@tropicana:/home/trz/src/tip# ls -al tools/perf/scripts/python
+total 32
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
+drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
+-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
+drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util
+-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
+----
+
+Once you've done that (don't forget to do a new 'make install',
+otherwise your script won't show up at run-time), 'perf script -l'
+should show a new entry for your script:
+
+----
+root@tropicana:~# perf script -l
+List of available trace scripts:
+  workqueue-stats                      workqueue stats (ins/exe/create/destroy)
+  wakeup-latency                       system-wide min/max/avg wakeup latency
+  rw-by-file <comm>                    r/w activity for a program, by file
+  rw-by-pid                            system-wide r/w activity
+  syscall-counts                       system-wide syscall counts
+----
+
+You can now perform the record step via 'perf script record':
+
+  # perf script record syscall-counts
+
+and display the output using 'perf script report':
+
+  # perf script report syscall-counts
+
+STARTER SCRIPTS
+---------------
+
+You can quickly get started writing a script for a particular set of
+trace data by generating a skeleton script using 'perf script -g
+python' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/python for typical examples showing how to
+do basic things like aggregate event data, print results, etc.  Also,
+the check-perf-script.py script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace.  If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_unhandled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakeup event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+        field:unsigned short common_type;
+        field:unsigned char common_flags;
+        field:unsigned char common_preempt_count;
+        field:int common_pid;
+        field:int common_lock_depth;
+
+        field:char comm[TASK_COMM_LEN];
+        field:pid_t pid;
+        field:int prio;
+        field:int success;
+        field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
+       common_nsecs, common_pid, common_comm,
+       comm, pid, prio, success, target_cpu):
+       pass
+----
+
+The handler function takes the form subsystem__event_name.
+
+The common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ event_name                the name of the event as text
+ context                   an opaque 'cookie' used in calls back into perf
+ common_cpu                the cpu the event occurred on
+ common_secs               the secs portion of the event timestamp
+ common_nsecs              the nsecs portion of the event timestamp
+ common_pid                the pid of the current task
+ common_comm               the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script.  The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Python script should start by setting up a Python
+module search path and 'import'ing a few support modules (see module
+descriptions below):
+
+----
+ import os
+ import sys
+
+ sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+             '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+ from perf_trace_context import *
+ from Core import *
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+def trace_begin():
+    pass
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+def trace_end():
+    pass
+----
+
+*trace_unhandled*, if defined, is called for any event that
+ doesn't have a handler explicitly defined for it.  The standard set
+ of common arguments are passed into it:
+
+----
+def trace_unhandled(event_name, context, common_cpu, common_secs,
+        common_nsecs, common_pid, common_comm):
+    pass
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Python modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various perf script Python modules.  To use the functions and
+variables from the given module, add the corresponding 'from XXXX
+import' line to your perf script script.
+
+Core.py Module
+~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields.  These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+  flag_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the flag field field_name of event event_name
+  symbol_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the symbolic field field_name of event event_name
+
+The *autodict* function returns a special kind of Python
+dictionary that implements Perl's 'autovivifying' hashes in Python
+i.e. with autovivifying hashes, you can assign nested hash values
+without having to go to the trouble of creating intermediate levels if
+they don't exist.
+
+  autodict() - returns an autovivifying dictionary instance
+
+
+perf_trace_context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+perf_trace_context defines a set of functions that can be used to
+access this data in the context of the current event.  Each of these
+functions expects a context variable, which is the same as the
+context variable passed into every event handler as the second
+argument.
+
+ common_pc(context) - returns common_preempt_count for the current event
+ common_flags(context) - returns common_flags for the current event
+ common_lock_depth(context) - returns common_lock_depth for the current event
+
+Util.py Module
+~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+  nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
+  nsecs_secs(nsecs) - returns whole secs portion given nsecs
+  nsecs_nsecs(nsecs) - returns nsecs remainder given nsecs
+  nsecs_str(nsecs) - returns printable string in the form secs.nsecs
+  avg(total, n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt

new file mode 100644 (file)

index 0000000..29ad942
--- /dev/null
+++ b/tools/perf/Documentation/perf-script.txt
@@ -0,0 +1,118 @@
+perf-script(1)
+=============
+
+NAME
+----
+perf-script - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [<options>]
+'perf script' [<options>] record <script> [<record-options>] <command>
+'perf script' [<options>] report <script> [script-args]
+'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
+'perf script' [<options>] <top-script> [script-args]
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+There are several variants of perf script:
+
+  'perf script' to see a detailed trace of the workload that was
+  recorded.
+
+  You can also run a set of pre-canned scripts that aggregate and
+  summarize the raw trace data in various ways (the list of scripts is
+  available via 'perf script -l').  The following variants allow you to
+  record and run those scripts:
+
+  'perf script record <script> <command>' to record the events required
+  for 'perf script report'.  <script> is the name displayed in the
+  output of 'perf script --list' i.e. the actual script name minus any
+  language extension.  If <command> is not specified, the events are
+  recorded using the -a (system-wide) 'perf record' option.
+
+  'perf script report <script> [args]' to run and display the results
+  of <script>.  <script> is the name displayed in the output of 'perf
+  script --list' i.e. the actual script name minus any language
+  extension.  The perf.data output from a previous run of 'perf script
+  record <script>' is used and should be present for this command to
+  succeed.  [args] refers to the (mainly optional) args expected by
+  the script.
+
+  'perf script <script> <required-script-args> <command>' to both
+  record the events required for <script> and to run the <script>
+  using 'live-mode' i.e. without writing anything to disk.  <script>
+  is the name displayed in the output of 'perf script --list' i.e. the
+  actual script name minus any language extension.  If <command> is
+  not specified, the events are recorded using the -a (system-wide)
+  'perf record' option.  If <script> has any required args, they
+  should be specified before <command>.  This mode doesn't allow for
+  optional script args to be specified; if optional script args are
+  desired, they can be specified using separate 'perf script record'
+  and 'perf script report' commands, with the stdout of the record step
+  piped to the stdin of the report script, using the '-o -' and '-i -'
+  options of the corresponding commands.
+
+  'perf script <top-script>' to both record the events required for
+  <top-script> and to run the <top-script> using 'live-mode'
+  i.e. without writing anything to disk.  <top-script> is the name
+  displayed in the output of 'perf script --list' i.e. the actual
+  script name minus any language extension; a <top-script> is defined
+  as any script name ending with the string 'top'.
+
+  [<record-options>] can be passed to the record steps of 'perf script
+  record' and 'live-mode' variants; this isn't possible however for
+  <top-script> 'live-mode' or 'perf script report' variants.
+
+  See the 'SEE ALSO' section for links to language-specific
+  information on how to write and run your own trace scripts.
+
+OPTIONS
+-------
+<command>...::
+       Any command you can specify in a shell.
+
+-D::
+--dump-raw-script=::
+        Display verbose dump of the trace data.
+
+-L::
+--Latency=::
+        Show latency attributes (irqs/preemption disabled, etc).
+
+-l::
+--list=::
+        Display a list of available trace scripts.
+
+-s ['lang']::
+--script=::
+        Process trace data with the given script ([lang]:script[.ext]).
+       If the string 'lang' is specified in place of a script name, a
+        list of supported languages will be displayed instead.
+
+-g::
+--gen-script=::
+        Generate perf-script.[ext] starter script for given language,
+        using current perf.data.
+
+-a::
+        Force system-wide collection.  Scripts run without a <command>
+        normally use -a by default, while scripts run with a <command>
+        normally don't - this option allows the latter to be run in
+        system-wide mode.
+
+-i::
+--input=::
+        Input file name.
+
+-d::
+--debug-mode::
+        Do various checks like samples ordering and lost events.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-script-perl[1],
+linkperf:perf-script-python[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt

index 4b3a2d46b4378607f5195d12328646f5b1d7a638..b6da7affbbeeb82533387e9ba3f7c788d3e28dec 100644 (file)
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -8,8 +8,8 @@ perf-stat - Run a command and gather performance counter statistics
  SYNOPSIS
  --------
  [verse]
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command>
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
  
  DESCRIPTION
  -----------
@@ -35,24 +35,54 @@ OPTIONS
          child tasks do not inherit counters
  -p::
  --pid=<pid>::
-        stat events on existing pid
+        stat events on existing process id
+
+-t::
+--tid=<tid>::
+        stat events on existing thread id
+
  
  -a::
-        system-wide collection
+--all-cpus::
+        system-wide collection from all CPUs
  
  -c::
-        scale counter values
+--scale::
+       scale/normalize counter values
+
+-r::
+--repeat=<n>::
+       repeat command and print average + stddev (max: 100)
  
  -B::
+--big-num::
          print large numbers with thousands' separators according to locale
  
  -C::
  --cpu=::
-Count only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Count only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
  In per-thread mode, this option is ignored. The -a option is still necessary
  to activate system-wide monitoring. Default is to count on all CPUs.
  
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
+This option is only valid in system-wide mode.
+
+-n::
+--null::
+        null run - don't start any counters
+
+-v::
+--verbose::
+        be more verbose (show counter open errors, etc)
+
+-x SEP::
+--field-separator SEP::
+print counts using a CSV-style output to make it easy to import directly into
+spreadsheets. Columns are separated by the string specified in SEP.
+
  EXAMPLES
  --------
  
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt

index 1c4b5f5b7f71ec7047be7f02a369e8e27fc8710d..2c3b462f64b00531b4c8a7e5fc5cb6224dfc88d6 100644 (file)
--- a/tools/perf/Documentation/perf-test.txt
+++ b/tools/perf/Documentation/perf-test.txt
@@ -12,7 +12,7 @@ SYNOPSIS
  
  DESCRIPTION
  -----------
-This command does assorted sanity tests, initially thru linked routines but
+This command does assorted sanity tests, initially through linked routines but
  also will look for a directory with more tests in the form of scripts.
  
  OPTIONS
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt

index 4b1788355ecac3d305bf72e6f58d5a477e08ba7b..d7b79e2ba2adbe2cc0cb6468a9d84d6b73b8ed0f 100644 (file)
--- a/tools/perf/Documentation/perf-timechart.txt
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -38,6 +38,8 @@ OPTIONS
  --process::
          Select the processes to display, by name or PID
  
+--symfs=<directory>::
+        Look for files with symbols relative to this directory.
  
  SEE ALSO
  --------
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt

index 1f9687663f2a9cd62d6cff9f597395523b3da931..f6eb1cdafb7758162463b0ca8a25525f94c44d19 100644 (file)
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -12,7 +12,7 @@ SYNOPSIS
  
  DESCRIPTION
  -----------
-This command generates and displays a performance counter profile in realtime.
+This command generates and displays a performance counter profile in real time.
  
  
  OPTIONS
@@ -27,8 +27,8 @@ OPTIONS
  
  -C <cpu-list>::
  --cpu=<cpu>::
-Monitor only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
  Default is to monitor all CPUS.
  
  -d <seconds>::
@@ -50,6 +50,10 @@ Default is to monitor all CPUS.
  --count-filter=<count>::
         Only display functions with more events than this.
  
+-g::
+--group::
+        Put the counters into a counter group.
+
  -F <freq>::
  --freq=<freq>::
         Profile at this frequency.
@@ -68,7 +72,11 @@ Default is to monitor all CPUS.
  
  -p <pid>::
  --pid=<pid>::
-       Profile events on existing pid.
+       Profile events on existing process ID.
+
+-t <tid>::
+--tid=<tid>::
+        Profile events on existing thread ID.
  
  -r <priority>::
  --realtime=<priority>::
@@ -78,6 +86,18 @@ Default is to monitor all CPUS.
  --sym-annotate=<symbol>::
          Annotate this symbol.
  
+-K::
+--hide_kernel_symbols::
+        Hide kernel symbols.
+
+-U::
+--hide_user_symbols::
+        Hide user symbols.
+
+-D::
+--dump-symtab::
+        Dump the symbol table used for profiling.
+
  -v::
  --verbose::
         Be more verbose (show counter open errors, etc).
diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-trace-perl.txt

deleted file mode 100644 (file)

index ee6525e..0000000
--- a/tools/perf/Documentation/perf-trace-perl.txt
+++ /dev/null
@@ -1,217 +0,0 @@
-perf-trace-perl(1)
-==================
-
-NAME
-----
-perf-trace-perl - Process trace data with a Perl script
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [-s [Perl]:script[.pl] ]
-
-DESCRIPTION
------------
-
-This perf trace option is used to process perf trace data using perf's
-built-in Perl interpreter.  It reads and processes the input file and
-displays the results of the trace analysis implemented in the given
-Perl script, if any.
-
-STARTER SCRIPTS
----------------
-
-You can avoid reading the rest of this document by running 'perf trace
--g perl' in the same directory as an existing perf.data trace file.
-That will generate a starter script containing a handler for each of
-the event types in the trace file; it simply prints every available
-field for each event in the trace file.
-
-You can also look at the existing scripts in
-~/libexec/perf-core/scripts/perl for typical examples showing how to
-do basic things like aggregate event data, print results, etc.  Also,
-the check-perf-trace.pl script, while not interesting for its results,
-attempts to exercise all of the main scripting features.
-
-EVENT HANDLERS
---------------
-
-When perf trace is invoked using a trace script, a user-defined
-'handler function' is called for each event in the trace.  If there's
-no handler function defined for a given event type, the event is
-ignored (or passed to a 'trace_handled' function, see below) and the
-next event is processed.
-
-Most of the event's field values are passed as arguments to the
-handler function; some of the less common ones aren't - those are
-available as calls back into the perf executable (see below).
-
-As an example, the following perf record command can be used to record
-all sched_wakeup events in the system:
-
- # perf record -a -e sched:sched_wakeup
-
-Traces meant to be processed using a script should be recorded with
-the above option: -a to enable system-wide collection.
-
-The format file for the sched_wakep event defines the following fields
-(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
-
-----
- format:
-        field:unsigned short common_type;
-        field:unsigned char common_flags;
-        field:unsigned char common_preempt_count;
-        field:int common_pid;
-        field:int common_lock_depth;
-
-        field:char comm[TASK_COMM_LEN];
-        field:pid_t pid;
-        field:int prio;
-        field:int success;
-        field:int target_cpu;
-----
-
-The handler function for this event would be defined as:
-
-----
-sub sched::sched_wakeup
-{
-   my ($event_name, $context, $common_cpu, $common_secs,
-       $common_nsecs, $common_pid, $common_comm,
-       $comm, $pid, $prio, $success, $target_cpu) = @_;
-}
-----
-
-The handler function takes the form subsystem::event_name.
-
-The $common_* arguments in the handler's argument list are the set of
-arguments passed to all event handlers; some of the fields correspond
-to the common_* fields in the format file, but some are synthesized,
-and some of the common_* fields aren't common enough to to be passed
-to every event as arguments but are available as library functions.
-
-Here's a brief description of each of the invariant event args:
-
- $event_name               the name of the event as text
- $context                  an opaque 'cookie' used in calls back into perf
- $common_cpu               the cpu the event occurred on
- $common_secs              the secs portion of the event timestamp
- $common_nsecs             the nsecs portion of the event timestamp
- $common_pid               the pid of the current task
- $common_comm              the name of the current process
-
-All of the remaining fields in the event's format file have
-counterparts as handler function arguments of the same name, as can be
-seen in the example above.
-
-The above provides the basics needed to directly access every field of
-every event in a trace, which covers 90% of what you need to know to
-write a useful trace script.  The sections below cover the rest.
-
-SCRIPT LAYOUT
--------------
-
-Every perf trace Perl script should start by setting up a Perl module
-search path and 'use'ing a few support modules (see module
-descriptions below):
-
-----
- use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
- use lib "./Perf-Trace-Util/lib";
- use Perf::Trace::Core;
- use Perf::Trace::Context;
- use Perf::Trace::Util;
-----
-
-The rest of the script can contain handler functions and support
-functions in any order.
-
-Aside from the event handler functions discussed above, every script
-can implement a set of optional functions:
-
-*trace_begin*, if defined, is called before any event is processed and
-gives scripts a chance to do setup tasks:
-
-----
- sub trace_begin
- {
- }
-----
-
-*trace_end*, if defined, is called after all events have been
- processed and gives scripts a chance to do end-of-script tasks, such
- as display results:
-
-----
-sub trace_end
-{
-}
-----
-
-*trace_unhandled*, if defined, is called after for any event that
- doesn't have a handler explicitly defined for it.  The standard set
- of common arguments are passed into it:
-
-----
-sub trace_unhandled
-{
-    my ($event_name, $context, $common_cpu, $common_secs,
-        $common_nsecs, $common_pid, $common_comm) = @_;
-}
-----
-
-The remaining sections provide descriptions of each of the available
-built-in perf trace Perl modules and their associated functions.
-
-AVAILABLE MODULES AND FUNCTIONS
--------------------------------
-
-The following sections describe the functions and variables available
-via the various Perf::Trace::* Perl modules.  To use the functions and
-variables from the given module, add the corresponding 'use
-Perf::Trace::XXX' line to your perf trace script.
-
-Perf::Trace::Core Module
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-These functions provide some essential functions to user scripts.
-
-The *flag_str* and *symbol_str* functions provide human-readable
-strings for flag and symbolic fields.  These correspond to the strings
-and values parsed from the 'print fmt' fields of the event format
-files:
-
-  flag_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the flag field $field_name of event $event_name
-  symbol_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the symbolic field $field_name of event $event_name
-
-Perf::Trace::Context Module
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some of the 'common' fields in the event format file aren't all that
-common, but need to be made accessible to user scripts nonetheless.
-
-Perf::Trace::Context defines a set of functions that can be used to
-access this data in the context of the current event.  Each of these
-functions expects a $context variable, which is the same as the
-$context variable passed into every event handler as the second
-argument.
-
- common_pc($context) - returns common_preempt count for the current event
- common_flags($context) - returns common_flags for the current event
- common_lock_depth($context) - returns common_lock_depth for the current event
-
-Perf::Trace::Util Module
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-Various utility functions for use with perf trace:
-
-  nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
-  nsecs_secs($nsecs) - returns whole secs portion given nsecs
-  nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
-  nsecs_str($nsecs) - returns printable string in the form secs.nsecs
-  avg($total, $n) - returns average given a sum and a total number of values
-
-SEE ALSO
---------
-linkperf:perf-trace[1]
diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-trace-python.txt

deleted file mode 100644 (file)

index 693be80..0000000
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ /dev/null
@@ -1,623 +0,0 @@
-perf-trace-python(1)
-====================
-
-NAME
-----
-perf-trace-python - Process trace data with a Python script
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [-s [Python]:script[.py] ]
-
-DESCRIPTION
------------
-
-This perf trace option is used to process perf trace data using perf's
-built-in Python interpreter.  It reads and processes the input file and
-displays the results of the trace analysis implemented in the given
-Python script, if any.
-
-A QUICK EXAMPLE
----------------
-
-This section shows the process, start to finish, of creating a working
-Python script that aggregates and extracts useful information from a
-raw perf trace stream.  You can avoid reading the rest of this
-document if an example is enough for you; the rest of the document
-provides more details on each step and lists the library functions
-available to script writers.
-
-This example actually details the steps that were used to create the
-'syscall-counts' script you see when you list the available perf trace
-scripts via 'perf trace -l'.  As such, this script also shows how to
-integrate your script into the list of general-purpose 'perf trace'
-scripts listed by that command.
-
-The syscall-counts script is a simple script, but demonstrates all the
-basic ideas necessary to create a useful script.  Here's an example
-of its output (syscall names are not yet supported, they will appear
-as numbers):
-
-----
-syscall events:
-
-event                                          count
-----------------------------------------  -----------
-sys_write                                     455067
-sys_getdents                                    4072
-sys_close                                       3037
-sys_swapoff                                     1769
-sys_read                                         923
-sys_sched_setparam                               826
-sys_open                                         331
-sys_newfstat                                     326
-sys_mmap                                         217
-sys_munmap                                       216
-sys_futex                                        141
-sys_select                                       102
-sys_poll                                          84
-sys_setitimer                                     12
-sys_writev                                         8
-15                                                 8
-sys_lseek                                          7
-sys_rt_sigprocmask                                 6
-sys_wait4                                          3
-sys_ioctl                                          3
-sys_set_robust_list                                1
-sys_exit                                           1
-56                                                 1
-sys_access                                         1
-----
-
-Basically our task is to keep a per-syscall tally that gets updated
-every time a system call occurs in the system.  Our script will do
-that, but first we need to record the data that will be processed by
-that script.  Theoretically, there are a couple of ways we could do
-that:
-
-- we could enable every event under the tracing/events/syscalls
-  directory, but this is over 600 syscalls, well beyond the number
-  allowable by perf.  These individual syscall events will however be
-  useful if we want to later use the guidance we get from the
-  general-purpose scripts to drill down and get more detail about
-  individual syscalls of interest.
-
-- we can enable the sys_enter and/or sys_exit syscalls found under
-  tracing/events/raw_syscalls.  These are called for all syscalls; the
-  'id' field can be used to distinguish between individual syscall
-  numbers.
-
-For this script, we only need to know that a syscall was entered; we
-don't care how it exited, so we'll use 'perf record' to record only
-the sys_enter events:
-
-----
-# perf record -a -e raw_syscalls:sys_enter
-
-^C[ perf record: Woken up 1 times to write data ]
-[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
-----
-
-The options basically say to collect data for every syscall event
-system-wide and multiplex the per-cpu output into a single stream.
-That single stream will be recorded in a file in the current directory
-called perf.data.
-
-Once we have a perf.data file containing our data, we can use the -g
-'perf trace' option to generate a Python script that will contain a
-callback handler for each event type found in the perf.data trace
-stream (for more details, see the STARTER SCRIPTS section).
-
-----
-# perf trace -g python
-generated Python script: perf-trace.py
-
-The output file created also in the current directory is named
-perf-trace.py.  Here's the file in its entirety:
-
-# perf trace event handlers, generated by perf trace -g python
-# Licensed under the terms of the GNU GPL License version 2
-
-# The common_* event handler fields are the most useful fields common to
-# all events.  They don't necessarily correspond to the 'common_*' fields
-# in the format files.  Those fields not available as handler params can
-# be retrieved using Python functions of the form common_*(context).
-# See the perf-trace-python Documentation for the list of available functions.
-
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-
-def trace_begin():
-       print "in trace_begin"
-
-def trace_end():
-       print "in trace_end"
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
-       common_secs, common_nsecs, common_pid, common_comm,
-       id, args):
-               print_header(event_name, common_cpu, common_secs, common_nsecs,
-                       common_pid, common_comm)
-
-               print "id=%d, args=%s\n" % \
-               (id, args),
-
-def trace_unhandled(event_name, context, common_cpu, common_secs, common_nsecs,
-               common_pid, common_comm):
-               print_header(event_name, common_cpu, common_secs, common_nsecs,
-               common_pid, common_comm)
-
-def print_header(event_name, cpu, secs, nsecs, pid, comm):
-       print "%-20s %5u %05u.%09u %8u %-20s " % \
-       (event_name, cpu, secs, nsecs, pid, comm),
-----
-
-At the top is a comment block followed by some import statements and a
-path append which every perf trace script should include.
-
-Following that are a couple generated functions, trace_begin() and
-trace_end(), which are called at the beginning and the end of the
-script respectively (for more details, see the SCRIPT LAYOUT section
-below).
-
-Following those are the 'event handler' functions generated one for
-every event in the 'perf record' output.  The handler functions take
-the form subsystem__event_name, and contain named parameters, one for
-each field in the event; in this case, there's only one event,
-raw_syscalls__sys_enter().  (see the EVENT HANDLERS section below for
-more info on event handlers).
-
-The final couple of functions are, like the begin and end functions,
-generated for every script.  The first, trace_unhandled(), is called
-every time the script finds an event in the perf.data file that
-doesn't correspond to any event handler in the script.  This could
-mean either that the record step recorded event types that it wasn't
-really interested in, or the script was run against a trace file that
-doesn't correspond to the script.
-
-The script generated by -g option simply prints a line for each
-event found in the trace stream i.e. it basically just dumps the event
-and its parameter values to stdout.  The print_header() function is
-simply a utility function used for that purpose.  Let's rename the
-script and run it to see the default output:
-
-----
-# mv perf-trace.py syscall-counts.py
-# perf trace -s syscall-counts.py
-
-raw_syscalls__sys_enter     1 00840.847582083     7506 perf                  id=1, args=
-raw_syscalls__sys_enter     1 00840.847595764     7506 perf                  id=1, args=
-raw_syscalls__sys_enter     1 00840.847620860     7506 perf                  id=1, args=
-raw_syscalls__sys_enter     1 00840.847710478     6533 npviewer.bin          id=78, args=
-raw_syscalls__sys_enter     1 00840.847719204     6533 npviewer.bin          id=142, args=
-raw_syscalls__sys_enter     1 00840.847755445     6533 npviewer.bin          id=3, args=
-raw_syscalls__sys_enter     1 00840.847775601     6533 npviewer.bin          id=3, args=
-raw_syscalls__sys_enter     1 00840.847781820     6533 npviewer.bin          id=3, args=
-.
-.
-.
-----
-
-Of course, for this script, we're not interested in printing every
-trace event, but rather aggregating it in a useful way.  So we'll get
-rid of everything to do with printing as well as the trace_begin() and
-trace_unhandled() functions, which we won't be using.  That leaves us
-with this minimalistic skeleton:
-
-----
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-
-def trace_end():
-       print "in trace_end"
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
-       common_secs, common_nsecs, common_pid, common_comm,
-       id, args):
-----
-
-In trace_end(), we'll simply print the results, but first we need to
-generate some results to print.  To do that we need to have our
-sys_enter() handler do the necessary tallying until all events have
-been counted.  A hash table indexed by syscall id is a good way to
-store that information; every time the sys_enter() handler is called,
-we simply increment a count associated with that hash entry indexed by
-that syscall id:
-
-----
-  syscalls = autodict()
-
-  try:
-    syscalls[id] += 1
-  except TypeError:
-    syscalls[id] = 1
-----
-
-The syscalls 'autodict' object is a special kind of Python dictionary
-(implemented in Core.py) that implements Perl's 'autovivifying' hashes
-in Python i.e. with autovivifying hashes, you can assign nested hash
-values without having to go to the trouble of creating intermediate
-levels if they don't exist e.g. syscalls[comm][pid][id] = 1 will create
-the intermediate hash levels and finally assign the value 1 to the
-hash entry for 'id' (because the value being assigned isn't a hash
-object itself, the initial value is assigned in the TypeError
-exception.  Well, there may be a better way to do this in Python but
-that's what works for now).
-
-Putting that code into the raw_syscalls__sys_enter() handler, we
-effectively end up with a single-level dictionary keyed on syscall id
-and having the counts we've tallied as values.
-
-The print_syscall_totals() function iterates over the entries in the
-dictionary and displays a line for each entry containing the syscall
-name (the dictionary keys contain the syscall ids, which are passed to
-the Util function syscall_name(), which translates the raw syscall
-numbers to the corresponding syscall name strings).  The output is
-displayed after all the events in the trace have been processed, by
-calling the print_syscall_totals() function from the trace_end()
-handler called at the end of script processing.
-
-The final script producing the output shown above is shown in its
-entirety below (syscall_name() helper is not yet available, you can
-only deal with id's for now):
-
-----
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-       '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-from Util import *
-
-syscalls = autodict()
-
-def trace_end():
-       print_syscall_totals()
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
-       common_secs, common_nsecs, common_pid, common_comm,
-       id, args):
-       try:
-               syscalls[id] += 1
-       except TypeError:
-               syscalls[id] = 1
-
-def print_syscall_totals():
-    if for_comm is not None:
-           print "\nsyscall events for %s:\n\n" % (for_comm),
-    else:
-           print "\nsyscall events:\n\n",
-
-    print "%-40s  %10s\n" % ("event", "count"),
-    print "%-40s  %10s\n" % ("----------------------------------------", \
-                                 "-----------"),
-
-    for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
-                                 reverse = True):
-           print "%-40s  %10d\n" % (syscall_name(id), val),
-----
-
-The script can be run just as before:
-
-  # perf trace -s syscall-counts.py
-
-So those are the essential steps in writing and running a script.  The
-process can be generalized to any tracepoint or set of tracepoints
-you're interested in - basically find the tracepoint(s) you're
-interested in by looking at the list of available events shown by
-'perf list' and/or look in /sys/kernel/debug/tracing/events for
-detailed event and field info, record the corresponding trace data
-using 'perf record', passing it the list of interesting events,
-generate a skeleton script using 'perf trace -g python' and modify the
-code to aggregate and display it for your particular needs.
-
-After you've done that you may end up with a general-purpose script
-that you want to keep around and have available for future use.  By
-writing a couple of very simple shell scripts and putting them in the
-right place, you can have your script listed alongside the other
-scripts listed by the 'perf trace -l' command e.g.:
-
-----
-root@tropicana:~# perf trace -l
-List of available trace scripts:
-  workqueue-stats                      workqueue stats (ins/exe/create/destroy)
-  wakeup-latency                       system-wide min/max/avg wakeup latency
-  rw-by-file <comm>                    r/w activity for a program, by file
-  rw-by-pid                            system-wide r/w activity
-----
-
-A nice side effect of doing this is that you also then capture the
-probably lengthy 'perf record' command needed to record the events for
-the script.
-
-To have the script appear as a 'built-in' script, you write two simple
-scripts, one for recording and one for 'reporting'.
-
-The 'record' script is a shell script with the same base name as your
-script, but with -record appended.  The shell script should be put
-into the perf/scripts/python/bin directory in the kernel source tree.
-In that script, you write the 'perf record' command-line needed for
-your script:
-
-----
-# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
-
-#!/bin/bash
-perf record -a -e raw_syscalls:sys_enter
-----
-
-The 'report' script is also a shell script with the same base name as
-your script, but with -report appended.  It should also be located in
-the perf/scripts/python/bin directory.  In that script, you write the
-'perf trace -s' command-line needed for running your script:
-
-----
-# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
-
-#!/bin/bash
-# description: system-wide syscall counts
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py
-----
-
-Note that the location of the Python script given in the shell script
-is in the libexec/perf-core/scripts/python directory - this is where
-the script will be copied by 'make install' when you install perf.
-For the installation to install your script there, your script needs
-to be located in the perf/scripts/python directory in the kernel
-source tree:
-
-----
-# ls -al kernel-source/tools/perf/scripts/python
-
-root@tropicana:/home/trz/src/tip# ls -al tools/perf/scripts/python
-total 32
-drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
-drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
-drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
--rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py
-drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
--rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
-----
-
-Once you've done that (don't forget to do a new 'make install',
-otherwise your script won't show up at run-time), 'perf trace -l'
-should show a new entry for your script:
-
-----
-root@tropicana:~# perf trace -l
-List of available trace scripts:
-  workqueue-stats                      workqueue stats (ins/exe/create/destroy)
-  wakeup-latency                       system-wide min/max/avg wakeup latency
-  rw-by-file <comm>                    r/w activity for a program, by file
-  rw-by-pid                            system-wide r/w activity
-  syscall-counts                       system-wide syscall counts
-----
-
-You can now perform the record step via 'perf trace record':
-
-  # perf trace record syscall-counts
-
-and display the output using 'perf trace report':
-
-  # perf trace report syscall-counts
-
-STARTER SCRIPTS
----------------
-
-You can quickly get started writing a script for a particular set of
-trace data by generating a skeleton script using 'perf trace -g
-python' in the same directory as an existing perf.data trace file.
-That will generate a starter script containing a handler for each of
-the event types in the trace file; it simply prints every available
-field for each event in the trace file.
-
-You can also look at the existing scripts in
-~/libexec/perf-core/scripts/python for typical examples showing how to
-do basic things like aggregate event data, print results, etc.  Also,
-the check-perf-trace.py script, while not interesting for its results,
-attempts to exercise all of the main scripting features.
-
-EVENT HANDLERS
---------------
-
-When perf trace is invoked using a trace script, a user-defined
-'handler function' is called for each event in the trace.  If there's
-no handler function defined for a given event type, the event is
-ignored (or passed to a 'trace_unhandled' function, see below) and the
-next event is processed.
-
-Most of the event's field values are passed as arguments to the
-handler function; some of the less common ones aren't - those are
-available as calls back into the perf executable (see below).
-
-As an example, the following perf record command can be used to record
-all sched_wakeup events in the system:
-
- # perf record -a -e sched:sched_wakeup
-
-Traces meant to be processed using a script should be recorded with
-the above option: -a to enable system-wide collection.
-
-The format file for the sched_wakeup event defines the following fields
-(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
-
-----
- format:
-        field:unsigned short common_type;
-        field:unsigned char common_flags;
-        field:unsigned char common_preempt_count;
-        field:int common_pid;
-        field:int common_lock_depth;
-
-        field:char comm[TASK_COMM_LEN];
-        field:pid_t pid;
-        field:int prio;
-        field:int success;
-        field:int target_cpu;
-----
-
-The handler function for this event would be defined as:
-
-----
-def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
-       common_nsecs, common_pid, common_comm,
-       comm, pid, prio, success, target_cpu):
-       pass
-----
-
-The handler function takes the form subsystem__event_name.
-
-The common_* arguments in the handler's argument list are the set of
-arguments passed to all event handlers; some of the fields correspond
-to the common_* fields in the format file, but some are synthesized,
-and some of the common_* fields aren't common enough to be passed
-to every event as arguments but are available as library functions.
-
-Here's a brief description of each of the invariant event args:
-
- event_name                the name of the event as text
- context                   an opaque 'cookie' used in calls back into perf
- common_cpu                the cpu the event occurred on
- common_secs               the secs portion of the event timestamp
- common_nsecs              the nsecs portion of the event timestamp
- common_pid                the pid of the current task
- common_comm               the name of the current process
-
-All of the remaining fields in the event's format file have
-counterparts as handler function arguments of the same name, as can be
-seen in the example above.
-
-The above provides the basics needed to directly access every field of
-every event in a trace, which covers 90% of what you need to know to
-write a useful trace script.  The sections below cover the rest.
-
-SCRIPT LAYOUT
--------------
-
-Every perf trace Python script should start by setting up a Python
-module search path and 'import'ing a few support modules (see module
-descriptions below):
-
-----
- import os
- import sys
-
- sys.path.append(os.environ['PERF_EXEC_PATH'] + \
-             '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
- from perf_trace_context import *
- from Core import *
-----
-
-The rest of the script can contain handler functions and support
-functions in any order.
-
-Aside from the event handler functions discussed above, every script
-can implement a set of optional functions:
-
-*trace_begin*, if defined, is called before any event is processed and
-gives scripts a chance to do setup tasks:
-
-----
-def trace_begin():
-    pass
-----
-
-*trace_end*, if defined, is called after all events have been
- processed and gives scripts a chance to do end-of-script tasks, such
- as display results:
-
-----
-def trace_end():
-    pass
-----
-
-*trace_unhandled*, if defined, is called for any event that
- doesn't have a handler explicitly defined for it.  The standard set
- of common arguments are passed into it:
-
-----
-def trace_unhandled(event_name, context, common_cpu, common_secs,
-        common_nsecs, common_pid, common_comm):
-    pass
-----
-
-The remaining sections provide descriptions of each of the available
-built-in perf trace Python modules and their associated functions.
-
-AVAILABLE MODULES AND FUNCTIONS
--------------------------------
-
-The following sections describe the functions and variables available
-via the various perf trace Python modules.  To use the functions and
-variables from the given module, add the corresponding 'from XXXX
-import' line to your perf trace script.
-
-Core.py Module
-~~~~~~~~~~~~~~
-
-These functions provide some essential functions to user scripts.
-
-The *flag_str* and *symbol_str* functions provide human-readable
-strings for flag and symbolic fields.  These correspond to the strings
-and values parsed from the 'print fmt' fields of the event format
-files:
-
-  flag_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the flag field field_name of event event_name
-  symbol_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the symbolic field field_name of event event_name
-
-The *autodict* function returns a special kind of Python
-dictionary that implements Perl's 'autovivifying' hashes in Python
-i.e. with autovivifying hashes, you can assign nested hash values
-without having to go to the trouble of creating intermediate levels if
-they don't exist.
-
-  autodict() - returns an autovivifying dictionary instance
-
-
-perf_trace_context Module
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some of the 'common' fields in the event format file aren't all that
-common, but need to be made accessible to user scripts nonetheless.
-
-perf_trace_context defines a set of functions that can be used to
-access this data in the context of the current event.  Each of these
-functions expects a context variable, which is the same as the
-context variable passed into every event handler as the second
-argument.
-
- common_pc(context) - returns common_preempt_count for the current event
- common_flags(context) - returns common_flags for the current event
- common_lock_depth(context) - returns common_lock_depth for the current event
-
-Util.py Module
-~~~~~~~~~~~~~~
-
-Various utility functions for use with perf trace:
-
-  nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
-  nsecs_secs(nsecs) - returns whole secs portion given nsecs
-  nsecs_nsecs(nsecs) - returns nsecs remainder given nsecs
-  nsecs_str(nsecs) - returns printable string in the form secs.nsecs
-  avg(total, n) - returns average given a sum and a total number of values
-
-SEE ALSO
---------
-linkperf:perf-trace[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt

deleted file mode 100644 (file)

index 26aff6b..0000000
--- a/tools/perf/Documentation/perf-trace.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-perf-trace(1)
-=============
-
-NAME
-----
-perf-trace - Read perf.data (created by perf record) and display trace output
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [<options>]
-'perf trace' [<options>] record <script> [<record-options>] <command>
-'perf trace' [<options>] report <script> [script-args]
-'perf trace' [<options>] <script> <required-script-args> [<record-options>] <command>
-'perf trace' [<options>] <top-script> [script-args]
-
-DESCRIPTION
------------
-This command reads the input file and displays the trace recorded.
-
-There are several variants of perf trace:
-
-  'perf trace' to see a detailed trace of the workload that was
-  recorded.
-
-  You can also run a set of pre-canned scripts that aggregate and
-  summarize the raw trace data in various ways (the list of scripts is
-  available via 'perf trace -l').  The following variants allow you to
-  record and run those scripts:
-
-  'perf trace record <script> <command>' to record the events required
-  for 'perf trace report'.  <script> is the name displayed in the
-  output of 'perf trace --list' i.e. the actual script name minus any
-  language extension.  If <command> is not specified, the events are
-  recorded using the -a (system-wide) 'perf record' option.
-
-  'perf trace report <script> [args]' to run and display the results
-  of <script>.  <script> is the name displayed in the output of 'perf
-  trace --list' i.e. the actual script name minus any language
-  extension.  The perf.data output from a previous run of 'perf trace
-  record <script>' is used and should be present for this command to
-  succeed.  [args] refers to the (mainly optional) args expected by
-  the script.
-
-  'perf trace <script> <required-script-args> <command>' to both
-  record the events required for <script> and to run the <script>
-  using 'live-mode' i.e. without writing anything to disk.  <script>
-  is the name displayed in the output of 'perf trace --list' i.e. the
-  actual script name minus any language extension.  If <command> is
-  not specified, the events are recorded using the -a (system-wide)
-  'perf record' option.  If <script> has any required args, they
-  should be specified before <command>.  This mode doesn't allow for
-  optional script args to be specified; if optional script args are
-  desired, they can be specified using separate 'perf trace record'
-  and 'perf trace report' commands, with the stdout of the record step
-  piped to the stdin of the report script, using the '-o -' and '-i -'
-  options of the corresponding commands.
-
-  'perf trace <top-script>' to both record the events required for
-  <top-script> and to run the <top-script> using 'live-mode'
-  i.e. without writing anything to disk.  <top-script> is the name
-  displayed in the output of 'perf trace --list' i.e. the actual
-  script name minus any language extension; a <top-script> is defined
-  as any script name ending with the string 'top'.
-
-  [<record-options>] can be passed to the record steps of 'perf trace
-  record' and 'live-mode' variants; this isn't possible however for
-  <top-script> 'live-mode' or 'perf trace report' variants.
-
-  See the 'SEE ALSO' section for links to language-specific
-  information on how to write and run your own trace scripts.
-
-OPTIONS
--------
-<command>...::
-       Any command you can specify in a shell.
-
--D::
---dump-raw-trace=::
-        Display verbose dump of the trace data.
-
--L::
---Latency=::
-        Show latency attributes (irqs/preemption disabled, etc).
-
--l::
---list=::
-        Display a list of available trace scripts.
-
--s ['lang']::
---script=::
-        Process trace data with the given script ([lang]:script[.ext]).
-       If the string 'lang' is specified in place of a script name, a
-        list of supported languages will be displayed instead.
-
--g::
---gen-script=::
-        Generate perf-trace.[ext] starter script for given language,
-        using current perf.data.
-
--a::
-        Force system-wide collection.  Scripts run without a <command>
-        normally use -a by default, while scripts run with a <command>
-        normally don't - this option allows the latter to be run in
-        system-wide mode.
-
-
-SEE ALSO
---------
-linkperf:perf-record[1], linkperf:perf-trace-perl[1],
-linkperf:perf-trace-python[1]
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST

index 8c7fc0c8f0b8cd0a77ddb58fb5b712379276ebc8..c12659d8cb26fcefd8449581921aef2962cb651c 100644 (file)
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -7,6 +7,7 @@ include/linux/stringify.h
  lib/rbtree.c
  include/linux/swab.h
  arch/*/include/asm/unistd*.h
+arch/*/lib/memcpy*.S
  include/linux/poison.h
  include/linux/magic.h
  include/linux/hw_breakpoint.h
diff --git a/tools/perf/Makefile b/tools/perf/Makefile

index d1db0f676a4bf14850fa0264e78fe3d482d376dc..1b9b13ee2a726848bbfe2bdd9d81a9a98f8b5940 100644 (file)
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
          ARCH := x86
  endif
  ifeq ($(ARCH),x86_64)
+       RAW_ARCH := x86_64
          ARCH := x86
+       ARCH_CFLAGS := -DARCH_X86_64
+       ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
  endif
  
  # CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h
  LIB_H += util/include/linux/rbtree.h
  LIB_H += util/include/linux/string.h
  LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
  LIB_H += util/include/asm/asm-offsets.h
  LIB_H += util/include/asm/bug.h
  LIB_H += util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h
  LIB_H += util/include/asm/system.h
  LIB_H += util/include/asm/uaccess.h
  LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
  LIB_H += perf.h
  LIB_H += util/cache.h
  LIB_H += util/callchain.h
@@ -390,6 +396,7 @@ LIB_H += util/build-id.h
  LIB_H += util/debug.h
  LIB_H += util/debugfs.h
  LIB_H += util/event.h
+LIB_H += util/evsel.h
  LIB_H += util/exec_cmd.h
  LIB_H += util/types.h
  LIB_H += util/levenshtein.h
@@ -398,6 +405,7 @@ LIB_H += util/parse-options.h
  LIB_H += util/parse-events.h
  LIB_H += util/quote.h
  LIB_H += util/util.h
+LIB_H += util/xyarray.h
  LIB_H += util/header.h
  LIB_H += util/help.h
  LIB_H += util/session.h
@@ -417,6 +425,7 @@ LIB_H += util/probe-finder.h
  LIB_H += util/probe-event.h
  LIB_H += util/pstack.h
  LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)
  
  LIB_OBJS += $(OUTPUT)util/abspath.o
  LIB_OBJS += $(OUTPUT)util/alias.o
@@ -426,6 +435,7 @@ LIB_OBJS += $(OUTPUT)util/ctype.o
  LIB_OBJS += $(OUTPUT)util/debugfs.o
  LIB_OBJS += $(OUTPUT)util/environment.o
  LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evsel.o
  LIB_OBJS += $(OUTPUT)util/exec_cmd.o
  LIB_OBJS += $(OUTPUT)util/help.o
  LIB_OBJS += $(OUTPUT)util/levenshtein.o
@@ -463,6 +473,7 @@ LIB_OBJS += $(OUTPUT)util/sort.o
  LIB_OBJS += $(OUTPUT)util/hist.o
  LIB_OBJS += $(OUTPUT)util/probe-event.o
  LIB_OBJS += $(OUTPUT)util/util.o
+LIB_OBJS += $(OUTPUT)util/xyarray.o
  LIB_OBJS += $(OUTPUT)util/cpumap.o
  
  BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
@@ -472,6 +483,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
  # Benchmark modules
  BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
  BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
  BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
  
  BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -485,7 +499,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-report.o
  BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
  BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
  BUILTIN_OBJS += $(OUTPUT)builtin-top.o
-BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+BUILTIN_OBJS += $(OUTPUT)builtin-script.o
  BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
  BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
  BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
@@ -507,7 +521,7 @@ PERFLIBS = $(LIB_FILE)
  -include config.mak
  
  ifndef NO_DWARF
-FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
+FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
  ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
         msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
         NO_DWARF := 1
@@ -554,7 +568,7 @@ ifndef NO_DWARF
  ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
         msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
  else
-       BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
+       BASIC_CFLAGS += -DDWARF_SUPPORT
         EXTLIBS += -lelf -ldw
         LIB_OBJS += $(OUTPUT)util/probe-finder.o
  endif # PERF_HAVE_DWARF_REGS
@@ -891,13 +905,14 @@ prefix_SQ = $(subst ','\'',$(prefix))
  SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
  PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
  
-LIBS = $(PERFLIBS) $(EXTLIBS)
+LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
  
  BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
         $(COMPAT_CFLAGS)
  LIB_OBJS += $(COMPAT_OBJS)
  
  ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
  ALL_LDFLAGS += $(BASIC_LDFLAGS)
  
  export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h

new file mode 100644 (file)

index 0000000..a72e36c
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc)              \
+       extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h

new file mode 100644 (file)

index 0000000..d588b87
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
+
+MEMCPY_FN(__memcpy,
+       "x86-64-unrolled",
+       "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S

new file mode 100644 (file)

index 0000000..a57b66e
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c

index 38dae7465142fdb86037cbae95a85a3ede9e24ed..db82021f4b91c7172a2fe43b4902e67f022af7b9 100644 (file)
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -12,6 +12,7 @@
  #include "../util/parse-options.h"
  #include "../util/header.h"
  #include "bench.h"
+#include "mem-memcpy-arch.h"
  
  #include <stdio.h>
  #include <stdlib.h>
@@ -23,8 +24,10 @@
  
  static const char      *length_str     = "1MB";
  static const char      *routine        = "default";
-static bool            use_clock       = false;
+static bool            use_clock;
  static int             clock_fd;
+static bool            only_prefault;
+static bool            no_prefault;
  
  static const struct option options[] = {
         OPT_STRING('l', "length", &length_str, "1MB",
@@ -34,19 +37,33 @@ static const struct option options[] = {
                     "Specify routine to copy"),
         OPT_BOOLEAN('c', "clock", &use_clock,
                     "Use CPU clock for measuring"),
+       OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+                   "Show only the result with page faults before memcpy()"),
+       OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+                   "Show only the result without page faults before memcpy()"),
         OPT_END()
  };
  
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
  struct routine {
         const char *name;
         const char *desc;
-       void * (*fn)(void *dst, const void *src, size_t len);
+       memcpy_t fn;
  };
  
  struct routine routines[] = {
         { "default",
           "Default memcpy() provided by glibc",
           memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
         { NULL,
           NULL,
           NULL   }
@@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts)
                 (double)ts->tv_usec / (double)1000000;
  }
  
+static void alloc_mem(void **dst, void **src, size_t length)
+{
+       *dst = zalloc(length);
+       if (!*dst)      /* test the new buffer, not the out-parameter itself */
+               die("memory allocation failed - maybe length is too large?\n");
+
+       *src = zalloc(length);
+       if (!*src)      /* likewise for the source buffer */
+               die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+       u64 clock_start = 0ULL, clock_end = 0ULL;
+       void *src = NULL, *dst = NULL;
+
+       alloc_mem(&dst, &src, len);     /* alloc_mem() takes (dst, src, length) */
+
+       if (prefault)   /* untimed warm-up copy, run before the measured one */
+               fn(dst, src, len);
+
+       clock_start = get_clock();
+       fn(dst, src, len);
+       clock_end = get_clock();
+
+       free(src);
+       free(dst);
+       return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+       struct timeval tv_start, tv_end, tv_diff;
+       void *src = NULL, *dst = NULL;
+
+       alloc_mem(&dst, &src, len);     /* alloc_mem() takes (dst, src, length) */
+
+       if (prefault)   /* untimed warm-up copy, run before the measured one */
+               fn(dst, src, len);
+
+       BUG_ON(gettimeofday(&tv_start, NULL));
+       fn(dst, src, len);
+       BUG_ON(gettimeofday(&tv_end, NULL));
+
+       timersub(&tv_end, &tv_start, &tv_diff);
+
+       free(src);
+       free(dst);
+       return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do {                                      \
+               if ((x) < K)                                    \
+                       printf(" %14lf B/Sec", (x));            \
+               else if ((x) < K * K)                           \
+                       printf(" %14lf KB/Sec", (x) / K);       \
+               else if ((x) < K * K * K)                       \
+                       printf(" %14lf MB/Sec", (x) / K / K);   \
+               else                                            \
+                       printf(" %14lf GB/Sec", (x) / K / K / K); \
+       } while (0)
+
  int bench_mem_memcpy(int argc, const char **argv,
                      const char *prefix __used)
  {
         int i;
-       void *dst, *src;
-       size_t length;
-       double bps = 0.0;
-       struct timeval tv_start, tv_end, tv_diff;
-       u64 clock_start, clock_end, clock_diff;
+       size_t len;
+       double result_bps[2];
+       u64 result_clock[2];
  
-       clock_start = clock_end = clock_diff = 0ULL;
         argc = parse_options(argc, argv, options,
                              bench_mem_memcpy_usage, 0);
  
-       tv_diff.tv_sec = 0;
-       tv_diff.tv_usec = 0;
-       length = (size_t)perf_atoll((char *)length_str);
+       if (use_clock)
+               init_clock();
+
+       len = (size_t)perf_atoll((char *)length_str);
  
-       if ((s64)length <= 0) {
+       result_clock[0] = result_clock[1] = 0ULL;
+       result_bps[0] = result_bps[1] = 0.0;
+
+       if ((s64)len <= 0) {
                 fprintf(stderr, "Invalid length:%s\n", length_str);
                 return 1;
         }
  
+       /* giving both options is the same as giving neither of them */
+       if (only_prefault && no_prefault)
+               only_prefault = no_prefault = false;
+
         for (i = 0; routines[i].name; i++) {
                 if (!strcmp(routines[i].name, routine))
                         break;
@@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv,
                 return 1;
         }
  
-       dst = zalloc(length);
-       if (!dst)
-               die("memory allocation failed - maybe length is too large?\n");
-
-       src = zalloc(length);
-       if (!src)
-               die("memory allocation failed - maybe length is too large?\n");
-
-       if (bench_format == BENCH_FORMAT_DEFAULT) {
-               printf("# Copying %s Bytes from %p to %p ...\n\n",
-                      length_str, src, dst);
-       }
-
-       if (use_clock) {
-               init_clock();
-               clock_start = get_clock();
-       } else {
-               BUG_ON(gettimeofday(&tv_start, NULL));
-       }
-
-       routines[i].fn(dst, src, length);
+       if (bench_format == BENCH_FORMAT_DEFAULT)
+               printf("# Copying %s Bytes ...\n\n", length_str);
  
-       if (use_clock) {
-               clock_end = get_clock();
-               clock_diff = clock_end - clock_start;
+       if (!only_prefault && !no_prefault) {
+               /* show both of results */
+               if (use_clock) {
+                       result_clock[0] =
+                               do_memcpy_clock(routines[i].fn, len, false);
+                       result_clock[1] =
+                               do_memcpy_clock(routines[i].fn, len, true);
+               } else {
+                       result_bps[0] =
+                               do_memcpy_gettimeofday(routines[i].fn,
+                                               len, false);
+                       result_bps[1] =
+                               do_memcpy_gettimeofday(routines[i].fn,
+                                               len, true);
+               }
         } else {
-               BUG_ON(gettimeofday(&tv_end, NULL));
-               timersub(&tv_end, &tv_start, &tv_diff);
-               bps = (double)((double)length / timeval2double(&tv_diff));
+               if (use_clock) {
+                       result_clock[pf] =
+                               do_memcpy_clock(routines[i].fn,
+                                               len, only_prefault);
+               } else {
+                       result_bps[pf] =
+                               do_memcpy_gettimeofday(routines[i].fn,
+                                               len, only_prefault);
+               }
         }
  
         switch (bench_format) {
         case BENCH_FORMAT_DEFAULT:
-               if (use_clock) {
-                       printf(" %14lf Clock/Byte\n",
-                              (double)clock_diff / (double)length);
-               } else {
-                       if (bps < K)
-                               printf(" %14lf B/Sec\n", bps);
-                       else if (bps < K * K)
-                               printf(" %14lfd KB/Sec\n", bps / 1024);
-                       else if (bps < K * K * K)
-                               printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
-                       else {
-                               printf(" %14lf GB/Sec\n",
-                                      bps / 1024 / 1024 / 1024);
+               if (!only_prefault && !no_prefault) {
+                       if (use_clock) {
+                               printf(" %14lf Clock/Byte\n",
+                                       (double)result_clock[0]
+                                       / (double)len);
+                               printf(" %14lf Clock/Byte (with prefault)\n",
+                                       (double)result_clock[1]
+                                       / (double)len);
+                       } else {
+                               print_bps(result_bps[0]);
+                               printf("\n");
+                               print_bps(result_bps[1]);
+                               printf(" (with prefault)\n");
                         }
+               } else {
+                       if (use_clock) {
+                               printf(" %14lf Clock/Byte",
+                                       (double)result_clock[pf]
+                                       / (double)len);
+                       } else
+                               print_bps(result_bps[pf]);
+
+                       printf("%s\n", only_prefault ? " (with prefault)" : "");
                 }
                 break;
         case BENCH_FORMAT_SIMPLE:
-               if (use_clock) {
-                       printf("%14lf\n",
-                              (double)clock_diff / (double)length);
-               } else
-                       printf("%lf\n", bps);
+               if (!only_prefault && !no_prefault) {
+                       if (use_clock) {
+                               printf("%lf %lf\n",
+                                       (double)result_clock[0] / (double)len,
+                                       (double)result_clock[1] / (double)len);
+                       } else {
+                               printf("%lf %lf\n",
+                                       result_bps[0], result_bps[1]);
+                       }
+               } else {
+                       if (use_clock) {
+                               printf("%lf\n", (double)result_clock[pf]
+                                       / (double)len);
+                       } else
+                               printf("%lf\n", result_bps[pf]);
+               }
                 break;
         default:
                 /* reaching this means there's some disaster: */
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c

index 6d5604d8df9599acb55d87017f5d58e19d906395..c056cdc0691258b159665ca3e8c74d2963543ccf 100644 (file)
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -58,12 +58,12 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
         return hist_entry__inc_addr_samples(he, al->addr);
  }
  
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+                               struct perf_session *session)
  {
         struct addr_location al;
-       struct sample_data data;
  
-       if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+       if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
                 pr_warning("problem processing %d event, skipping it.\n",
                            event->header.type);
                 return -1;
@@ -375,6 +375,8 @@ static struct perf_event_ops event_ops = {
         .mmap   = event__process_mmap,
         .comm   = event__process_comm,
         .fork   = event__process_task,
+       .ordered_samples = true,
+       .ordering_requires_timestamps = true,
  };
  
  static int __cmd_annotate(void)
@@ -382,7 +384,7 @@ static int __cmd_annotate(void)
         int ret;
         struct perf_session *session;
  
-       session = perf_session__new(input_name, O_RDONLY, force, false);
+       session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
         if (session == NULL)
                 return -ENOMEM;
  
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c

index c49837de7d3f74eb9254d83e4f6457b11ae38c17..5af32ae9031ec83fc2db926df9360d1ce8f1620c 100644 (file)
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -38,7 +38,8 @@ static int __cmd_buildid_list(void)
  {
         struct perf_session *session;
  
-       session = perf_session__new(input_name, O_RDONLY, force, false);
+       session = perf_session__new(input_name, O_RDONLY, force, false,
+                                   &build_id__mark_dso_hit_ops);
         if (session == NULL)
                 return -1;
  
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c

index fca1d4402910ab13a6f7aa45299e31c0289cb27c..3153e492dbcc29e1593b6df29357424dd012da99 100644 (file)
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -30,12 +30,13 @@ static int hists__add_entry(struct hists *self,
         return -ENOMEM;
  }
  
-static int diff__process_sample_event(event_t *event, struct perf_session *session)
+static int diff__process_sample_event(event_t *event,
+                                     struct sample_data *sample,
+                                     struct perf_session *session)
  {
         struct addr_location al;
-       struct sample_data data = { .period = 1, };
  
-       if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+       if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
                 pr_warning("problem processing %d event, skipping it.\n",
                            event->header.type);
                 return -1;
@@ -44,12 +45,12 @@ static int diff__process_sample_event(event_t *event, struct perf_session *sessi
         if (al.filtered || al.sym == NULL)
                 return 0;
  
-       if (hists__add_entry(&session->hists, &al, data.period)) {
+       if (hists__add_entry(&session->hists, &al, sample->period)) {
                 pr_warning("problem incrementing symbol period, skipping event\n");
                 return -1;
         }
  
-       session->hists.stats.total_period += data.period;
+       session->hists.stats.total_period += sample->period;
         return 0;
  }
  
@@ -60,6 +61,8 @@ static struct perf_event_ops event_ops = {
         .exit   = event__process_task,
         .fork   = event__process_task,
         .lost   = event__process_lost,
+       .ordered_samples = true,
+       .ordering_requires_timestamps = true,
  };
  
  static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
@@ -141,8 +144,8 @@ static int __cmd_diff(void)
         int ret, i;
         struct perf_session *session[2];
  
-       session[0] = perf_session__new(input_old, O_RDONLY, force, false);
-       session[1] = perf_session__new(input_new, O_RDONLY, force, false);
+       session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops);
+       session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops);
         if (session[0] == NULL || session[1] == NULL)
                 return -ENOMEM;
  
@@ -173,7 +176,7 @@ static const char * const diff_usage[] = {
  static const struct option options[] = {
         OPT_INCR('v', "verbose", &verbose,
                     "be more verbose (show symbol address, etc)"),
-       OPT_BOOLEAN('m', "displacement", &show_displacement,
+       OPT_BOOLEAN('M', "displacement", &show_displacement,
                     "Show position displacement relative to baseline"),
         OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                     "dump raw trace in ASCII"),
@@ -191,6 +194,8 @@ static const struct option options[] = {
         OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
                    "separator for columns, no spaces will be added between "
                    "columns '.' is reserved."),
+       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+                   "Look for files with symbols relative to this directory"),
         OPT_END()
  };
  
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c

index 8e3e47b064cea7ddea4e98f0fb98202b35643632..0c78ffa7bf675f46c9e631d9fa8d51fbc71aded4 100644 (file)
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -16,8 +16,8 @@
  static char            const *input_name = "-";
  static bool            inject_build_ids;
  
-static int event__repipe(event_t *event __used,
-                        struct perf_session *session __used)
+static int event__repipe_synth(event_t *event,
+                              struct perf_session *session __used)
  {
         uint32_t size;
         void *buf = event;
@@ -36,22 +36,30 @@ static int event__repipe(event_t *event __used,
         return 0;
  }
  
-static int event__repipe_mmap(event_t *self, struct perf_session *session)
+static int event__repipe(event_t *event, struct sample_data *sample __used,
+                        struct perf_session *session)
+{
+       return event__repipe_synth(event, session); /* 'sample' is not forwarded */
+}
+
+static int event__repipe_mmap(event_t *self, struct sample_data *sample,
+                             struct perf_session *session)
  {
         int err;
  
-       err = event__process_mmap(self, session);
-       event__repipe(self, session);
+       err = event__process_mmap(self, sample, session);
+       event__repipe(self, sample, session);
  
         return err;
  }
  
-static int event__repipe_task(event_t *self, struct perf_session *session)
+static int event__repipe_task(event_t *self, struct sample_data *sample,
+                             struct perf_session *session)
  {
         int err;
  
-       err = event__process_task(self, session);
-       event__repipe(self, session);
+       err = event__process_task(self, sample, session);
+       event__repipe(self, sample, session);
  
         return err;
  }
@@ -61,7 +69,7 @@ static int event__repipe_tracing_data(event_t *self,
  {
         int err;
  
-       event__repipe(self, session);
+       event__repipe_synth(self, session);
         err = event__process_tracing_data(self, session);
  
         return err;
@@ -111,7 +119,8 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session)
         return 0;
  }
  
-static int event__inject_buildid(event_t *event, struct perf_session *session)
+static int event__inject_buildid(event_t *event, struct sample_data *sample,
+                                struct perf_session *session)
  {
         struct addr_location al;
         struct thread *thread;
@@ -146,7 +155,7 @@ static int event__inject_buildid(event_t *event, struct perf_session *session)
         }
  
  repipe:
-       event__repipe(event, session);
+       event__repipe(event, sample, session);
         return 0;
  }
  
@@ -160,10 +169,10 @@ struct perf_event_ops inject_ops = {
         .read           = event__repipe,
         .throttle       = event__repipe,
         .unthrottle     = event__repipe,
-       .attr           = event__repipe,
-       .event_type     = event__repipe,
-       .tracing_data   = event__repipe,
-       .build_id       = event__repipe,
+       .attr           = event__repipe_synth,
+       .event_type     = event__repipe_synth,
+       .tracing_data   = event__repipe_synth,
+       .build_id       = event__repipe_synth,
  };
  
  extern volatile int session_done;
@@ -187,7 +196,7 @@ static int __cmd_inject(void)
                 inject_ops.tracing_data = event__repipe_tracing_data;
         }
  
-       session = perf_session__new(input_name, O_RDONLY, false, true);
+       session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops);
         if (session == NULL)
                 return -ENOMEM;
  
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c

index 31f60a2535e0ec95b60e6b90e3e24818fa0dd972..def7ddc2fd4fbc1b1c57795f4c10c519729b035d 100644 (file)
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -304,22 +304,11 @@ process_raw_event(event_t *raw_event __used, void *data,
         }
  }
  
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+                               struct perf_session *session)
  {
-       struct sample_data data;
-       struct thread *thread;
+       struct thread *thread = perf_session__findnew(session, event->ip.pid);
  
-       memset(&data, 0, sizeof(data));
-       data.time = -1;
-       data.cpu = -1;
-       data.period = 1;
-
-       event__parse_sample(event, session->sample_type, &data);
-
-       dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
-                   data.pid, data.tid, data.ip, data.period);
-
-       thread = perf_session__findnew(session, event->ip.pid);
         if (thread == NULL) {
                 pr_debug("problem processing %d event, skipping it.\n",
                          event->header.type);
@@ -328,8 +317,8 @@ static int process_sample_event(event_t *event, struct perf_session *session)
  
         dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
  
-       process_raw_event(event, data.raw_data, data.cpu,
-                         data.time, thread);
+       process_raw_event(event, sample->raw_data, sample->cpu,
+                         sample->time, thread);
  
         return 0;
  }
@@ -492,7 +481,8 @@ static void sort_result(void)
  static int __cmd_kmem(void)
  {
         int err = -EINVAL;
-       struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+       struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+                                                        0, false, &event_ops);
         if (session == NULL)
                 return -ENOMEM;
  
@@ -747,6 +737,9 @@ static int __cmd_record(int argc, const char **argv)
         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
  
+       if (rec_argv == NULL)
+               return -ENOMEM;
+
         for (i = 0; i < ARRAY_SIZE(record_args); i++)
                 rec_argv[i] = strdup(record_args[i]);
  
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c

index 821c1586a22b7da92cd732ad1a4ee4e05b037085..b9c6e54329713e326d74da9b08164e562e521d03 100644 (file)
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -834,22 +834,18 @@ static void dump_info(void)
                 die("Unknown type of information\n");
  }
  
-static int process_sample_event(event_t *self, struct perf_session *s)
+static int process_sample_event(event_t *self, struct sample_data *sample,
+                               struct perf_session *s)
  {
-       struct sample_data data;
-       struct thread *thread;
+       struct thread *thread = perf_session__findnew(s, sample->tid);
  
-       bzero(&data, sizeof(data));
-       event__parse_sample(self, s->sample_type, &data);
-
-       thread = perf_session__findnew(s, data.tid);
         if (thread == NULL) {
                 pr_debug("problem processing %d event, skipping it.\n",
                         self->header.type);
                 return -1;
         }
  
-       process_raw_event(data.raw_data, data.cpu, data.time, thread);
+       process_raw_event(sample->raw_data, sample->cpu, sample->time, thread);
  
         return 0;
  }
@@ -862,7 +858,7 @@ static struct perf_event_ops eops = {
  
  static int read_events(void)
  {
-       session = perf_session__new(input_name, O_RDONLY, 0, false);
+       session = perf_session__new(input_name, O_RDONLY, 0, false, &eops);
         if (!session)
                 die("Initializing perf session failed\n");
  
@@ -947,6 +943,9 @@ static int __cmd_record(int argc, const char **argv)
         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
  
+       if (rec_argv == NULL)
+               return -ENOMEM;
+
         for (i = 0; i < ARRAY_SIZE(record_args); i++)
                 rec_argv[i] = strdup(record_args[i]);
  
@@ -982,9 +981,9 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used)
                                 usage_with_options(report_usage, report_options);
                 }
                 __cmd_report();
-       } else if (!strcmp(argv[0], "trace")) {
-               /* Aliased to 'perf trace' */
-               return cmd_trace(argc, argv, prefix);
+       } else if (!strcmp(argv[0], "script")) {
+               /* Aliased to 'perf script' */
+               return cmd_script(argc, argv, prefix);
         } else if (!strcmp(argv[0], "info")) {
                 if (argc) {
                         argc = parse_options(argc, argv,
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c

index 564491fa18b27838dd79125954bc744f51f7fe2c..7bc0490354847a949d2f2105cca86e67f8e0b1b4 100644 (file)
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -18,6 +18,7 @@
  
  #include "util/header.h"
  #include "util/event.h"
+#include "util/evsel.h"
  #include "util/debug.h"
  #include "util/session.h"
  #include "util/symbol.h"
@@ -27,17 +28,18 @@
  #include <sched.h>
  #include <sys/mman.h>
  
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+
  enum write_mode_t {
         WRITE_FORCE,
         WRITE_APPEND
  };
  
-static int                     *fd[MAX_NR_CPUS][MAX_COUNTERS];
-
  static u64                     user_interval                   = ULLONG_MAX;
  static u64                     default_interval                =      0;
+static u64                     sample_type;
  
-static int                     nr_cpus                         =      0;
+static struct cpu_map          *cpus;
  static unsigned int            page_size;
  static unsigned int            mmap_pages                      =    128;
  static unsigned int            user_freq                       = UINT_MAX;
@@ -48,11 +50,11 @@ static const char           *output_name                    = "perf.data";
  static int                     group                           =      0;
  static int                     realtime_prio                   =      0;
  static bool                    raw_samples                     =  false;
+static bool                    sample_id_all_avail             =   true;
  static bool                    system_wide                     =  false;
  static pid_t                   target_pid                      =     -1;
  static pid_t                   target_tid                      =     -1;
-static pid_t                   *all_tids                       =      NULL;
-static int                     thread_num                      =      0;
+static struct thread_map       *threads;
  static pid_t                   child_pid                       =     -1;
  static bool                    no_inherit                      =  false;
  static enum write_mode_t       write_mode                      = WRITE_FORCE;
@@ -60,7 +62,9 @@ static bool                   call_graph                      =  false;
  static bool                    inherit_stat                    =  false;
  static bool                    no_samples                      =  false;
  static bool                    sample_address                  =  false;
+static bool                    sample_time                     =  false;
  static bool                    no_buildid                      =  false;
+static bool                    no_buildid_cache                =  false;
  
  static long                    samples                         =      0;
  static u64                     bytes_written                   =      0;
@@ -77,7 +81,6 @@ static struct perf_session    *session;
  static const char              *cpu_list;
  
  struct mmap_data {
-       int                     counter;
         void                    *base;
         unsigned int            mask;
         unsigned int            prev;
@@ -128,6 +131,7 @@ static void write_output(void *buf, size_t size)
  }
  
  static int process_synthesized_event(event_t *event,
+                                    struct sample_data *sample __used,
                                      struct perf_session *self __used)
  {
         write_output(event, event->header.size);
@@ -224,12 +228,12 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
         return h_attr;
  }
  
-static void create_counter(int counter, int cpu)
+static void create_counter(struct perf_evsel *evsel, int cpu)
  {
-       char *filter = filters[counter];
-       struct perf_event_attr *attr = attrs + counter;
+       char *filter = evsel->filter;
+       struct perf_event_attr *attr = &evsel->attr;
         struct perf_header_attr *h_attr;
-       int track = !counter; /* only the first counter needs these */
+       int track = !evsel->idx; /* only the first counter needs these */
         int thread_index;
         int ret;
         struct {
@@ -238,6 +242,19 @@ static void create_counter(int counter, int cpu)
                 u64 time_running;
                 u64 id;
         } read_data;
+       /*
+        * Check if parse_single_tracepoint_event has already asked for
+        * PERF_SAMPLE_TIME.
+        *
+        * XXX this is kludgy but short term fix for problems introduced by
+        * eac23d1c that broke 'perf script' by having different sample_types
+        * when using multiple tracepoint events when we use a perf binary
+        * that tries to use sample_id_all on an older kernel.
+        *
+        * We need to move counter creation to perf_session, support
+        * different sample_types, etc.
+        */
+       bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
  
         attr->read_format       = PERF_FORMAT_TOTAL_TIME_ENABLED |
                                   PERF_FORMAT_TOTAL_TIME_RUNNING |
@@ -280,6 +297,10 @@ static void create_counter(int counter, int cpu)
         if (system_wide)
                 attr->sample_type       |= PERF_SAMPLE_CPU;
  
+       if (sample_id_all_avail &&
+           (sample_time || system_wide || !no_inherit || cpu_list))
+               attr->sample_type       |= PERF_SAMPLE_TIME;
+
         if (raw_samples) {
                 attr->sample_type       |= PERF_SAMPLE_TIME;
                 attr->sample_type       |= PERF_SAMPLE_RAW;
@@ -293,13 +314,14 @@ static void create_counter(int counter, int cpu)
                 attr->disabled = 1;
                 attr->enable_on_exec = 1;
         }
+retry_sample_id:
+       attr->sample_id_all = sample_id_all_avail ? 1 : 0;
  
-       for (thread_index = 0; thread_index < thread_num; thread_index++) {
+       for (thread_index = 0; thread_index < threads->nr; thread_index++) {
  try_again:
-               fd[nr_cpu][counter][thread_index] = sys_perf_event_open(attr,
-                               all_tids[thread_index], cpu, group_fd, 0);
+               FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0);
  
-               if (fd[nr_cpu][counter][thread_index] < 0) {
+               if (FD(evsel, nr_cpu, thread_index) < 0) {
                         int err = errno;
  
                         if (err == EPERM || err == EACCES)
@@ -309,6 +331,15 @@ try_again:
                         else if (err ==  ENODEV && cpu_list) {
                                 die("No such device - did you specify"
                                         " an out-of-range profile CPU?\n");
+                       } else if (err == EINVAL && sample_id_all_avail) {
+                               /*
+                                * Old kernel, no attr->sample_id_type_all field
+                                */
+                               sample_id_all_avail = false;
+                               if (!sample_time && !raw_samples && !time_needed)
+                                       attr->sample_type &= ~PERF_SAMPLE_TIME;
+
+                               goto retry_sample_id;
                         }
  
                         /*
@@ -326,8 +357,8 @@ try_again:
                                 goto try_again;
                         }
                         printf("\n");
-                       error("perfcounter syscall returned with %d (%s)\n",
-                                       fd[nr_cpu][counter][thread_index], strerror(err));
+                       error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
+                             FD(evsel, nr_cpu, thread_index), strerror(err));
  
  #if defined(__i386__) || defined(__x86_64__)
                         if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
@@ -341,7 +372,7 @@ try_again:
                         exit(-1);
                 }
  
-               h_attr = get_header_attr(attr, counter);
+               h_attr = get_header_attr(attr, evsel->idx);
                 if (h_attr == NULL)
                         die("nomem\n");
  
@@ -352,7 +383,7 @@ try_again:
                         }
                 }
  
-               if (read(fd[nr_cpu][counter][thread_index], &read_data, sizeof(read_data)) == -1) {
+               if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
                         perror("Unable to read perf file descriptor");
                         exit(-1);
                 }
@@ -362,43 +393,44 @@ try_again:
                         exit(-1);
                 }
  
-               assert(fd[nr_cpu][counter][thread_index] >= 0);
-               fcntl(fd[nr_cpu][counter][thread_index], F_SETFL, O_NONBLOCK);
+               assert(FD(evsel, nr_cpu, thread_index) >= 0);
+               fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK);
  
                 /*
                  * First counter acts as the group leader:
                  */
                 if (group && group_fd == -1)
-                       group_fd = fd[nr_cpu][counter][thread_index];
-
-               if (counter || thread_index) {
-                       ret = ioctl(fd[nr_cpu][counter][thread_index],
-                                       PERF_EVENT_IOC_SET_OUTPUT,
-                                       fd[nr_cpu][0][0]);
+                       group_fd = FD(evsel, nr_cpu, thread_index);
+
+               if (evsel->idx || thread_index) {
+                       struct perf_evsel *first;
+                       first = list_entry(evsel_list.next, struct perf_evsel, node);
+                       ret = ioctl(FD(evsel, nr_cpu, thread_index),
+                                   PERF_EVENT_IOC_SET_OUTPUT,
+                                   FD(first, nr_cpu, 0));
                         if (ret) {
                                 error("failed to set output: %d (%s)\n", errno,
                                                 strerror(errno));
                                 exit(-1);
                         }
                 } else {
-                       mmap_array[nr_cpu].counter = counter;
                         mmap_array[nr_cpu].prev = 0;
                         mmap_array[nr_cpu].mask = mmap_pages*page_size - 1;
                         mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
-                               PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter][thread_index], 0);
+                               PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0);
                         if (mmap_array[nr_cpu].base == MAP_FAILED) {
                                 error("failed to mmap with %d (%s)\n", errno, strerror(errno));
                                 exit(-1);
                         }
  
-                       event_array[nr_poll].fd = fd[nr_cpu][counter][thread_index];
+                       event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index);
                         event_array[nr_poll].events = POLLIN;
                         nr_poll++;
                 }
  
                 if (filter != NULL) {
-                       ret = ioctl(fd[nr_cpu][counter][thread_index],
-                                       PERF_EVENT_IOC_SET_FILTER, filter);
+                       ret = ioctl(FD(evsel, nr_cpu, thread_index),
+                                   PERF_EVENT_IOC_SET_FILTER, filter);
                         if (ret) {
                                 error("failed to set filter with %d (%s)\n", errno,
                                                 strerror(errno));
@@ -406,15 +438,19 @@ try_again:
                         }
                 }
         }
+
+       if (!sample_type)
+               sample_type = attr->sample_type;
  }
  
  static void open_counters(int cpu)
  {
-       int counter;
+       struct perf_evsel *pos;
  
         group_fd = -1;
-       for (counter = 0; counter < nr_counters; counter++)
-               create_counter(counter, cpu);
+
+       list_for_each_entry(pos, &evsel_list, node)
+               create_counter(pos, cpu);
  
         nr_cpu++;
  }
@@ -437,7 +473,8 @@ static void atexit_header(void)
         if (!pipe_output) {
                 session->header.data_size += bytes_written;
  
-               process_buildids();
+               if (!no_buildid)
+                       process_buildids();
                 perf_header__write(&session->header, output, true);
                 perf_session__delete(session);
                 symbol__exit();
@@ -500,7 +537,7 @@ static void mmap_read_all(void)
  
  static int __cmd_record(int argc, const char **argv)
  {
-       int i, counter;
+       int i;
         struct stat st;
         int flags;
         int err;
@@ -552,19 +589,22 @@ static int __cmd_record(int argc, const char **argv)
         }
  
         session = perf_session__new(output_name, O_WRONLY,
-                                   write_mode == WRITE_FORCE, false);
+                                   write_mode == WRITE_FORCE, false, NULL);
         if (session == NULL) {
                 pr_err("Not enough memory for reading perf file header\n");
                 return -1;
         }
  
+       if (!no_buildid)
+               perf_header__set_feat(&session->header, HEADER_BUILD_ID);
+
         if (!file_new) {
                 err = perf_header__read(session, output);
                 if (err < 0)
                         goto out_delete_session;
         }
  
-       if (have_tracepoints(attrs, nr_counters))
+       if (have_tracepoints(&evsel_list))
                 perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
  
         /*
@@ -612,7 +652,7 @@ static int __cmd_record(int argc, const char **argv)
                 }
  
                 if (!system_wide && target_tid == -1 && target_pid == -1)
-                       all_tids[0] = child_pid;
+                       threads->map[0] = child_pid;
  
                 close(child_ready_pipe[1]);
                 close(go_pipe[0]);
@@ -626,19 +666,15 @@ static int __cmd_record(int argc, const char **argv)
                 close(child_ready_pipe[0]);
         }
  
-       nr_cpus = read_cpu_map(cpu_list);
-       if (nr_cpus < 1) {
-               perror("failed to collect number of CPUs");
-               return -1;
-       }
-
         if (!system_wide && no_inherit && !cpu_list) {
                 open_counters(-1);
         } else {
-               for (i = 0; i < nr_cpus; i++)
-                       open_counters(cpumap[i]);
+               for (i = 0; i < cpus->nr; i++)
+                       open_counters(cpus->map[i]);
         }
  
+       perf_session__set_sample_type(session, sample_type);
+
         if (pipe_output) {
                 err = perf_header__write_pipe(output);
                 if (err < 0)
@@ -651,6 +687,8 @@ static int __cmd_record(int argc, const char **argv)
  
         post_processing_offset = lseek(output, 0, SEEK_CUR);
  
+       perf_session__set_sample_id_all(session, sample_id_all_avail);
+
         if (pipe_output) {
                 err = event__synthesize_attrs(&session->header,
                                               process_synthesized_event,
@@ -667,7 +705,7 @@ static int __cmd_record(int argc, const char **argv)
                         return err;
                 }
  
-               if (have_tracepoints(attrs, nr_counters)) {
+               if (have_tracepoints(&evsel_list)) {
                         /*
                          * FIXME err <= 0 here actually means that
                          * there were no tracepoints so its not really
@@ -676,8 +714,7 @@ static int __cmd_record(int argc, const char **argv)
                          * return this more properly and also
                          * propagate errors that now are calling die()
                          */
-                       err = event__synthesize_tracing_data(output, attrs,
-                                                            nr_counters,
+                       err = event__synthesize_tracing_data(output, &evsel_list,
                                                              process_synthesized_event,
                                                              session);
                         if (err <= 0) {
@@ -751,13 +788,13 @@ static int __cmd_record(int argc, const char **argv)
  
                 if (done) {
                         for (i = 0; i < nr_cpu; i++) {
-                               for (counter = 0;
-                                       counter < nr_counters;
-                                       counter++) {
+                               struct perf_evsel *pos;
+
+                               list_for_each_entry(pos, &evsel_list, node) {
                                         for (thread = 0;
-                                               thread < thread_num;
+                                               thread < threads->nr;
                                                 thread++)
-                                               ioctl(fd[i][counter][thread],
+                                               ioctl(FD(pos, i, thread),
                                                         PERF_EVENT_IOC_DISABLE);
                                 }
                         }
@@ -831,16 +868,20 @@ const struct option record_options[] = {
                     "per thread counts"),
         OPT_BOOLEAN('d', "data", &sample_address,
                     "Sample addresses"),
+       OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
         OPT_BOOLEAN('n', "no-samples", &no_samples,
                     "don't sample"),
-       OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid,
+       OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
                     "do not update the buildid cache"),
+       OPT_BOOLEAN('B', "no-buildid", &no_buildid,
+                   "do not collect buildids in perf.data"),
         OPT_END()
  };
  
  int cmd_record(int argc, const char **argv, const char *prefix __used)
  {
-       int i, j, err = -ENOMEM;
+       int err = -ENOMEM;
+       struct perf_evsel *pos;
  
         argc = parse_options(argc, argv, record_options, record_usage,
                             PARSE_OPT_STOP_AT_NON_OPTION);
@@ -859,41 +900,36 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
         }
  
         symbol__init();
-       if (no_buildid)
+
+       if (no_buildid_cache || no_buildid)
                 disable_buildid_cache();
  
-       if (!nr_counters) {
-               nr_counters     = 1;
-               attrs[0].type   = PERF_TYPE_HARDWARE;
-               attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
+       if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) {
+               pr_err("Not enough memory for event selector list\n");
+               goto out_symbol_exit;
         }
  
-       if (target_pid != -1) {
+       if (target_pid != -1)
                 target_tid = target_pid;
-               thread_num = find_all_tid(target_pid, &all_tids);
-               if (thread_num <= 0) {
-                       fprintf(stderr, "Can't find all threads of pid %d\n",
-                                       target_pid);
-                       usage_with_options(record_usage, record_options);
-               }
-       } else {
-               all_tids=malloc(sizeof(pid_t));
-               if (!all_tids)
-                       goto out_symbol_exit;
  
-               all_tids[0] = target_tid;
-               thread_num = 1;
+       threads = thread_map__new(target_pid, target_tid);
+       if (threads == NULL) {
+               pr_err("Problems finding threads of monitor\n");
+               usage_with_options(record_usage, record_options);
         }
  
-       for (i = 0; i < MAX_NR_CPUS; i++) {
-               for (j = 0; j < MAX_COUNTERS; j++) {
-                       fd[i][j] = malloc(sizeof(int)*thread_num);
-                       if (!fd[i][j])
-                               goto out_free_fd;
-               }
+       cpus = cpu_map__new(cpu_list);
+       if (cpus == NULL) {
+               perror("failed to parse CPUs map");
+               return -1;
         }
-       event_array = malloc(
-               sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+
+       list_for_each_entry(pos, &evsel_list, node) {
+               if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+                       goto out_free_fd;
+       }
+       event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS *
+                             MAX_COUNTERS * threads->nr));
         if (!event_array)
                 goto out_free_fd;
  
@@ -920,12 +956,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
  out_free_event_array:
         free(event_array);
  out_free_fd:
-       for (i = 0; i < MAX_NR_CPUS; i++) {
-               for (j = 0; j < MAX_COUNTERS; j++)
-                       free(fd[i][j]);
-       }
-       free(all_tids);
-       all_tids = NULL;
+       thread_map__delete(threads);
+       threads = NULL;
  out_symbol_exit:
         symbol__exit();
         return err;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c

index 5de405d452300318541338293563d8ebc41ccb87..75183a4518e60d23db05e36e585fc178df731a19 100644 (file)
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -150,13 +150,13 @@ static int add_event_total(struct perf_session *session,
         return 0;
  }
  
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+                               struct perf_session *session)
  {
-       struct sample_data data = { .period = 1, };
         struct addr_location al;
         struct perf_event_attr *attr;
  
-       if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+       if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
                 fprintf(stderr, "problem processing %d event, skipping it.\n",
                         event->header.type);
                 return -1;
@@ -165,14 +165,14 @@ static int process_sample_event(event_t *event, struct perf_session *session)
         if (al.filtered || (hide_unresolved && al.sym == NULL))
                 return 0;
  
-       if (perf_session__add_hist_entry(session, &al, &data)) {
+       if (perf_session__add_hist_entry(session, &al, sample)) {
                 pr_debug("problem incrementing symbol period, skipping event\n");
                 return -1;
         }
  
-       attr = perf_header__find_attr(data.id, &session->header);
+       attr = perf_header__find_attr(sample->id, &session->header);
  
-       if (add_event_total(session, &data, attr)) {
+       if (add_event_total(session, sample, attr)) {
                 pr_debug("problem adding event period\n");
                 return -1;
         }
@@ -180,7 +180,8 @@ static int process_sample_event(event_t *event, struct perf_session *session)
         return 0;
  }
  
-static int process_read_event(event_t *event, struct perf_session *session __used)
+static int process_read_event(event_t *event, struct sample_data *sample __used,
+                             struct perf_session *session __used)
  {
         struct perf_event_attr *attr;
  
@@ -243,6 +244,8 @@ static struct perf_event_ops event_ops = {
         .event_type = event__process_event_type,
         .tracing_data = event__process_tracing_data,
         .build_id = event__process_build_id,
+       .ordered_samples = true,
+       .ordering_requires_timestamps = true,
  };
  
  extern volatile int session_done;
@@ -307,7 +310,7 @@ static int __cmd_report(void)
  
         signal(SIGINT, sig_handler);
  
-       session = perf_session__new(input_name, O_RDONLY, force, false);
+       session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
         if (session == NULL)
                 return -ENOMEM;
  
@@ -442,6 +445,8 @@ static const struct option options[] = {
                     "dump raw trace in ASCII"),
         OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
                    "file", "vmlinux pathname"),
+       OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
+                  "file", "kallsyms pathname"),
         OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
         OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
                     "load module symbols - WARNING: use only with -k and LIVE kernel"),
@@ -478,6 +483,8 @@ static const struct option options[] = {
                    "columns '.' is reserved."),
         OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
                     "Only display entries resolved to a symbol"),
+       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+                   "Look for files with symbols relative to this directory"),
         OPT_END()
  };
  
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c

index 55f3b5dcc731417198a2e5fd29ac8eefd96e1a5e..7a4ebeb8b016b4ca01c14f393b3c5fab98567523 100644 (file)
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1606,25 +1606,15 @@ process_raw_event(event_t *raw_event __used, struct perf_session *session,
                 process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
  }
  
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+                               struct perf_session *session)
  {
-       struct sample_data data;
         struct thread *thread;
  
         if (!(session->sample_type & PERF_SAMPLE_RAW))
                 return 0;
  
-       memset(&data, 0, sizeof(data));
-       data.time = -1;
-       data.cpu = -1;
-       data.period = -1;
-
-       event__parse_sample(event, session->sample_type, &data);
-
-       dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
-                   data.pid, data.tid, data.ip, data.period);
-
-       thread = perf_session__findnew(session, data.pid);
+       thread = perf_session__findnew(session, sample->pid);
         if (thread == NULL) {
                 pr_debug("problem processing %d event, skipping it.\n",
                          event->header.type);
@@ -1633,10 +1623,11 @@ static int process_sample_event(event_t *event, struct perf_session *session)
  
         dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
  
-       if (profile_cpu != -1 && profile_cpu != (int)data.cpu)
+       if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
                 return 0;
  
-       process_raw_event(event, session, data.raw_data, data.cpu, data.time, thread);
+       process_raw_event(event, session, sample->raw_data, sample->cpu,
+                         sample->time, thread);
  
         return 0;
  }
@@ -1652,7 +1643,8 @@ static struct perf_event_ops event_ops = {
  static int read_events(void)
  {
         int err = -EINVAL;
-       struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+       struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+                                                        0, false, &event_ops);
         if (session == NULL)
                 return -ENOMEM;
  
@@ -1869,6 +1861,9 @@ static int __cmd_record(int argc, const char **argv)
         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
  
+       if (rec_argv == NULL)   /* calloc() failed: nothing to fill in */
+               return -ENOMEM;
+
         for (i = 0; i < ARRAY_SIZE(record_args); i++)
                 rec_argv[i] = strdup(record_args[i]);
  
@@ -1888,10 +1883,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
                 usage_with_options(sched_usage, sched_options);
  
         /*
-        * Aliased to 'perf trace' for now:
+        * Aliased to 'perf script' for now:
          */
-       if (!strcmp(argv[0], "trace"))
-               return cmd_trace(argc, argv, prefix);
+       if (!strcmp(argv[0], "script"))
+               return cmd_script(argc, argv, prefix);
  
         symbol__init();
         if (!strncmp(argv[0], "rec", 3)) {
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c

new file mode 100644 (file)

index 0000000..150a606
--- /dev/null
+++ b/tools/perf/builtin-script.c
@@ -0,0 +1,821 @@
+#include "builtin.h"
+
+#include "perf.h"
+#include "util/cache.h"
+#include "util/debug.h"
+#include "util/exec_cmd.h"
+#include "util/header.h"
+#include "util/parse-options.h"
+#include "util/session.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/trace-event.h"
+#include "util/parse-options.h"
+#include "util/util.h"
+
+static char const              *script_name;
+static char const              *generate_script_lang;
+static bool                    debug_mode;
+static u64                     last_timestamp;
+static u64                     nr_unordered;
+extern const struct option     record_options[];
+
+/*
+ * ->start_script hook of default_scripting_ops: takes no action, ignores
+ * all of its arguments and always returns 0.
+ */
+static int default_start_script(const char *script __unused, int argc __unused,
+                               const char **argv __unused)
+{
+       return 0;
+}
+
+/* ->stop_script hook of default_scripting_ops: nothing to stop, returns 0. */
+static int default_stop_script(void)
+{
+       return 0;
+}
+
+/*
+ * ->generate_script hook of default_scripting_ops: writes nothing to
+ * @outfile and always returns 0.
+ */
+static int default_generate_script(const char *outfile __unused)
+{
+       return 0;
+}
+
+/*
+ * Hook table that setup_scripting() installs as the initial scripting_ops:
+ * events go to print_event(), every other hook is a no-op returning 0.
+ */
+static struct scripting_ops default_scripting_ops = {
+       .process_event          = print_event,
+       .generate_script        = default_generate_script,
+       .start_script           = default_start_script,
+       .stop_script            = default_stop_script,
+};
+
+static struct scripting_ops    *scripting_ops;
+
+static void setup_scripting(void)      /* one-time init of the scripting layer */
+{
+       setup_perl_scripting();         /* defined elsewhere; presumably registers the Perl specs */
+       setup_python_scripting();       /* defined elsewhere; presumably registers the Python specs */
+
+       scripting_ops = &default_scripting_ops; /* default hooks until parse_scriptname() picks others */
+}
+
+/*
+ * Emit the "stopped" debug message, then run the ->stop_script hook of the
+ * currently selected scripting_ops and hand its return value to the caller.
+ */
+static int cleanup_scripting(void)
+{
+       int ret;
+
+       pr_debug("\nperf script stopped\n");
+       ret = scripting_ops->stop_script();
+
+       return ret;
+}
+
+static char const              *input_name = "perf.data";
+
+/*
+ * ->sample callback of event_ops.
+ *
+ * Looks up (or creates) the thread for event->ip.pid; returns -1 if that
+ * fails. Samples without PERF_SAMPLE_RAW only add their period to the
+ * session total. Raw samples are either checked for timestamp ordering
+ * (debug_mode, which returns before the period is accounted) or passed to
+ * the ->process_event hook of scripting_ops and then accounted.
+ */
+static int process_sample_event(event_t *event, struct sample_data *sample,
+                               struct perf_session *session)
+{
+       struct thread *thread;
+
+       thread = perf_session__findnew(session, event->ip.pid);
+       if (!thread) {
+               pr_debug("problem processing %d event, skipping it.\n",
+                        event->header.type);
+               return -1;
+       }
+
+       if (!(session->sample_type & PERF_SAMPLE_RAW))
+               goto out_account;
+
+       if (debug_mode) {
+               /* ordering check only: the sample itself is not processed */
+               if (sample->time < last_timestamp) {
+                       pr_err("Samples misordered, previous: %llu "
+                               "this: %llu\n", last_timestamp,
+                               sample->time);
+                       nr_unordered++;
+               }
+               last_timestamp = sample->time;
+               return 0;
+       }
+
+       /*
+        * FIXME: better resolve from pid from the struct trace_entry
+        * field, although it should be the same than this perf
+        * event pid
+        */
+       scripting_ops->process_event(sample->cpu, sample->raw_data,
+                                    sample->raw_size,
+                                    sample->time, thread->comm);
+
+out_account:
+       session->hists.stats.total_period += sample->period;
+       return 0;
+}
+
+/*
+ * Callbacks passed to perf_session__process_events() by __cmd_script();
+ * both sample-ordering flags are switched on.
+ */
+static struct perf_event_ops event_ops = {
+       .sample                         = process_sample_event,
+       .comm                           = event__process_comm,
+       .attr                           = event__process_attr,
+       .event_type                     = event__process_event_type,
+       .tracing_data                   = event__process_tracing_data,
+       .build_id                       = event__process_build_id,
+       .ordered_samples                = true,
+       .ordering_requires_timestamps   = true,
+};
+
+extern volatile int session_done;
+
+/* Signal handler: raise the session_done flag; the signal number is unused. */
+static void sig_handler(int sig __unused)
+{
+       session_done = 1;
+}
+
+/*
+ * Install sig_handler() for SIGINT and run every event of @session through
+ * event_ops. In debug_mode the number of misordered samples is reported
+ * afterwards. Returns the result of perf_session__process_events().
+ */
+static int __cmd_script(struct perf_session *session)
+{
+       int err;
+
+       signal(SIGINT, sig_handler);
+       err = perf_session__process_events(session, &event_ops);
+
+       if (!debug_mode)
+               return err;
+
+       pr_err("Misordered timestamps: %llu\n", nr_unordered);
+       return err;
+}
+
+struct script_spec {                   /* one registered scripting-language spec */
+       struct list_head        node;   /* link in the script_specs list */
+       struct scripting_ops    *ops;   /* hooks handed out by script_spec__lookup() */
+       char                    spec[0]; /* NUL-terminated spec string, allocated inline after the struct */
+};
+
+static LIST_HEAD(script_specs);
+
+/*
+ * Allocate a script_spec with a private copy of @spec stored inline after
+ * the struct, and remember @ops in it. The node is not linked into any list.
+ * Returns NULL if the allocation fails.
+ */
+static struct script_spec *script_spec__new(const char *spec,
+                                           struct scripting_ops *ops)
+{
+       size_t size = strlen(spec) + 1;         /* string plus terminator */
+       struct script_spec *s;
+
+       s = malloc(sizeof(*s) + size);
+       if (s == NULL)
+               return NULL;
+
+       memcpy(s->spec, spec, size);
+       s->ops = ops;
+
+       return s;
+}
+
+/*
+ * Release a spec obtained from script_spec__new().
+ *
+ * s->spec is a trailing array living inside the same malloc() block as the
+ * struct itself, so it must never be handed to free() on its own: a single
+ * free() of the struct releases everything. free(NULL) is a no-op, so a
+ * NULL @s is harmless too.
+ */
+static void script_spec__delete(struct script_spec *s)
+{
+       free(s);
+}
+
+/* Link @s at the tail of the script_specs list. */
+static void script_spec__add(struct script_spec *s)
+{
+       list_add_tail(&s->node, &script_specs);
+}
+
+/*
+ * Walk script_specs and return the first entry whose spec string matches
+ * @spec, compared case-insensitively; NULL when there is no match.
+ */
+static struct script_spec *script_spec__find(const char *spec)
+{
+       struct script_spec *pos;
+
+       list_for_each_entry(pos, &script_specs, node) {
+               if (!strcasecmp(pos->spec, spec))
+                       return pos;
+       }
+
+       return NULL;
+}
+
+static struct script_spec *script_spec__findnew(const char *spec,
+                                               struct scripting_ops *ops)
+{
+       struct script_spec *s = script_spec__find(spec);
+
+       if (s)                          /* already registered: reuse the existing entry */
+               return s;
+
+       s = script_spec__new(spec, ops);
+       if (!s)                         /* allocation failed */
+               goto out_delete_spec;
+
+       script_spec__add(s);            /* new entry goes on the script_specs list */
+
+       return s;
+
+out_delete_spec:                       /* only ever reached with s == NULL */
+       script_spec__delete(s);         /* NOTE(review): safe only if script_spec__delete() tolerates NULL - confirm */
+
+       return NULL;
+}
+
+/*
+ * Register @ops under the name @spec.
+ *
+ * Returns 0 on success, -1 if @spec is already registered or if a new entry
+ * could not be created.
+ */
+int script_spec_register(const char *spec, struct scripting_ops *ops)
+{
+       if (script_spec__find(spec) != NULL)
+               return -1;
+
+       return script_spec__findnew(spec, ops) != NULL ? 0 : -1;
+}
+
+/* Return the ops registered under @spec, or NULL if @spec is unknown. */
+static struct scripting_ops *script_spec__lookup(const char *spec)
+{
+       struct script_spec *found = script_spec__find(spec);
+
+       return found ? found->ops : NULL;
+}
+
+/*
+ * Print to stderr one line per entry of script_specs: the spec string
+ * followed by the name of its ops, framed by a heading and blank lines.
+ */
+static void list_available_languages(void)
+{
+       struct script_spec *pos;
+
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Scripting language extensions (used in "
+               "perf script -s [spec:]script.[spec]):\n\n");
+
+       list_for_each_entry(pos, &script_specs, node) {
+               fprintf(stderr, "  %-42s [%s]\n", pos->spec, pos->ops->name);
+       }
+
+       fprintf(stderr, "\n");
+}
+
+/*
+ * Option callback for the script name. @str is one of:
+ *   "lang"             - list the registered languages and exit(0)
+ *   "<spec>:<script>"  - explicit language spec in front of the script path
+ *   "<script>.<ext>"   - language derived from the file extension
+ *
+ * On success scripting_ops points at the ops registered for the spec or
+ * extension, script_name holds a strdup()ed copy of the script path, and 0
+ * is returned. On any failure -1 is returned and both globals keep their
+ * previous values, so a rejected option never leaves scripting_ops NULL.
+ */
+static int parse_scriptname(const struct option *opt __used,
+                           const char *str, int unset __used)
+{
+       char spec[PATH_MAX];
+       const char *script, *ext;
+       struct scripting_ops *ops;
+       char *name;
+       int len;
+
+       if (strcmp(str, "lang") == 0) {
+               list_available_languages();
+               exit(0);
+       }
+
+       script = strchr(str, ':');
+       if (script) {
+               len = script - str;
+               if (len >= PATH_MAX) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+               /* len < sizeof(spec), so the terminator below always fits */
+               memcpy(spec, str, len);
+               spec[len] = '\0';
+               ops = script_spec__lookup(spec);
+               if (!ops) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+               script++;               /* step over the ':' */
+       } else {
+               script = str;
+               ext = strrchr(script, '.');
+               if (!ext) {
+                       fprintf(stderr, "invalid script extension");
+                       return -1;
+               }
+               ops = script_spec__lookup(++ext);
+               if (!ops) {
+                       fprintf(stderr, "invalid script extension");
+                       return -1;
+               }
+       }
+
+       name = strdup(script);
+       if (name == NULL) {
+               fprintf(stderr, "not enough memory for script name\n");
+               return -1;
+       }
+
+       /* everything succeeded: publish both results together */
+       scripting_ops = ops;
+       script_name = name;
+
+       return 0;
+}
+
+/*
+ * Helper function for filesystems that return a dent->d_type DT_UNKNOWN.
+ *
+ * Builds "<base_path>/<d_name>" and stat()s it.  Returns non-zero when the
+ * result is a directory, 0 otherwise (including when the joined path does
+ * not fit in PATH_MAX or stat() fails).
+ */
+static int is_directory(const char *base_path, const struct dirent *dent)
+{
+       char path[PATH_MAX];
+       struct stat st;
+       int len;
+
+       /* bounded: base_path plus d_name may exceed the buffer */
+       len = snprintf(path, sizeof(path), "%s/%s", base_path, dent->d_name);
+       if (len < 0 || (size_t)len >= sizeof(path))
+               return 0;
+
+       if (stat(path, &st))
+               return 0;
+
+       return S_ISDIR(st.st_mode);
+}
+
+/*
+ * Iterate over the entries of the open directory scripts_dir with
+ * readdir_r(), storing each entry in lang_dirent, until readdir_r() fails
+ * or sets lang_next to NULL.  The statement following the macro runs only
+ * for entries that are directories (d_type DT_DIR, or DT_UNKNOWN with
+ * is_directory() confirming) and are not "." or "..".
+ *
+ * The expansion is "while (...) if (...)": the body binds to the if, and
+ * break/continue inside the body act on the hidden while loop.
+ */
+#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
+       while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) &&     \
+              lang_next)                                               \
+               if ((lang_dirent.d_type == DT_DIR ||                    \
+                    (lang_dirent.d_type == DT_UNKNOWN &&               \
+                     is_directory(scripts_path, &lang_dirent))) &&     \
+                   (strcmp(lang_dirent.d_name, ".")) &&                \
+                   (strcmp(lang_dirent.d_name, "..")))
+
+/*
+ * Same iteration scheme over lang_dir, but the body runs only for entries
+ * that are NOT directories (d_type is neither DT_DIR nor a DT_UNKNOWN that
+ * is_directory() reports as a directory).  Also expands to
+ * "while (...) if (...)".
+ */
+#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
+       while (!readdir_r(lang_dir, &script_dirent, &script_next) &&    \
+              script_next)                                             \
+               if (script_dirent.d_type != DT_DIR &&                   \
+                   (script_dirent.d_type != DT_UNKNOWN ||              \
+                    !is_directory(lang_path, &script_dirent)))
+
+
+/* File-name suffixes matched with ends_with() when scanning script dirs */
+#define RECORD_SUFFIX                  "-record"
+#define REPORT_SUFFIX                  "-report"
+
+/* Description of one script, as gathered by read_script_info() */
+struct script_desc {
+       struct list_head        node;           /* link in script_descs */
+       char                    *name;          /* strdup'ed name, may be NULL */
+       char                    *half_liner;    /* text after "description:" */
+       char                    *args;          /* text after "args:" */
+};
+
+/* All descriptors added through script_desc__add() */
+static LIST_HEAD(script_descs);
+
+/*
+ * Allocate a zeroed script_desc.  When @name is non-NULL the descriptor
+ * gets its own copy of it; a NULL @name leaves ->name NULL.
+ *
+ * Returns NULL if either allocation fails, so that a descriptor created
+ * with a name never ends up with a NULL ->name (script_desc__find() passes
+ * ->name straight to strcasecmp()).
+ */
+static struct script_desc *script_desc__new(const char *name)
+{
+       struct script_desc *s = zalloc(sizeof(*s));
+
+       if (s == NULL)
+               return NULL;
+
+       if (name) {
+               s->name = strdup(name);
+               if (s->name == NULL) {
+                       free(s);
+                       return NULL;
+               }
+       }
+
+       return s;
+}
+
+/*
+ * Free a descriptor and the strings it owns.  Like free(), accepts NULL
+ * and does nothing in that case.
+ */
+static void script_desc__delete(struct script_desc *s)
+{
+       if (s == NULL)
+               return;
+
+       free(s->name);
+       free(s->half_liner);
+       free(s->args);
+       free(s);
+}
+
+/* Append @s to the tail of the script_descs list. */
+static void script_desc__add(struct script_desc *s)
+{
+       struct list_head *entry = &s->node;
+
+       list_add_tail(entry, &script_descs);
+}
+
+/*
+ * Return the first entry of script_descs whose name equals @name, compared
+ * case-insensitively, or NULL if there is none.
+ */
+static struct script_desc *script_desc__find(const char *name)
+{
+       struct script_desc *pos;
+
+       list_for_each_entry(pos, &script_descs, node) {
+               if (strcasecmp(pos->name, name) != 0)
+                       continue;
+               return pos;
+       }
+
+       return NULL;
+}
+
+/*
+ * Return the descriptor already listed under @name, or create one with
+ * script_desc__new(), add it to script_descs and return it.
+ *
+ * Returns NULL when the new descriptor cannot be allocated.
+ */
+static struct script_desc *script_desc__findnew(const char *name)
+{
+       struct script_desc *s = script_desc__find(name);
+
+       if (s)
+               return s;
+
+       s = script_desc__new(name);
+       /*
+        * Allocation failed: s is NULL and script_desc__delete() reads
+        * s->name, so there is nothing to (and nothing safe to) delete.
+        */
+       if (!s)
+               return NULL;
+
+       script_desc__add(s);
+
+       return s;
+}
+
+/*
+ * If @str is strictly longer than @suffix and ends with it, return a
+ * pointer to where the suffix starts inside @str; otherwise return NULL.
+ */
+static const char *ends_with(const char *str, const char *suffix)
+{
+       size_t str_len = strlen(str);
+       size_t suffix_len = strlen(suffix);
+       const char *tail;
+
+       /* a string that is only the suffix (or shorter) does not qualify */
+       if (str_len <= suffix_len)
+               return NULL;
+
+       tail = str + (str_len - suffix_len);
+
+       return strncmp(tail, suffix, suffix_len) == 0 ? tail : NULL;
+}
+
+/*
+ * Return a pointer to the first character of @str that is not white
+ * space (the terminating NUL if the string is all white space).
+ */
+static char *ltrim(char *str)
+{
+       char *p;
+
+       for (p = str; *p != '\0' && isspace(*p); p++)
+               ;
+
+       return p;
+}
+
+/*
+ * Scan @filename for comment lines of the form "# description: ..." and
+ * "# args: ..." and store a copy of the text that follows in
+ * desc->half_liner and desc->args respectively.  Lines that do not start
+ * (after leading white space) with '#', and "#!" lines, are ignored.
+ *
+ * When a tag occurs more than once the last occurrence wins; the earlier
+ * copy is freed rather than leaked.
+ *
+ * Returns 0 on success, -1 if @desc is NULL or the file cannot be opened.
+ */
+static int read_script_info(struct script_desc *desc, const char *filename)
+{
+       char line[BUFSIZ], *p;
+       size_t len;
+       FILE *fp;
+
+       if (!desc)
+               return -1;
+
+       fp = fopen(filename, "r");
+       if (!fp)
+               return -1;
+
+       while (fgets(line, sizeof(line), fp)) {
+               p = ltrim(line);
+               /* covers empty lines as well: '\0' is not '#' */
+               if (*p != '#')
+                       continue;
+               p++;
+               if (*p == '!')
+                       continue;
+
+               p = ltrim(p);
+               len = strlen(p);
+               /* fgets() keeps the newline; drop it */
+               if (len && p[len - 1] == '\n')
+                       p[len - 1] = '\0';
+
+               if (!strncmp(p, "description:", strlen("description:"))) {
+                       p += strlen("description:");
+                       free(desc->half_liner);
+                       desc->half_liner = strdup(ltrim(p));
+                       continue;
+               }
+
+               if (!strncmp(p, "args:", strlen("args:"))) {
+                       p += strlen("args:");
+                       free(desc->args);
+                       desc->args = strdup(ltrim(p));
+                       continue;
+               }
+       }
+
+       fclose(fp);
+
+       return 0;
+}
+
+/*
+ * Option callback for -l/--list.
+ *
+ * Walks <exec-path>/scripts/<lang>/bin for every language directory,
+ * creates a descriptor for each file whose name ends in REPORT_SUFFIX
+ * (keyed by the name with the suffix removed), fills it in with
+ * read_script_info(), prints the resulting list on stdout and exits.
+ *
+ * Returns -1 only when the scripts directory cannot be opened; otherwise
+ * it does not return.
+ */
+static int list_available_scripts(const struct option *opt __used,
+                                 const char *s __used, int unset __used)
+{
+       struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+       char scripts_path[MAXPATHLEN];
+       DIR *scripts_dir, *lang_dir;
+       char script_path[MAXPATHLEN];
+       char lang_path[MAXPATHLEN];
+       struct script_desc *desc;
+       char first_half[BUFSIZ];
+       char *script_root;
+       char *str;
+
+       snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+       scripts_dir = opendir(scripts_path);
+       if (!scripts_dir)
+               return -1;
+
+       for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+               snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+                        lang_dirent.d_name);
+               lang_dir = opendir(lang_path);
+               if (!lang_dir)
+                       continue;
+
+               for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+                       script_root = strdup(script_dirent.d_name);
+                       /* ends_with() would dereference a NULL copy */
+                       if (!script_root)
+                               continue;
+                       str = (char *)ends_with(script_root, REPORT_SUFFIX);
+                       if (str) {
+                               /* cut the suffix off to get the script name */
+                               *str = '\0';
+                               desc = script_desc__findnew(script_root);
+                               if (desc) {
+                                       snprintf(script_path, MAXPATHLEN,
+                                                "%s/%s", lang_path,
+                                                script_dirent.d_name);
+                                       read_script_info(desc, script_path);
+                               }
+                       }
+                       free(script_root);
+               }
+               /* one handle per language directory: release it */
+               closedir(lang_dir);
+       }
+       closedir(scripts_dir);
+
+       fprintf(stdout, "List of available trace scripts:\n");
+       list_for_each_entry(desc, &script_descs, node) {
+               /* name plus args may not fit: truncate instead of overflow */
+               snprintf(first_half, sizeof(first_half), "%s %s", desc->name,
+                        desc->args ? desc->args : "");
+               fprintf(stdout, "  %-36s %s\n", first_half,
+                       desc->half_liner ? desc->half_liner : "");
+       }
+
+       exit(0);
+}
+
+/*
+ * Look in every <exec-path>/scripts/<lang>/bin directory for a file named
+ * <script_root><suffix>.
+ *
+ * Returns a heap-allocated path that the caller owns, or NULL when the
+ * scripts directory cannot be opened or nothing matches.  The search stops
+ * within a language directory at its first match but still visits the
+ * remaining language directories; a match found later replaces an earlier
+ * one (the earlier string is freed).
+ */
+static char *get_script_path(const char *script_root, const char *suffix)
+{
+       struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+       char scripts_path[MAXPATHLEN];
+       char script_path[MAXPATHLEN];
+       DIR *scripts_dir, *lang_dir;
+       char lang_path[MAXPATHLEN];
+       char *str, *__script_root;
+       char *path = NULL;
+
+       snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+       scripts_dir = opendir(scripts_path);
+       if (!scripts_dir)
+               return NULL;
+
+       for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+               snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+                        lang_dirent.d_name);
+               lang_dir = opendir(lang_path);
+               if (!lang_dir)
+                       continue;
+
+               for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+                       __script_root = strdup(script_dirent.d_name);
+                       if (!__script_root)
+                               continue;
+                       str = (char *)ends_with(__script_root, suffix);
+                       if (str) {
+                               *str = '\0';
+                               if (!strcmp(__script_root, script_root)) {
+                                       snprintf(script_path, MAXPATHLEN,
+                                                "%s/%s", lang_path,
+                                                script_dirent.d_name);
+                                       /* drop a hit from an earlier dir */
+                                       free(path);
+                                       path = strdup(script_path);
+                                       free(__script_root);
+                                       break;
+                               }
+                       }
+                       /* reached for non-matching names too: no leak */
+                       free(__script_root);
+               }
+               closedir(lang_dir);
+       }
+       closedir(scripts_dir);
+
+       return path;
+}
+
+/* True when @script_path is longer than "top" and ends with it. */
+static bool is_top_script(const char *script_path)
+{
+       return ends_with(script_path, "top") != NULL;
+}
+
+/*
+ * Count the '<' characters in the "args:" line of the script at
+ * @script_path, as read by read_script_info().
+ *
+ * Returns 0 when the script cannot be read, has no "args:" line, or the
+ * temporary descriptor cannot be allocated.
+ */
+static int has_required_arg(char *script_path)
+{
+       struct script_desc *desc;
+       int n_args = 0;
+       char *p;
+
+       desc = script_desc__new(NULL);
+       /* nothing to read into and nothing to delete */
+       if (!desc)
+               return 0;
+
+       if (read_script_info(desc, script_path))
+               goto out;
+
+       if (!desc->args)
+               goto out;
+
+       for (p = desc->args; *p; p++)
+               if (*p == '<')
+                       n_args++;
+out:
+       script_desc__delete(desc);
+
+       return n_args;
+}
+
+/*
+ * NULL-terminated list of usage lines, passed to parse_options() and
+ * usage_with_options() by cmd_script().
+ */
+static const char * const script_usage[] = {
+       "perf script [<options>]",
+       "perf script [<options>] record <script> [<record-options>] <command>",
+       "perf script [<options>] report <script> [script-args]",
+       "perf script [<options>] <script> [<record-options>] <command>",
+       "perf script [<options>] <top-script> [script-args]",
+       NULL
+};
+
+/*
+ * Command line options of 'perf script'.  -l and -s are handled by the
+ * callbacks list_available_scripts() and parse_scriptname(); the others
+ * store into the variables named in each entry.
+ */
+static const struct option options[] = {
+       OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+                   "dump raw trace in ASCII"),
+       OPT_INCR('v', "verbose", &verbose,
+                   "be more verbose (show symbol address, etc)"),
+       OPT_BOOLEAN('L', "Latency", &latency_format,
+                   "show latency attributes (irqs/preemption disabled, etc)"),
+       OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
+                          list_available_scripts),
+       OPT_CALLBACK('s', "script", NULL, "name",
+                    "script file name (lang:script name, script name, or *)",
+                    parse_scriptname),
+       OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
+                  "generate perf-script.xx script in specified language"),
+       OPT_STRING('i', "input", &input_name, "file",
+                   "input file name"),
+       OPT_BOOLEAN('d', "debug-mode", &debug_mode,
+                  "do various checks like samples ordering and lost events"),
+
+       OPT_END()
+};
+
+/*
+ * Run parse_options() with record_options over a scratch copy of @argv and
+ * report whether anything is left afterwards (parsing stops at the first
+ * non-option).  Dies if the scratch copy cannot be allocated.
+ */
+static bool have_cmd(int argc, const char **argv)
+{
+       size_t nbytes = sizeof(const char *) * argc;
+       char **argv_copy = malloc(nbytes);
+       int remaining;
+
+       if (argv_copy == NULL)
+               die("malloc");
+
+       /* work on a copy so the caller's vector is left as it was */
+       memcpy(argv_copy, argv, nbytes);
+       remaining = parse_options(argc, (const char **)argv_copy,
+                                 record_options, NULL,
+                                 PARSE_OPT_STOP_AT_NON_OPTION);
+       free(argv_copy);
+
+       return remaining != 0;
+}
+
+/*
+ * Entry point of 'perf script'.
+ *
+ * After option parsing one of the following happens:
+ *  - "rec..." <script>: exec the matching <script>-record file through
+ *    /bin/sh, or fall back to cmd_record() when no such file is found;
+ *  - "rep..." <script>: exec the matching <script>-report file through
+ *    /bin/sh;
+ *  - <script> without -s: fork, exec the record script in the child and
+ *    the report script in the parent, connected by a pipe;
+ *  - otherwise: open the input as a perf session and either generate a
+ *    script (-g) or process the events, through the -s script if given.
+ *
+ * The exec paths call exit(-1) when execvp() fails.  The other paths
+ * return a negative value on early failure or the result of the
+ * generate/processing step.
+ */
+int cmd_script(int argc, const char **argv, const char *prefix __used)
+{
+       char *rec_script_path = NULL;
+       char *rep_script_path = NULL;
+       struct perf_session *session;
+       char *script_path = NULL;
+       const char **__argv;
+       bool system_wide;
+       int i, j, err;
+
+       setup_scripting();
+
+       argc = parse_options(argc, argv, options, script_usage,
+                            PARSE_OPT_STOP_AT_NON_OPTION);
+
+       /* any first word starting with "rec" selects record mode */
+       if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
+               rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
+               if (!rec_script_path)
+                       return cmd_record(argc, argv, NULL);
+       }
+
+       /* any first word starting with "rep" selects report mode */
+       if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
+               rep_script_path = get_script_path(argv[1], REPORT_SUFFIX);
+               if (!rep_script_path) {
+                       fprintf(stderr,
+                               "Please specify a valid report script"
+                               "(see 'perf script -l' for listing)\n");
+                       return -1;
+               }
+       }
+
+       /* make sure PERF_EXEC_PATH is set for scripts */
+       perf_set_argv_exec_path(perf_exec_path());
+
+       /*
+        * Live mode: arguments remain, but neither -s nor an explicit
+        * record/report word was given, so argv[0] is taken as a script name.
+        */
+       if (argc && !script_name && !rec_script_path && !rep_script_path) {
+               int live_pipe[2];
+               int rep_args;
+               pid_t pid;
+
+               rec_script_path = get_script_path(argv[0], RECORD_SUFFIX);
+               rep_script_path = get_script_path(argv[0], REPORT_SUFFIX);
+
+               if (!rec_script_path && !rep_script_path) {
+                       fprintf(stderr, " Couldn't find script %s\n\n See perf"
+                               " script -l for available scripts.\n", argv[0]);
+                       usage_with_options(script_usage, options);
+               }
+
+               /*
+                * rep_args is the number of arguments after the script name
+                * that go to the report script; the rest go to the record
+                * script.
+                */
+               if (is_top_script(argv[0])) {
+                       rep_args = argc - 1;
+               } else {
+                       int rec_args;
+
+                       /*
+                        * NOTE(review): only the "both missing" case is
+                        * rejected above, so rep_script_path can be NULL here
+                        * when just the record script exists.
+                        */
+                       rep_args = has_required_arg(rep_script_path);
+                       rec_args = (argc - 1) - rep_args;
+                       if (rec_args < 0) {
+                               fprintf(stderr, " %s script requires options."
+                                       "\n\n See perf script -l for available "
+                                       "scripts and options.\n", argv[0]);
+                               usage_with_options(script_usage, options);
+                       }
+               }
+
+               if (pipe(live_pipe) < 0) {
+                       perror("failed to create pipe");
+                       exit(-1);
+               }
+
+               pid = fork();
+               if (pid < 0) {
+                       perror("failed to fork");
+                       exit(-1);
+               }
+
+               /* child: run the record script with stdout on the pipe */
+               if (!pid) {
+                       system_wide = true;
+                       j = 0;
+
+                       dup2(live_pipe[1], 1);
+                       close(live_pipe[0]);
+
+                       if (!is_top_script(argv[0]))
+                               system_wide = !have_cmd(argc - rep_args,
+                                                       &argv[rep_args]);
+
+                       /*
+                        * Up to six fixed entries plus the terminating NULL,
+                        * and at most argc - 1 forwarded arguments.
+                        */
+                       __argv = malloc((argc + 6) * sizeof(const char *));
+                       if (!__argv)
+                               die("malloc");
+
+                       __argv[j++] = "/bin/sh";
+                       __argv[j++] = rec_script_path;
+                       if (system_wide)
+                               __argv[j++] = "-a";
+                       __argv[j++] = "-q";
+                       __argv[j++] = "-o";
+                       __argv[j++] = "-";
+                       for (i = rep_args + 1; i < argc; i++)
+                               __argv[j++] = argv[i];
+                       __argv[j++] = NULL;
+
+                       execvp("/bin/sh", (char **)__argv);
+                       /* only reached when execvp() failed */
+                       free(__argv);
+                       exit(-1);
+               }
+
+               /* parent: run the report script with stdin from the pipe */
+               dup2(live_pipe[0], 0);
+               close(live_pipe[1]);
+
+               __argv = malloc((argc + 4) * sizeof(const char *));
+               if (!__argv)
+                       die("malloc");
+               j = 0;
+               __argv[j++] = "/bin/sh";
+               __argv[j++] = rep_script_path;
+               for (i = 1; i < rep_args + 1; i++)
+                       __argv[j++] = argv[i];
+               __argv[j++] = "-i";
+               __argv[j++] = "-";
+               __argv[j++] = NULL;
+
+               execvp("/bin/sh", (char **)__argv);
+               free(__argv);
+               exit(-1);
+       }
+
+       /* explicit record/report word: at most one of the two paths is set */
+       if (rec_script_path)
+               script_path = rec_script_path;
+       if (rep_script_path)
+               script_path = rep_script_path;
+
+       if (script_path) {
+               system_wide = false;
+               j = 0;
+
+               if (rec_script_path)
+                       system_wide = !have_cmd(argc - 1, &argv[1]);
+
+               /* sh, script, optional -a, argv[2..argc-1], NULL: argc + 2 */
+               __argv = malloc((argc + 2) * sizeof(const char *));
+               if (!__argv)
+                       die("malloc");
+               __argv[j++] = "/bin/sh";
+               __argv[j++] = script_path;
+               if (system_wide)
+                       __argv[j++] = "-a";
+               for (i = 2; i < argc; i++)
+                       __argv[j++] = argv[i];
+               __argv[j++] = NULL;
+
+               execvp("/bin/sh", (char **)__argv);
+               free(__argv);
+               exit(-1);
+       }
+
+       /* no script to exec: process a perf data session in this process */
+       if (symbol__init() < 0)
+               return -1;
+       if (!script_name)
+               setup_pager();
+
+       session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops);
+       if (session == NULL)
+               return -ENOMEM;
+
+       /*
+        * NOTE(review): this return, the "invalid language specifier" return
+        * and both "goto out" jumps below leave without calling
+        * perf_session__delete() or cleanup_scripting().
+        */
+       if (strcmp(input_name, "-") &&
+           !perf_session__has_traces(session, "record -R"))
+               return -EINVAL;
+
+       if (generate_script_lang) {
+               struct stat perf_stat;
+
+               int input = open(input_name, O_RDONLY);
+               if (input < 0) {
+                       perror("failed to open file");
+                       exit(-1);
+               }
+
+               err = fstat(input, &perf_stat);
+               if (err < 0) {
+                       perror("failed to stat file");
+                       exit(-1);
+               }
+
+               if (!perf_stat.st_size) {
+                       fprintf(stderr, "zero-sized file, nothing to do!\n");
+                       exit(0);
+               }
+
+               scripting_ops = script_spec__lookup(generate_script_lang);
+               if (!scripting_ops) {
+                       fprintf(stderr, "invalid language specifier");
+                       return -1;
+               }
+
+               err = scripting_ops->generate_script("perf-script");
+               goto out;
+       }
+
+       if (script_name) {
+               err = scripting_ops->start_script(script_name, argc, argv);
+               if (err)
+                       goto out;
+               pr_debug("perf script started with script %s\n\n", script_name);
+       }
+
+       err = __cmd_script(session);
+
+       perf_session__delete(session);
+       cleanup_scripting();
+out:
+       return err;
+}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c

index a6b4d44f950246e27d4cb6b0bc3e6d5afd27adcc..02b2d8013a61e537bf80e5d110d6c2f2c3f6b150 100644 (file)
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -43,6 +43,7 @@
  #include "util/parse-options.h"
  #include "util/parse-events.h"
  #include "util/event.h"
+#include "util/evsel.h"
  #include "util/debug.h"
  #include "util/header.h"
  #include "util/cpumap.h"
@@ -52,6 +53,8 @@
  #include <math.h>
  #include <locale.h>
  
+#define DEFAULT_SEPARATOR      " "
+
  static struct perf_event_attr default_attrs[] = {
  
    { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK             },
@@ -69,25 +72,23 @@ static struct perf_event_attr default_attrs[] = {
  };
  
  static bool                    system_wide                     =  false;
-static int                     nr_cpus                         =  0;
+static struct cpu_map          *cpus;
  static int                     run_idx                         =  0;
  
  static int                     run_count                       =  1;
  static bool                    no_inherit                      = false;
  static bool                    scale                           =  true;
+static bool                    no_aggr                         = false;
  static pid_t                   target_pid                      = -1;
  static pid_t                   target_tid                      = -1;
-static pid_t                   *all_tids                       =  NULL;
-static int                     thread_num                      =  0;
+static struct thread_map       *threads;
  static pid_t                   child_pid                       = -1;
  static bool                    null_run                        =  false;
-static bool                    big_num                         =  false;
+static bool                    big_num                         =  true;
+static int                     big_num_opt                     =  -1;
  static const char              *cpu_list;
-
-
-static int                     *fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int                     event_scaled[MAX_COUNTERS];
+static const char              *csv_sep                        = NULL;
+static bool                    csv_output                      = false;
  
  static volatile int done = 0;
  
@@ -96,6 +97,22 @@ struct stats
         double n, mean, M2;
  };
  
+struct perf_stat {
+       struct stats      res_stats[3];
+};
+
+/*
+ * Attach a zeroed struct perf_stat to evsel->priv.
+ * Returns 0 on success, -ENOMEM if the allocation fails.
+ */
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+{
+       struct perf_stat *ps = zalloc(sizeof(*ps));
+
+       evsel->priv = ps;
+       if (ps == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/* Release evsel->priv and leave the pointer NULL. */
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+{
+       void *priv = evsel->priv;
+
+       evsel->priv = NULL;
+       free(priv);
+}
+
  static void update_stats(struct stats *stats, u64 val)
  {
         double delta;
@@ -135,69 +152,38 @@ static double stddev_stats(struct stats *stats)
         return sqrt(variance_mean);
  }
  
-struct stats                   event_res_stats[MAX_COUNTERS][3];
-struct stats                   runtime_nsecs_stats;
+struct stats                   runtime_nsecs_stats[MAX_NR_CPUS];
+struct stats                   runtime_cycles_stats[MAX_NR_CPUS];
+struct stats                   runtime_branches_stats[MAX_NR_CPUS];
  struct stats                   walltime_nsecs_stats;
-struct stats                   runtime_cycles_stats;
-struct stats                   runtime_branches_stats;
  
-#define MATCH_EVENT(t, c, counter)                     \
-       (attrs[counter].type == PERF_TYPE_##t &&        \
-        attrs[counter].config == PERF_COUNT_##c)
-
-#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
-
-static int create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(struct perf_evsel *evsel)
  {
-       struct perf_event_attr *attr = attrs + counter;
-       int thread;
-       int ncreated = 0;
+       struct perf_event_attr *attr = &evsel->attr;
  
         if (scale)
                 attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                                     PERF_FORMAT_TOTAL_TIME_RUNNING;
  
-       if (system_wide) {
-               int cpu;
-
-               for (cpu = 0; cpu < nr_cpus; cpu++) {
-                       fd[cpu][counter][0] = sys_perf_event_open(attr,
-                                       -1, cpumap[cpu], -1, 0);
-                       if (fd[cpu][counter][0] < 0)
-                               pr_debug(ERR_PERF_OPEN, counter,
-                                        fd[cpu][counter][0], strerror(errno));
-                       else
-                               ++ncreated;
-               }
-       } else {
-               attr->inherit = !no_inherit;
-               if (target_pid == -1 && target_tid == -1) {
-                       attr->disabled = 1;
-                       attr->enable_on_exec = 1;
-               }
-               for (thread = 0; thread < thread_num; thread++) {
-                       fd[0][counter][thread] = sys_perf_event_open(attr,
-                               all_tids[thread], -1, -1, 0);
-                       if (fd[0][counter][thread] < 0)
-                               pr_debug(ERR_PERF_OPEN, counter,
-                                        fd[0][counter][thread],
-                                        strerror(errno));
-                       else
-                               ++ncreated;
-               }
+       if (system_wide)
+               return perf_evsel__open_per_cpu(evsel, cpus);
+
+       attr->inherit = !no_inherit;
+       if (target_pid == -1 && target_tid == -1) {
+               attr->disabled = 1;
+               attr->enable_on_exec = 1;
         }
  
-       return ncreated;
+       return perf_evsel__open_per_thread(evsel, threads);
  }
  
  /*
   * Does the counter have nsecs as a unit?
   */
-static inline int nsec_counter(int counter)
+static inline int nsec_counter(struct perf_evsel *evsel)
  {
-       if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
-           MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+       if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
+           perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
                 return 1;
  
         return 0;
@@ -205,55 +191,19 @@ static inline int nsec_counter(int counter)
  
  /*
   * Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
   */
-static void read_counter(int counter)
+static int read_counter_aggr(struct perf_evsel *counter)
  {
-       u64 count[3], single_count[3];
-       int cpu;
-       size_t res, nv;
-       int scaled;
-       int i, thread;
-
-       count[0] = count[1] = count[2] = 0;
-
-       nv = scale ? 3 : 1;
-       for (cpu = 0; cpu < nr_cpus; cpu++) {
-               for (thread = 0; thread < thread_num; thread++) {
-                       if (fd[cpu][counter][thread] < 0)
-                               continue;
-
-                       res = read(fd[cpu][counter][thread],
-                                       single_count, nv * sizeof(u64));
-                       assert(res == nv * sizeof(u64));
-
-                       close(fd[cpu][counter][thread]);
-                       fd[cpu][counter][thread] = -1;
-
-                       count[0] += single_count[0];
-                       if (scale) {
-                               count[1] += single_count[1];
-                               count[2] += single_count[2];
-                       }
-               }
-       }
-
-       scaled = 0;
-       if (scale) {
-               if (count[2] == 0) {
-                       event_scaled[counter] = -1;
-                       count[0] = 0;
-                       return;
-               }
+       struct perf_stat *ps = counter->priv;
+       u64 *count = counter->counts->aggr.values;
+       int i;
  
-               if (count[2] < count[1]) {
-                       event_scaled[counter] = 1;
-                       count[0] = (unsigned long long)
-                               ((double)count[0] * count[1] / count[2] + 0.5);
-               }
-       }
+       if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0)
+               return -1;
  
         for (i = 0; i < 3; i++)
-               update_stats(&event_res_stats[counter][i], count[i]);
+               update_stats(&ps->res_stats[i], count[i]);
  
         if (verbose) {
                 fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
@@ -263,26 +213,51 @@ static void read_counter(int counter)
         /*
          * Save the full runtime - to allow normalization during printout:
          */
-       if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
-               update_stats(&runtime_nsecs_stats, count[0]);
-       if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
-               update_stats(&runtime_cycles_stats, count[0]);
-       if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
-               update_stats(&runtime_branches_stats, count[0]);
+       if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+               update_stats(&runtime_nsecs_stats[0], count[0]);
+       if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+               update_stats(&runtime_cycles_stats[0], count[0]);
+       if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+               update_stats(&runtime_branches_stats[0], count[0]);
+
+       return 0;
+}
+
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static int read_counter(struct perf_evsel *counter)
+{
+       u64 *count;
+       int cpu;
+
+       for (cpu = 0; cpu < cpus->nr; cpu++) {
+               if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
+                       return -1;
+
+               count = counter->counts->cpu[cpu].values;
+
+               if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+                       update_stats(&runtime_nsecs_stats[cpu], count[0]);
+               if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+                       update_stats(&runtime_cycles_stats[cpu], count[0]);
+               if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+                       update_stats(&runtime_branches_stats[cpu], count[0]);
+       }
+
+       return 0;
  }
  
  static int run_perf_stat(int argc __used, const char **argv)
  {
         unsigned long long t0, t1;
+       struct perf_evsel *counter;
         int status = 0;
-       int counter, ncreated = 0;
         int child_ready_pipe[2], go_pipe[2];
         const bool forks = (argc > 0);
         char buf;
  
-       if (!system_wide)
-               nr_cpus = 1;
-
         if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
                 perror("failed to create pipes");
                 exit(1);
@@ -322,7 +297,7 @@ static int run_perf_stat(int argc __used, const char **argv)
                 }
  
                 if (target_tid == -1 && target_pid == -1 && !system_wide)
-                       all_tids[0] = child_pid;
+                       threads->map[0] = child_pid;
  
                 /*
                  * Wait for the child to be ready to exec.
@@ -334,16 +309,23 @@ static int run_perf_stat(int argc __used, const char **argv)
                 close(child_ready_pipe[0]);
         }
  
-       for (counter = 0; counter < nr_counters; counter++)
-               ncreated += create_perf_stat_counter(counter);
-
-       if (ncreated == 0) {
-               pr_err("No permission to collect %sstats.\n"
-                      "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
-                      system_wide ? "system-wide " : "");
-               if (child_pid != -1)
-                       kill(child_pid, SIGTERM);
-               return -1;
+       list_for_each_entry(counter, &evsel_list, node) {
+               if (create_perf_stat_counter(counter) < 0) {
+                       /*
+                        * errno carries the positive E* value, so it has to
+                        * be compared with EPERM/EACCES, not their negations.
+                        */
+                       if (errno == EPERM || errno == EACCES) {
+                               error("You may not have permission to collect %sstats.\n"
+                                     "\t Consider tweaking"
+                                     " /proc/sys/kernel/perf_event_paranoid or running as root.",
+                                     system_wide ? "system-wide " : "");
+                       } else {
+                               error("open_counter returned with %d (%s). "
+                                     "/bin/dmesg may provide additional information.\n",
+                                      errno, strerror(errno));
+                       }
+                       if (child_pid != -1)
+                               kill(child_pid, SIGTERM);
+                       die("Not all events could be opened.\n");
+                       return -1;
+               }
  
         /*
@@ -362,60 +344,97 @@ static int run_perf_stat(int argc __used, const char **argv)
  
         update_stats(&walltime_nsecs_stats, t1 - t0);
  
-       for (counter = 0; counter < nr_counters; counter++)
-               read_counter(counter);
+       if (no_aggr) {
+               list_for_each_entry(counter, &evsel_list, node) {
+                       read_counter(counter);
+                       perf_evsel__close_fd(counter, cpus->nr, 1);
+               }
+       } else {
+               list_for_each_entry(counter, &evsel_list, node) {
+                       read_counter_aggr(counter);
+                       perf_evsel__close_fd(counter, cpus->nr, threads->nr);
+               }
+       }
  
         return WEXITSTATUS(status);
  }
  
-static void print_noise(int counter, double avg)
+static void print_noise(struct perf_evsel *evsel, double avg)
  {
+       struct perf_stat *ps;
+
         if (run_count == 1)
                 return;
  
+       ps = evsel->priv;
         fprintf(stderr, "   ( +- %7.3f%% )",
-                       100 * stddev_stats(&event_res_stats[counter][0]) / avg);
+                       100 * stddev_stats(&ps->res_stats[0]) / avg);
  }
  
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
  {
         double msecs = avg / 1e6;
+       char cpustr[16] = { '\0', };
+       const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s";
  
-       fprintf(stderr, " %18.6f  %-24s", msecs, event_name(counter));
+       if (no_aggr)
+               sprintf(cpustr, "CPU%*d%s",
+                       csv_output ? 0 : -4,
+                       cpus->map[cpu], csv_sep);
+
+       fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
+
+       if (csv_output)
+               return;
  
-       if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
+       if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
                 fprintf(stderr, " # %10.3f CPUs ",
                                 avg / avg_stats(&walltime_nsecs_stats));
-       }
  }
  
-static void abs_printout(int counter, double avg)
+static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
  {
         double total, ratio = 0.0;
+       char cpustr[16] = { '\0', };
+       const char *fmt;
+
+       if (csv_output)
+               fmt = "%s%.0f%s%s";
+       else if (big_num)
+               fmt = "%s%'18.0f%s%-24s";
+       else
+               fmt = "%s%18.0f%s%-24s";
  
-       if (big_num)
-               fprintf(stderr, " %'18.0f  %-24s", avg, event_name(counter));
+       if (no_aggr)
+               sprintf(cpustr, "CPU%*d%s",
+                       csv_output ? 0 : -4,
+                       cpus->map[cpu], csv_sep);
         else
-               fprintf(stderr, " %18.0f  %-24s", avg, event_name(counter));
+               cpu = 0;
+
+       fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
  
-       if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
-               total = avg_stats(&runtime_cycles_stats);
+       if (csv_output)
+               return;
+
+       if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+               total = avg_stats(&runtime_cycles_stats[cpu]);
  
                 if (total)
                         ratio = avg / total;
  
                 fprintf(stderr, " # %10.3f IPC  ", ratio);
-       } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
-                       runtime_branches_stats.n != 0) {
-               total = avg_stats(&runtime_branches_stats);
+       } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
+                       runtime_branches_stats[cpu].n != 0) {
+               total = avg_stats(&runtime_branches_stats[cpu]);
  
                 if (total)
                         ratio = avg * 100 / total;
  
                 fprintf(stderr, " # %10.3f %%    ", ratio);
  
-       } else if (runtime_nsecs_stats.n != 0) {
-               total = avg_stats(&runtime_nsecs_stats);
+       } else if (runtime_nsecs_stats[cpu].n != 0) {
+               total = avg_stats(&runtime_nsecs_stats[cpu]);
  
                 if (total)
                         ratio = 1000.0 * avg / total;
@@ -426,30 +445,38 @@ static void abs_printout(int counter, double avg)
  
  /*
   * Print out the results of a single counter:
+ * aggregated counts in system-wide mode
   */
-static void print_counter(int counter)
+static void print_counter_aggr(struct perf_evsel *counter)
  {
-       double avg = avg_stats(&event_res_stats[counter][0]);
-       int scaled = event_scaled[counter];
+       struct perf_stat *ps = counter->priv;
+       double avg = avg_stats(&ps->res_stats[0]);
+       int scaled = counter->counts->scaled;
  
         if (scaled == -1) {
-               fprintf(stderr, " %18s  %-24s\n",
-                       "<not counted>", event_name(counter));
+               fprintf(stderr, "%*s%s%-24s\n",
+                       csv_output ? 0 : 18,
+                       "<not counted>", csv_sep, event_name(counter));
                 return;
         }
  
         if (nsec_counter(counter))
-               nsec_printout(counter, avg);
+               nsec_printout(-1, counter, avg);
         else
-               abs_printout(counter, avg);
+               abs_printout(-1, counter, avg);
+
+       if (csv_output) {
+               fputc('\n', stderr);
+               return;
+       }
  
         print_noise(counter, avg);
  
         if (scaled) {
                 double avg_enabled, avg_running;
  
-               avg_enabled = avg_stats(&event_res_stats[counter][1]);
-               avg_running = avg_stats(&event_res_stats[counter][2]);
+               avg_enabled = avg_stats(&ps->res_stats[1]);
+               avg_running = avg_stats(&ps->res_stats[2]);
  
                 fprintf(stderr, "  (scaled from %.2f%%)",
                                 100 * avg_running / avg_enabled);
@@ -458,40 +485,92 @@ static void print_counter(int counter)
         fprintf(stderr, "\n");
  }
  
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated counts in system-wide mode
+ */
+static void print_counter(struct perf_evsel *counter)
+{
+       u64 ena, run, val;
+       int cpu;
+
+       for (cpu = 0; cpu < cpus->nr; cpu++) {
+               val = counter->counts->cpu[cpu].val;
+               ena = counter->counts->cpu[cpu].ena;
+               run = counter->counts->cpu[cpu].run;
+               if (run == 0 || ena == 0) {
+                       fprintf(stderr, "CPU%*d%s%*s%s%-24s",
+                               csv_output ? 0 : -4,
+                               cpus->map[cpu], csv_sep,
+                               csv_output ? 0 : 18,
+                               "<not counted>", csv_sep,
+                               event_name(counter));
+
+                       fprintf(stderr, "\n");
+                       continue;
+               }
+
+               if (nsec_counter(counter))
+                       nsec_printout(cpu, counter, val);
+               else
+                       abs_printout(cpu, counter, val);
+
+               if (!csv_output) {
+                       print_noise(counter, 1.0);
+
+                       if (run != ena) {
+                               fprintf(stderr, "  (scaled from %.2f%%)",
+                                       100.0 * run / ena);
+                       }
+               }
+               fprintf(stderr, "\n");
+       }
+}
+
  static void print_stat(int argc, const char **argv)
  {
-       int i, counter;
+       struct perf_evsel *counter;
+       int i;
  
         fflush(stdout);
  
-       fprintf(stderr, "\n");
-       fprintf(stderr, " Performance counter stats for ");
-       if(target_pid == -1 && target_tid == -1) {
-               fprintf(stderr, "\'%s", argv[0]);
-               for (i = 1; i < argc; i++)
-                       fprintf(stderr, " %s", argv[i]);
-       } else if (target_pid != -1)
-               fprintf(stderr, "process id \'%d", target_pid);
-       else
-               fprintf(stderr, "thread id \'%d", target_tid);
-
-       fprintf(stderr, "\'");
-       if (run_count > 1)
-               fprintf(stderr, " (%d runs)", run_count);
-       fprintf(stderr, ":\n\n");
+       if (!csv_output) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, " Performance counter stats for ");
+               if(target_pid == -1 && target_tid == -1) {
+                       fprintf(stderr, "\'%s", argv[0]);
+                       for (i = 1; i < argc; i++)
+                               fprintf(stderr, " %s", argv[i]);
+               } else if (target_pid != -1)
+                       fprintf(stderr, "process id \'%d", target_pid);
+               else
+                       fprintf(stderr, "thread id \'%d", target_tid);
+
+               fprintf(stderr, "\'");
+               if (run_count > 1)
+                       fprintf(stderr, " (%d runs)", run_count);
+               fprintf(stderr, ":\n\n");
+       }
  
-       for (counter = 0; counter < nr_counters; counter++)
-               print_counter(counter);
+       if (no_aggr) {
+               list_for_each_entry(counter, &evsel_list, node)
+                       print_counter(counter);
+       } else {
+               list_for_each_entry(counter, &evsel_list, node)
+                       print_counter_aggr(counter);
+       }
  
-       fprintf(stderr, "\n");
-       fprintf(stderr, " %18.9f  seconds time elapsed",
-                       avg_stats(&walltime_nsecs_stats)/1e9);
-       if (run_count > 1) {
-               fprintf(stderr, "   ( +- %7.3f%% )",
+       if (!csv_output) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, " %18.9f  seconds time elapsed",
+                               avg_stats(&walltime_nsecs_stats)/1e9);
+               if (run_count > 1) {
+                       fprintf(stderr, "   ( +- %7.3f%% )",
                                 100*stddev_stats(&walltime_nsecs_stats) /
                                 avg_stats(&walltime_nsecs_stats));
+               }
+               fprintf(stderr, "\n\n");
         }
-       fprintf(stderr, "\n\n");
  }
  
  static volatile int signr = -1;
@@ -521,6 +600,13 @@ static const char * const stat_usage[] = {
         NULL
  };
  
+static int stat__set_big_num(const struct option *opt __used,
+                            const char *s __used, int unset)
+{
+       big_num_opt = unset ? 0 : 1;
+       return 0;
+}
+
  static const struct option options[] = {
         OPT_CALLBACK('e', "event", NULL, "event",
                      "event selector. use 'perf list' to list available events",
@@ -541,64 +627,96 @@ static const struct option options[] = {
                     "repeat command and print average + stddev (max: 100)"),
         OPT_BOOLEAN('n', "null", &null_run,
                     "null run - dont start any counters"),
-       OPT_BOOLEAN('B', "big-num", &big_num,
-                   "print large numbers with thousands\' separators"),
+       OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, 
+                          "print large numbers with thousands\' separators",
+                          stat__set_big_num),
         OPT_STRING('C', "cpu", &cpu_list, "cpu",
                     "list of cpus to monitor in system-wide"),
+       OPT_BOOLEAN('A', "no-aggr", &no_aggr,
+                   "disable CPU count aggregation"),
+       OPT_STRING('x', "field-separator", &csv_sep, "separator",
+                  "print counts with custom separator"),
         OPT_END()
  };
  
  int cmd_stat(int argc, const char **argv, const char *prefix __used)
  {
-       int status;
-       int i,j;
+       struct perf_evsel *pos;
+       int status = -ENOMEM;
  
         setlocale(LC_ALL, "");
  
         argc = parse_options(argc, argv, options, stat_usage,
                 PARSE_OPT_STOP_AT_NON_OPTION);
+
+       if (csv_sep)
+               csv_output = true;
+       else
+               csv_sep = DEFAULT_SEPARATOR;
+
+       /*
+        * let the spreadsheet do the pretty-printing
+        */
+       if (csv_output) {
+               /* User explicitly passed -B? */
+               if (big_num_opt == 1) {
+                       fprintf(stderr, "-B option not supported with -x\n");
+                       usage_with_options(stat_usage, options);
+               } else /* Nope, so disable big number formatting */
+                       big_num = false;
+       } else if (big_num_opt == 0) /* User passed --no-big-num */
+               big_num = false;
+
         if (!argc && target_pid == -1 && target_tid == -1)
                 usage_with_options(stat_usage, options);
         if (run_count <= 0)
                 usage_with_options(stat_usage, options);
  
+       /* no_aggr is for system-wide only */
+       if (no_aggr && !system_wide)
+               usage_with_options(stat_usage, options);
+
         /* Set attrs and nr_counters if no event is selected and !null_run */
         if (!null_run && !nr_counters) {
-               memcpy(attrs, default_attrs, sizeof(default_attrs));
+               size_t c;
+
                 nr_counters = ARRAY_SIZE(default_attrs);
+
+               for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
+                       pos = perf_evsel__new(default_attrs[c].type,
+                                             default_attrs[c].config,
+                                             nr_counters);
+                       if (pos == NULL)
+                               goto out;
+                       list_add(&pos->node, &evsel_list);
+               }
         }
  
-       if (system_wide)
-               nr_cpus = read_cpu_map(cpu_list);
-       else
-               nr_cpus = 1;
+       if (target_pid != -1)
+               target_tid = target_pid;
  
-       if (nr_cpus < 1)
+       threads = thread_map__new(target_pid, target_tid);
+       if (threads == NULL) {
+               pr_err("Problems finding threads of monitor\n");
                 usage_with_options(stat_usage, options);
+       }
  
-       if (target_pid != -1) {
-               target_tid = target_pid;
-               thread_num = find_all_tid(target_pid, &all_tids);
-               if (thread_num <= 0) {
-                       fprintf(stderr, "Can't find all threads of pid %d\n",
-                                       target_pid);
-                       usage_with_options(stat_usage, options);
-               }
-       } else {
-               all_tids=malloc(sizeof(pid_t));
-               if (!all_tids)
-                       return -ENOMEM;
+       if (system_wide)
+               cpus = cpu_map__new(cpu_list);
+       else
+               cpus = cpu_map__dummy_new();
  
-               all_tids[0] = target_tid;
-               thread_num = 1;
+       if (cpus == NULL) {
+               perror("failed to parse CPUs map");
+               usage_with_options(stat_usage, options);
+               return -1;
         }
  
-       for (i = 0; i < MAX_NR_CPUS; i++) {
-               for (j = 0; j < MAX_COUNTERS; j++) {
-                       fd[i][j] = malloc(sizeof(int)*thread_num);
-                       if (!fd[i][j])
-                               return -ENOMEM;
-               }
+       list_for_each_entry(pos, &evsel_list, node) {
+               if (perf_evsel__alloc_stat_priv(pos) < 0 ||
+                   perf_evsel__alloc_counts(pos, cpus->nr) < 0 ||
+                   perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+                       goto out_free_fd;
         }
  
         /*
@@ -621,6 +739,11 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
  
         if (status != -1)
                 print_stat(argc, argv);
-
+out_free_fd:
+       list_for_each_entry(pos, &evsel_list, node)
+               perf_evsel__free_stat_priv(pos);
+out:
+       thread_map__delete(threads);
+       threads = NULL;
         return status;
  }
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c

index 035b9fa063a9453002873c00f654adcd13a99eb2..1c984342a5795090d2e863d0b2da1a995e0201d2 100644 (file)
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -119,10 +119,16 @@ static int test__vmlinux_matches_kallsyms(void)
          * end addresses too.
          */
         for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
-               struct symbol *pair;
+               struct symbol *pair, *first_pair;
+               bool backwards = true;
  
                 sym  = rb_entry(nd, struct symbol, rb_node);
-               pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL);
+
+               if (sym->start == sym->end)
+                       continue;
+
+               first_pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL);
+               pair = first_pair;
  
                 if (pair && pair->start == sym->start) {
  next_pair:
@@ -143,8 +149,10 @@ next_pair:
                                 pr_debug("%#Lx: diff end addr for %s v: %#Lx k: %#Lx\n",
                                          sym->start, sym->name, sym->end, pair->end);
                         } else {
-                               struct rb_node *nnd = rb_prev(&pair->rb_node);
-
+                               struct rb_node *nnd;
+detour:
+                               nnd = backwards ? rb_prev(&pair->rb_node) :
+                                                 rb_next(&pair->rb_node);
                                 if (nnd) {
                                         struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
  
@@ -153,6 +161,13 @@ next_pair:
                                                 goto next_pair;
                                         }
                                 }
+
+                               if (backwards) {
+                                       backwards = false;
+                                       pair = first_pair;
+                                       goto detour;
+                               }
+
                                 pr_debug("%#Lx: diff name v: %s k: %s\n",
                                          sym->start, sym->name, pair->name);
                         }
@@ -219,6 +234,89 @@ out:
         return err;
  }
  
+#include "util/evsel.h"
+#include <sys/types.h>
+
+static int trace_event__id(const char *event_name)
+{
+       char *filename;
+       int err = -1, fd;
+
+       if (asprintf(&filename,
+                    "/sys/kernel/debug/tracing/events/syscalls/%s/id",
+                    event_name) < 0)
+               return -1;
+
+       fd = open(filename, O_RDONLY);
+       if (fd >= 0) {
+               char id[16];
+               if (read(fd, id, sizeof(id)) > 0)
+                       err = atoi(id);
+               close(fd);
+       }
+
+       free(filename);
+       return err;
+}
+
+static int test__open_syscall_event(void)
+{
+       int err = -1, fd;
+       struct thread_map *threads;
+       struct perf_evsel *evsel;
+       unsigned int nr_open_calls = 111, i;
+       int id = trace_event__id("sys_enter_open");
+
+       if (id < 0) {
+               pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+               return -1;
+       }
+
+       threads = thread_map__new(-1, getpid());
+       if (threads == NULL) {
+               pr_debug("thread_map__new\n");
+               return -1;
+       }
+
+       evsel = perf_evsel__new(PERF_TYPE_TRACEPOINT, id, 0);
+       if (evsel == NULL) {
+               pr_debug("perf_evsel__new\n");
+               goto out_thread_map_delete;
+       }
+
+       if (perf_evsel__open_per_thread(evsel, threads) < 0) {
+               pr_debug("failed to open counter: %s, "
+                        "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+                        strerror(errno));
+               goto out_evsel_delete;
+       }
+
+       for (i = 0; i < nr_open_calls; ++i) {
+               fd = open("/etc/passwd", O_RDONLY);
+               close(fd);
+       }
+
+       if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) {
+               pr_debug("perf_evsel__open_read_on_cpu\n");
+               goto out_close_fd;
+       }
+
+       if (evsel->counts->cpu[0].val != nr_open_calls) {
+               pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %Ld\n",
+                        nr_open_calls, evsel->counts->cpu[0].val);
+               goto out_close_fd;
+       }
+       
+       err = 0;
+out_close_fd:
+       perf_evsel__close_fd(evsel, 1, threads->nr);
+out_evsel_delete:
+       perf_evsel__delete(evsel);
+out_thread_map_delete:
+       thread_map__delete(threads);
+       return err;
+}
+
  static struct test {
         const char *desc;
         int (*func)(void);
@@ -227,6 +325,10 @@ static struct test {
                 .desc = "vmlinux symtab matches kallsyms",
                 .func = test__vmlinux_matches_kallsyms,
         },
+       {
+               .desc = "detect open syscall event",
+               .func = test__open_syscall_event,
+       },
         {
                 .func = NULL,
         },
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c

index 9bcc38f0b706f91ca3701e440c15e3e9c8aa7bd3..746cf03cb05d86a2796c88fca27930ca6e0a8894 100644 (file)
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -32,6 +32,10 @@
  #include "util/session.h"
  #include "util/svghelper.h"
  
+#define SUPPORT_OLD_POWER_EVENTS 1
+#define PWR_EVENT_EXIT -1
+
+
  static char            const *input_name = "perf.data";
  static char            const *output_name = "output.svg";
  
@@ -272,19 +276,22 @@ static int cpus_cstate_state[MAX_CPUS];
  static u64 cpus_pstate_start_times[MAX_CPUS];
  static u64 cpus_pstate_state[MAX_CPUS];
  
-static int process_comm_event(event_t *event, struct perf_session *session __used)
+static int process_comm_event(event_t *event, struct sample_data *sample __used,
+                             struct perf_session *session __used)
  {
         pid_set_comm(event->comm.tid, event->comm.comm);
         return 0;
  }
  
-static int process_fork_event(event_t *event, struct perf_session *session __used)
+static int process_fork_event(event_t *event, struct sample_data *sample __used,
+                             struct perf_session *session __used)
  {
         pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
         return 0;
  }
  
-static int process_exit_event(event_t *event, struct perf_session *session __used)
+static int process_exit_event(event_t *event, struct sample_data *sample __used,
+                             struct perf_session *session __used)
  {
         pid_exit(event->fork.pid, event->fork.time);
         return 0;
@@ -298,12 +305,21 @@ struct trace_entry {
         int                     lock_depth;
  };
  
-struct power_entry {
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static int use_old_power_events;
+struct power_entry_old {
         struct trace_entry te;
         u64     type;
         u64     value;
         u64     cpu_id;
  };
+#endif
+
+struct power_processor_entry {
+       struct trace_entry te;
+       u32     state;
+       u32     cpu_id;
+};
  
  #define TASK_COMM_LEN 16
  struct wakeup_entry {
@@ -470,48 +486,65 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
  }
  
  
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event __used,
+                               struct sample_data *sample,
+                               struct perf_session *session)
  {
-       struct sample_data data;
         struct trace_entry *te;
  
-       memset(&data, 0, sizeof(data));
-
-       event__parse_sample(event, session->sample_type, &data);
-
         if (session->sample_type & PERF_SAMPLE_TIME) {
-               if (!first_time || first_time > data.time)
-                       first_time = data.time;
-               if (last_time < data.time)
-                       last_time = data.time;
+               if (!first_time || first_time > sample->time)
+                       first_time = sample->time;
+               if (last_time < sample->time)
+                       last_time = sample->time;
         }
  
-       te = (void *)data.raw_data;
-       if (session->sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) {
+       te = (void *)sample->raw_data;
+       if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) {
                 char *event_str;
-               struct power_entry *pe;
-
-               pe = (void *)te;
-
+#ifdef SUPPORT_OLD_POWER_EVENTS
+               struct power_entry_old *peo;
+               peo = (void *)te;
+#endif
                 event_str = perf_header__find_event(te->type);
  
                 if (!event_str)
                         return 0;
  
-               if (strcmp(event_str, "power:power_start") == 0)
-                       c_state_start(pe->cpu_id, data.time, pe->value);
+               if (strcmp(event_str, "power:cpu_idle") == 0) {
+                       struct power_processor_entry *ppe = (void *)te;
+                       if (ppe->state == (u32)PWR_EVENT_EXIT)
+                               c_state_end(ppe->cpu_id, sample->time);
+                       else
+                               c_state_start(ppe->cpu_id, sample->time,
+                                             ppe->state);
+               }
+               else if (strcmp(event_str, "power:cpu_frequency") == 0) {
+                       struct power_processor_entry *ppe = (void *)te;
+                       p_state_change(ppe->cpu_id, sample->time, ppe->state);
+               }
+
+               else if (strcmp(event_str, "sched:sched_wakeup") == 0)
+                       sched_wakeup(sample->cpu, sample->time, sample->pid, te);
  
-               if (strcmp(event_str, "power:power_end") == 0)
-                       c_state_end(pe->cpu_id, data.time);
+               else if (strcmp(event_str, "sched:sched_switch") == 0)
+                       sched_switch(sample->cpu, sample->time, te);
  
-               if (strcmp(event_str, "power:power_frequency") == 0)
-                       p_state_change(pe->cpu_id, data.time, pe->value);
+#ifdef SUPPORT_OLD_POWER_EVENTS
+               if (use_old_power_events) {
+                       if (strcmp(event_str, "power:power_start") == 0)
+                               c_state_start(peo->cpu_id, sample->time,
+                                             peo->value);
  
-               if (strcmp(event_str, "sched:sched_wakeup") == 0)
-                       sched_wakeup(data.cpu, data.time, data.pid, te);
+                       else if (strcmp(event_str, "power:power_end") == 0)
+                               c_state_end(sample->cpu, sample->time);
  
-               if (strcmp(event_str, "sched:sched_switch") == 0)
-                       sched_switch(data.cpu, data.time, te);
+                       else if (strcmp(event_str,
+                                       "power:power_frequency") == 0)
+                               p_state_change(peo->cpu_id, sample->time,
+                                              peo->value);
+               }
+#endif
         }
         return 0;
  }
@@ -937,7 +970,8 @@ static struct perf_event_ops event_ops = {
  
  static int __cmd_timechart(void)
  {
-       struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+       struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+                                                        0, false, &event_ops);
         int ret = -EINVAL;
  
         if (session == NULL)
@@ -968,7 +1002,8 @@ static const char * const timechart_usage[] = {
         NULL
  };
  
-static const char *record_args[] = {
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static const char * const record_old_args[] = {
         "record",
         "-a",
         "-R",
@@ -980,16 +1015,43 @@ static const char *record_args[] = {
         "-e", "sched:sched_wakeup",
         "-e", "sched:sched_switch",
  };
+#endif
+
+static const char * const record_new_args[] = {
+       "record",
+       "-a",
+       "-R",
+       "-f",
+       "-c", "1",
+       "-e", "power:cpu_frequency",
+       "-e", "power:cpu_idle",
+       "-e", "sched:sched_wakeup",
+       "-e", "sched:sched_switch",
+};
  
  static int __cmd_record(int argc, const char **argv)
  {
         unsigned int rec_argc, i, j;
         const char **rec_argv;
+       const char * const *record_args = record_new_args;
+       unsigned int record_elems = ARRAY_SIZE(record_new_args);
+
+#ifdef SUPPORT_OLD_POWER_EVENTS
+       if (!is_valid_tracepoint("power:cpu_idle") &&
+           is_valid_tracepoint("power:power_start")) {
+               use_old_power_events = 1;
+               record_args = record_old_args;
+               record_elems = ARRAY_SIZE(record_old_args);
+       }
+#endif
  
-       rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+       rec_argc = record_elems + argc - 1;
         rec_argv = calloc(rec_argc + 1, sizeof(char *));
  
-       for (i = 0; i < ARRAY_SIZE(record_args); i++)
+       if (rec_argv == NULL)
+               return -ENOMEM;
+
+       for (i = 0; i < record_elems; i++)
                 rec_argv[i] = strdup(record_args[i]);
  
         for (j = 1; j < (unsigned int)argc; j++, i++)
@@ -1018,6 +1080,8 @@ static const struct option options[] = {
         OPT_CALLBACK('p', "process", NULL, "process",
                       "process selector. Pass a pid or process name.",
                        parse_process),
+       OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+                   "Look for files with symbols relative to this directory"),
         OPT_END()
  };
  
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c

index dd625808c2a5332c4f733a59acfb1ee881faae3f..1e67ab9c7ebc46c5df87e6219ad023b5509207f2 100644 (file)
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -21,6 +21,7 @@
  #include "perf.h"
  
  #include "util/color.h"
+#include "util/evsel.h"
  #include "util/session.h"
  #include "util/symbol.h"
  #include "util/thread.h"
@@ -29,6 +30,7 @@
  #include "util/parse-options.h"
  #include "util/parse-events.h"
  #include "util/cpumap.h"
+#include "util/xyarray.h"
  
  #include "util/debug.h"
  
@@ -55,7 +57,7 @@
  #include <linux/unistd.h>
  #include <linux/types.h>
  
-static int                     *fd[MAX_NR_CPUS][MAX_COUNTERS];
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
  
  static bool                    system_wide                     =  false;
  
@@ -66,10 +68,9 @@ static int                   print_entries;
  
  static int                     target_pid                      =     -1;
  static int                     target_tid                      =     -1;
-static pid_t                   *all_tids                       =      NULL;
-static int                     thread_num                      =      0;
+static struct thread_map       *threads;
  static bool                    inherit                         =  false;
-static int                     nr_cpus                         =      0;
+static struct cpu_map          *cpus;
  static int                     realtime_prio                   =      0;
  static bool                    group                           =  false;
  static unsigned int            page_size;
@@ -100,6 +101,7 @@ struct sym_entry            *sym_filter_entry               =   NULL;
  struct sym_entry               *sym_filter_entry_sched         =   NULL;
  static int                     sym_pcnt_filter                 =      5;
  static int                     sym_counter                     =      0;
+static struct perf_evsel       *sym_evsel                      =   NULL;
  static int                     display_weighted                =     -1;
  static const char              *cpu_list;
  
@@ -353,7 +355,7 @@ static void show_details(struct sym_entry *syme)
                 return;
  
         symbol = sym_entry__symbol(syme);
-       printf("Showing %s for %s\n", event_name(sym_counter), symbol->name);
+       printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name);
         printf("  Events  Pcnt (>=%d%%)\n", sym_pcnt_filter);
  
         pthread_mutex_lock(&syme->src->lock);
@@ -460,7 +462,8 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
  static void print_sym_table(void)
  {
         int printed = 0, j;
-       int counter, snap = !display_weighted ? sym_counter : 0;
+       struct perf_evsel *counter;
+       int snap = !display_weighted ? sym_counter : 0;
         float samples_per_sec = samples/delay_secs;
         float ksamples_per_sec = kernel_samples/delay_secs;
         float us_samples_per_sec = (us_samples)/delay_secs;
@@ -532,7 +535,9 @@ static void print_sym_table(void)
         }
  
         if (nr_counters == 1 || !display_weighted) {
-               printf("%Ld", (u64)attrs[0].sample_period);
+               struct perf_evsel *first;
+               first = list_entry(evsel_list.next, struct perf_evsel, node);
+               printf("%Ld", first->attr.sample_period);
                 if (freq)
                         printf("Hz ");
                 else
@@ -540,9 +545,9 @@ static void print_sym_table(void)
         }
  
         if (!display_weighted)
-               printf("%s", event_name(sym_counter));
-       else for (counter = 0; counter < nr_counters; counter++) {
-               if (counter)
+               printf("%s", event_name(sym_evsel));
+       else list_for_each_entry(counter, &evsel_list, node) {
+               if (counter->idx)
                         printf("/");
  
                 printf("%s", event_name(counter));
@@ -558,12 +563,12 @@ static void print_sym_table(void)
                 printf(" (all");
  
         if (cpu_list)
-               printf(", CPU%s: %s)\n", nr_cpus > 1 ? "s" : "", cpu_list);
+               printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list);
         else {
                 if (target_tid != -1)
                         printf(")\n");
                 else
-                       printf(", %d CPU%s)\n", nr_cpus, nr_cpus > 1 ? "s" : "");
+                       printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : "");
         }
  
         printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
@@ -739,7 +744,7 @@ static void print_mapped_keys(void)
         fprintf(stdout, "\t[e]     display entries (lines).           \t(%d)\n", print_entries);
  
         if (nr_counters > 1)
-               fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_counter));
+               fprintf(stdout, "\t[E]     active event counter.              \t(%s)\n", event_name(sym_evsel));
  
         fprintf(stdout, "\t[f]     profile display filter (count).    \t(%d)\n", count_filter);
  
@@ -826,19 +831,23 @@ static void handle_keypress(struct perf_session *session, int c)
                         break;
                 case 'E':
                         if (nr_counters > 1) {
-                               int i;
-
                                 fprintf(stderr, "\nAvailable events:");
-                               for (i = 0; i < nr_counters; i++)
-                                       fprintf(stderr, "\n\t%d %s", i, event_name(i));
+
+                               list_for_each_entry(sym_evsel, &evsel_list, node)
+                                       fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel));
  
                                 prompt_integer(&sym_counter, "Enter details event counter");
  
                                 if (sym_counter >= nr_counters) {
-                                       fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0));
+                                       sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
                                         sym_counter = 0;
+                                       fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel));
                                         sleep(1);
+                                       break;
                                 }
+                               list_for_each_entry(sym_evsel, &evsel_list, node)
+                                       if (sym_evsel->idx == sym_counter)
+                                               break;
                         } else sym_counter = 0;
                         break;
                 case 'f':
@@ -977,12 +986,13 @@ static int symbol_filter(struct map *map, struct symbol *sym)
  }
  
  static void event__process_sample(const event_t *self,
-                                struct perf_session *session, int counter)
+                                 struct sample_data *sample,
+                                 struct perf_session *session,
+                                 struct perf_evsel *evsel)
  {
         u64 ip = self->ip.ip;
         struct sym_entry *syme;
         struct addr_location al;
-       struct sample_data data;
         struct machine *machine;
         u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
  
@@ -1025,7 +1035,7 @@ static void event__process_sample(const event_t *self,
         if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
                 exact_samples++;
  
-       if (event__preprocess_sample(self, session, &al, &data,
+       if (event__preprocess_sample(self, session, &al, sample,
                                      symbol_filter) < 0 ||
             al.filtered)
                 return;
@@ -1071,9 +1081,9 @@ static void event__process_sample(const event_t *self,
  
         syme = symbol__priv(al.sym);
         if (!syme->skip) {
-               syme->count[counter]++;
+               syme->count[evsel->idx]++;
                 syme->origin = origin;
-               record_precise_ip(syme, counter, ip);
+               record_precise_ip(syme, evsel->idx, ip);
                 pthread_mutex_lock(&active_symbols_lock);
                 if (list_empty(&syme->node) || !syme->node.next)
                         __list_insert_active_sym(syme);
@@ -1082,12 +1092,24 @@ static void event__process_sample(const event_t *self,
  }
  
  struct mmap_data {
-       int                     counter;
         void                    *base;
         int                     mask;
         unsigned int            prev;
  };
  
+static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel,
+                                            int ncpus, int nthreads)
+{
+       evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data));
+       return evsel->priv != NULL ? 0 : -ENOMEM;
+}
+
+static void perf_evsel__free_mmap(struct perf_evsel *evsel)
+{
+       xyarray__delete(evsel->priv);
+       evsel->priv = NULL;
+}
+
  static unsigned int mmap_read_head(struct mmap_data *md)
  {
         struct perf_event_mmap_page *pc = md->base;
@@ -1100,11 +1122,15 @@ static unsigned int mmap_read_head(struct mmap_data *md)
  }
  
  static void perf_session__mmap_read_counter(struct perf_session *self,
-                                           struct mmap_data *md)
+                                           struct perf_evsel *evsel,
+                                           int cpu, int thread_idx)
  {
+       struct xyarray *mmap_array = evsel->priv;
+       struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx);
         unsigned int head = mmap_read_head(md);
         unsigned int old = md->prev;
         unsigned char *data = md->base + page_size;
+       struct sample_data sample;
         int diff;
  
         /*
@@ -1152,10 +1178,11 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
                         event = &event_copy;
                 }
  
+               event__parse_sample(event, self, &sample);
                 if (event->header.type == PERF_RECORD_SAMPLE)
-                       event__process_sample(event, self, md->counter);
+                       event__process_sample(event, &sample, self, evsel);
                 else
-                       event__process(event, self);
+                       event__process(event, &sample, self);
                 old += size;
         }
  
@@ -1163,36 +1190,39 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
  }
  
  static struct pollfd *event_array;
-static struct mmap_data *mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
  
  static void perf_session__mmap_read(struct perf_session *self)
  {
-       int i, counter, thread_index;
+       struct perf_evsel *counter;
+       int i, thread_index;
  
-       for (i = 0; i < nr_cpus; i++) {
-               for (counter = 0; counter < nr_counters; counter++)
+       for (i = 0; i < cpus->nr; i++) {
+               list_for_each_entry(counter, &evsel_list, node) {
                         for (thread_index = 0;
-                               thread_index < thread_num;
+                               thread_index < threads->nr;
                                 thread_index++) {
                                 perf_session__mmap_read_counter(self,
-                                       &mmap_array[i][counter][thread_index]);
+                                       counter, i, thread_index);
                         }
+               }
         }
  }
  
  int nr_poll;
  int group_fd;
  
-static void start_counter(int i, int counter)
+static void start_counter(int i, struct perf_evsel *evsel)
  {
+       struct xyarray *mmap_array = evsel->priv;
+       struct mmap_data *mm;
         struct perf_event_attr *attr;
         int cpu = -1;
         int thread_index;
  
         if (target_tid == -1)
-               cpu = cpumap[i];
+               cpu = cpus->map[i];
  
-       attr = attrs + counter;
+       attr = &evsel->attr;
  
         attr->sample_type       = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
  
@@ -1205,16 +1235,18 @@ static void start_counter(int i, int counter)
         attr->inherit           = (cpu < 0) && inherit;
         attr->mmap              = 1;
  
-       for (thread_index = 0; thread_index < thread_num; thread_index++) {
+       for (thread_index = 0; thread_index < threads->nr; thread_index++) {
  try_again:
-               fd[i][counter][thread_index] = sys_perf_event_open(attr,
-                               all_tids[thread_index], cpu, group_fd, 0);
+               FD(evsel, i, thread_index) = sys_perf_event_open(attr,
+                               threads->map[thread_index], cpu, group_fd, 0);
  
-               if (fd[i][counter][thread_index] < 0) {
+               if (FD(evsel, i, thread_index) < 0) {
                         int err = errno;
  
                         if (err == EPERM || err == EACCES)
-                               die("No permission - are you root?\n");
+                               die("Permission error - are you root?\n"
+                                       "\t Consider tweaking"
+                                       " /proc/sys/kernel/perf_event_paranoid.\n");
                         /*
                          * If it's cycles then fall back to hrtimer
                          * based cpu-clock-tick sw counter, which
@@ -1231,30 +1263,30 @@ try_again:
                                 goto try_again;
                         }
                         printf("\n");
-                       error("perfcounter syscall returned with %d (%s)\n",
-                                       fd[i][counter][thread_index], strerror(err));
+                       error("sys_perf_event_open() syscall returned with %d (%s).  /bin/dmesg may provide additional information.\n",
+                                       FD(evsel, i, thread_index), strerror(err));
                         die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
                         exit(-1);
                 }
-               assert(fd[i][counter][thread_index] >= 0);
-               fcntl(fd[i][counter][thread_index], F_SETFL, O_NONBLOCK);
+               assert(FD(evsel, i, thread_index) >= 0);
+               fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
  
                 /*
                  * First counter acts as the group leader:
                  */
                 if (group && group_fd == -1)
-                       group_fd = fd[i][counter][thread_index];
+                       group_fd = FD(evsel, i, thread_index);
  
-               event_array[nr_poll].fd = fd[i][counter][thread_index];
+               event_array[nr_poll].fd = FD(evsel, i, thread_index);
                 event_array[nr_poll].events = POLLIN;
                 nr_poll++;
  
-               mmap_array[i][counter][thread_index].counter = counter;
-               mmap_array[i][counter][thread_index].prev = 0;
-               mmap_array[i][counter][thread_index].mask = mmap_pages*page_size - 1;
-               mmap_array[i][counter][thread_index].base = mmap(NULL, (mmap_pages+1)*page_size,
-                               PROT_READ, MAP_SHARED, fd[i][counter][thread_index], 0);
-               if (mmap_array[i][counter][thread_index].base == MAP_FAILED)
+               mm = xyarray__entry(mmap_array, i, thread_index);
+               mm->prev = 0;
+               mm->mask = mmap_pages*page_size - 1;
+               mm->base = mmap(NULL, (mmap_pages+1)*page_size,
+                               PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
+               if (mm->base == MAP_FAILED)
                         die("failed to mmap with %d (%s)\n", errno, strerror(errno));
         }
  }
@@ -1262,13 +1294,13 @@ try_again:
  static int __cmd_top(void)
  {
         pthread_t thread;
-       int i, counter;
-       int ret;
+       struct perf_evsel *counter;
+       int i, ret;
         /*
          * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
          * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
          */
-       struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false);
+       struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
         if (session == NULL)
                 return -ENOMEM;
  
@@ -1277,9 +1309,9 @@ static int __cmd_top(void)
         else
                 event__synthesize_threads(event__process, session);
  
-       for (i = 0; i < nr_cpus; i++) {
+       for (i = 0; i < cpus->nr; i++) {
                 group_fd = -1;
-               for (counter = 0; counter < nr_counters; counter++)
+               list_for_each_entry(counter, &evsel_list, node)
                         start_counter(i, counter);
         }
  
@@ -1368,8 +1400,8 @@ static const struct option options[] = {
  
  int cmd_top(int argc, const char **argv, const char *prefix __used)
  {
-       int counter;
-       int i,j;
+       struct perf_evsel *pos;
+       int status = -ENOMEM;
  
         page_size = sysconf(_SC_PAGE_SIZE);
  
@@ -1377,34 +1409,17 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
         if (argc)
                 usage_with_options(top_usage, options);
  
-       if (target_pid != -1) {
+       if (target_pid != -1)
                 target_tid = target_pid;
-               thread_num = find_all_tid(target_pid, &all_tids);
-               if (thread_num <= 0) {
-                       fprintf(stderr, "Can't find all threads of pid %d\n",
-                               target_pid);
-                       usage_with_options(top_usage, options);
-               }
-       } else {
-               all_tids=malloc(sizeof(pid_t));
-               if (!all_tids)
-                       return -ENOMEM;
  
-               all_tids[0] = target_tid;
-               thread_num = 1;
+       threads = thread_map__new(target_pid, target_tid);
+       if (threads == NULL) {
+               pr_err("Problems finding threads of monitor\n");
+               usage_with_options(top_usage, options);
         }
  
-       for (i = 0; i < MAX_NR_CPUS; i++) {
-               for (j = 0; j < MAX_COUNTERS; j++) {
-                       fd[i][j] = malloc(sizeof(int)*thread_num);
-                       mmap_array[i][j] = zalloc(
-                               sizeof(struct mmap_data)*thread_num);
-                       if (!fd[i][j] || !mmap_array[i][j])
-                               return -ENOMEM;
-               }
-       }
-       event_array = malloc(
-               sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+       event_array = malloc((sizeof(struct pollfd) *
+                             MAX_NR_CPUS * MAX_COUNTERS * threads->nr));
         if (!event_array)
                 return -ENOMEM;
  
@@ -1415,15 +1430,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                 cpu_list = NULL;
         }
  
-       if (!nr_counters)
-               nr_counters = 1;
-
-       symbol_conf.priv_size = (sizeof(struct sym_entry) +
-                                (nr_counters + 1) * sizeof(unsigned long));
-
-       symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
-       if (symbol__init() < 0)
-               return -1;
+       if (!nr_counters && perf_evsel_list__create_default() < 0) {
+               pr_err("Not enough memory for event selector list\n");
+               return -ENOMEM;
+       }
  
         if (delay_secs < 1)
                 delay_secs = 1;
@@ -1440,23 +1450,33 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                 exit(EXIT_FAILURE);
         }
  
-       /*
-        * Fill in the ones not specifically initialized via -c:
-        */
-       for (counter = 0; counter < nr_counters; counter++) {
-               if (attrs[counter].sample_period)
+       if (target_tid != -1)
+               cpus = cpu_map__dummy_new();
+       else
+               cpus = cpu_map__new(cpu_list);
+
+       if (cpus == NULL)
+               usage_with_options(top_usage, options);
+
+       list_for_each_entry(pos, &evsel_list, node) {
+               if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
+                   perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+                       goto out_free_fd;
+               /*
+                * Fill in the ones not specifically initialized via -c:
+                */
+               if (pos->attr.sample_period)
                         continue;
  
-               attrs[counter].sample_period = default_interval;
+               pos->attr.sample_period = default_interval;
         }
  
-       if (target_tid != -1)
-               nr_cpus = 1;
-       else
-               nr_cpus = read_cpu_map(cpu_list);
+       symbol_conf.priv_size = (sizeof(struct sym_entry) +
+                                (nr_counters + 1) * sizeof(unsigned long));
  
-       if (nr_cpus < 1)
-               usage_with_options(top_usage, options);
+       symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
+       if (symbol__init() < 0)
+               return -1;
  
         get_term_dimensions(&winsize);
         if (print_entries == 0) {
@@ -1464,5 +1484,10 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
                 signal(SIGWINCH, sig_winch_handler);
         }
  
-       return __cmd_top();
+       status = __cmd_top();
+out_free_fd:
+       list_for_each_entry(pos, &evsel_list, node)
+               perf_evsel__free_mmap(pos);
+
+       return status;
  }
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c

deleted file mode 100644 (file)

index 86cfe38..0000000
--- a/tools/perf/builtin-trace.c
+++ /dev/null
@@ -1,826 +0,0 @@
-#include "builtin.h"
-
-#include "perf.h"
-#include "util/cache.h"
-#include "util/debug.h"
-#include "util/exec_cmd.h"
-#include "util/header.h"
-#include "util/parse-options.h"
-#include "util/session.h"
-#include "util/symbol.h"
-#include "util/thread.h"
-#include "util/trace-event.h"
-#include "util/parse-options.h"
-#include "util/util.h"
-
-static char const              *script_name;
-static char const              *generate_script_lang;
-static bool                    debug_mode;
-static u64                     last_timestamp;
-static u64                     nr_unordered;
-extern const struct option     record_options[];
-
-static int default_start_script(const char *script __unused,
-                               int argc __unused,
-                               const char **argv __unused)
-{
-       return 0;
-}
-
-static int default_stop_script(void)
-{
-       return 0;
-}
-
-static int default_generate_script(const char *outfile __unused)
-{
-       return 0;
-}
-
-static struct scripting_ops default_scripting_ops = {
-       .start_script           = default_start_script,
-       .stop_script            = default_stop_script,
-       .process_event          = print_event,
-       .generate_script        = default_generate_script,
-};
-
-static struct scripting_ops    *scripting_ops;
-
-static void setup_scripting(void)
-{
-       setup_perl_scripting();
-       setup_python_scripting();
-
-       scripting_ops = &default_scripting_ops;
-}
-
-static int cleanup_scripting(void)
-{
-       pr_debug("\nperf trace script stopped\n");
-
-       return scripting_ops->stop_script();
-}
-
-static char const              *input_name = "perf.data";
-
-static int process_sample_event(event_t *event, struct perf_session *session)
-{
-       struct sample_data data;
-       struct thread *thread;
-
-       memset(&data, 0, sizeof(data));
-       data.time = -1;
-       data.cpu = -1;
-       data.period = 1;
-
-       event__parse_sample(event, session->sample_type, &data);
-
-       dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
-                   data.pid, data.tid, data.ip, data.period);
-
-       thread = perf_session__findnew(session, event->ip.pid);
-       if (thread == NULL) {
-               pr_debug("problem processing %d event, skipping it.\n",
-                        event->header.type);
-               return -1;
-       }
-
-       if (session->sample_type & PERF_SAMPLE_RAW) {
-               if (debug_mode) {
-                       if (data.time < last_timestamp) {
-                               pr_err("Samples misordered, previous: %llu "
-                                       "this: %llu\n", last_timestamp,
-                                       data.time);
-                               nr_unordered++;
-                       }
-                       last_timestamp = data.time;
-                       return 0;
-               }
-               /*
-                * FIXME: better resolve from pid from the struct trace_entry
-                * field, although it should be the same than this perf
-                * event pid
-                */
-               scripting_ops->process_event(data.cpu, data.raw_data,
-                                            data.raw_size,
-                                            data.time, thread->comm);
-       }
-
-       session->hists.stats.total_period += data.period;
-       return 0;
-}
-
-static u64 nr_lost;
-
-static int process_lost_event(event_t *event, struct perf_session *session __used)
-{
-       nr_lost += event->lost.lost;
-
-       return 0;
-}
-
-static struct perf_event_ops event_ops = {
-       .sample = process_sample_event,
-       .comm   = event__process_comm,
-       .attr   = event__process_attr,
-       .event_type = event__process_event_type,
-       .tracing_data = event__process_tracing_data,
-       .build_id = event__process_build_id,
-       .lost = process_lost_event,
-       .ordered_samples = true,
-};
-
-extern volatile int session_done;
-
-static void sig_handler(int sig __unused)
-{
-       session_done = 1;
-}
-
-static int __cmd_trace(struct perf_session *session)
-{
-       int ret;
-
-       signal(SIGINT, sig_handler);
-
-       ret = perf_session__process_events(session, &event_ops);
-
-       if (debug_mode) {
-               pr_err("Misordered timestamps: %llu\n", nr_unordered);
-               pr_err("Lost events: %llu\n", nr_lost);
-       }
-
-       return ret;
-}
-
-struct script_spec {
-       struct list_head        node;
-       struct scripting_ops    *ops;
-       char                    spec[0];
-};
-
-LIST_HEAD(script_specs);
-
-static struct script_spec *script_spec__new(const char *spec,
-                                           struct scripting_ops *ops)
-{
-       struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
-
-       if (s != NULL) {
-               strcpy(s->spec, spec);
-               s->ops = ops;
-       }
-
-       return s;
-}
-
-static void script_spec__delete(struct script_spec *s)
-{
-       free(s->spec);
-       free(s);
-}
-
-static void script_spec__add(struct script_spec *s)
-{
-       list_add_tail(&s->node, &script_specs);
-}
-
-static struct script_spec *script_spec__find(const char *spec)
-{
-       struct script_spec *s;
-
-       list_for_each_entry(s, &script_specs, node)
-               if (strcasecmp(s->spec, spec) == 0)
-                       return s;
-       return NULL;
-}
-
-static struct script_spec *script_spec__findnew(const char *spec,
-                                               struct scripting_ops *ops)
-{
-       struct script_spec *s = script_spec__find(spec);
-
-       if (s)
-               return s;
-
-       s = script_spec__new(spec, ops);
-       if (!s)
-               goto out_delete_spec;
-
-       script_spec__add(s);
-
-       return s;
-
-out_delete_spec:
-       script_spec__delete(s);
-
-       return NULL;
-}
-
-int script_spec_register(const char *spec, struct scripting_ops *ops)
-{
-       struct script_spec *s;
-
-       s = script_spec__find(spec);
-       if (s)
-               return -1;
-
-       s = script_spec__findnew(spec, ops);
-       if (!s)
-               return -1;
-
-       return 0;
-}
-
-static struct scripting_ops *script_spec__lookup(const char *spec)
-{
-       struct script_spec *s = script_spec__find(spec);
-       if (!s)
-               return NULL;
-
-       return s->ops;
-}
-
-static void list_available_languages(void)
-{
-       struct script_spec *s;
-
-       fprintf(stderr, "\n");
-       fprintf(stderr, "Scripting language extensions (used in "
-               "perf trace -s [spec:]script.[spec]):\n\n");
-
-       list_for_each_entry(s, &script_specs, node)
-               fprintf(stderr, "  %-42s [%s]\n", s->spec, s->ops->name);
-
-       fprintf(stderr, "\n");
-}
-
-static int parse_scriptname(const struct option *opt __used,
-                           const char *str, int unset __used)
-{
-       char spec[PATH_MAX];
-       const char *script, *ext;
-       int len;
-
-       if (strcmp(str, "lang") == 0) {
-               list_available_languages();
-               exit(0);
-       }
-
-       script = strchr(str, ':');
-       if (script) {
-               len = script - str;
-               if (len >= PATH_MAX) {
-                       fprintf(stderr, "invalid language specifier");
-                       return -1;
-               }
-               strncpy(spec, str, len);
-               spec[len] = '\0';
-               scripting_ops = script_spec__lookup(spec);
-               if (!scripting_ops) {
-                       fprintf(stderr, "invalid language specifier");
-                       return -1;
-               }
-               script++;
-       } else {
-               script = str;
-               ext = strrchr(script, '.');
-               if (!ext) {
-                       fprintf(stderr, "invalid script extension");
-                       return -1;
-               }
-               scripting_ops = script_spec__lookup(++ext);
-               if (!scripting_ops) {
-                       fprintf(stderr, "invalid script extension");
-                       return -1;
-               }
-       }
-
-       script_name = strdup(script);
-
-       return 0;
-}
-
-#define for_each_lang(scripts_dir, lang_dirent, lang_next)             \
-       while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) &&     \
-              lang_next)                                               \
-               if (lang_dirent.d_type == DT_DIR &&                     \
-                   (strcmp(lang_dirent.d_name, ".")) &&                \
-                   (strcmp(lang_dirent.d_name, "..")))
-
-#define for_each_script(lang_dir, script_dirent, script_next)          \
-       while (!readdir_r(lang_dir, &script_dirent, &script_next) &&    \
-              script_next)                                             \
-               if (script_dirent.d_type != DT_DIR)
-
-
-#define RECORD_SUFFIX                  "-record"
-#define REPORT_SUFFIX                  "-report"
-
-struct script_desc {
-       struct list_head        node;
-       char                    *name;
-       char                    *half_liner;
-       char                    *args;
-};
-
-LIST_HEAD(script_descs);
-
-static struct script_desc *script_desc__new(const char *name)
-{
-       struct script_desc *s = zalloc(sizeof(*s));
-
-       if (s != NULL && name)
-               s->name = strdup(name);
-
-       return s;
-}
-
-static void script_desc__delete(struct script_desc *s)
-{
-       free(s->name);
-       free(s->half_liner);
-       free(s->args);
-       free(s);
-}
-
-static void script_desc__add(struct script_desc *s)
-{
-       list_add_tail(&s->node, &script_descs);
-}
-
-static struct script_desc *script_desc__find(const char *name)
-{
-       struct script_desc *s;
-
-       list_for_each_entry(s, &script_descs, node)
-               if (strcasecmp(s->name, name) == 0)
-                       return s;
-       return NULL;
-}
-
-static struct script_desc *script_desc__findnew(const char *name)
-{
-       struct script_desc *s = script_desc__find(name);
-
-       if (s)
-               return s;
-
-       s = script_desc__new(name);
-       if (!s)
-               goto out_delete_desc;
-
-       script_desc__add(s);
-
-       return s;
-
-out_delete_desc:
-       script_desc__delete(s);
-
-       return NULL;
-}
-
-static char *ends_with(char *str, const char *suffix)
-{
-       size_t suffix_len = strlen(suffix);
-       char *p = str;
-
-       if (strlen(str) > suffix_len) {
-               p = str + strlen(str) - suffix_len;
-               if (!strncmp(p, suffix, suffix_len))
-                       return p;
-       }
-
-       return NULL;
-}
-
-static char *ltrim(char *str)
-{
-       int len = strlen(str);
-
-       while (len && isspace(*str)) {
-               len--;
-               str++;
-       }
-
-       return str;
-}
-
-static int read_script_info(struct script_desc *desc, const char *filename)
-{
-       char line[BUFSIZ], *p;
-       FILE *fp;
-
-       fp = fopen(filename, "r");
-       if (!fp)
-               return -1;
-
-       while (fgets(line, sizeof(line), fp)) {
-               p = ltrim(line);
-               if (strlen(p) == 0)
-                       continue;
-               if (*p != '#')
-                       continue;
-               p++;
-               if (strlen(p) && *p == '!')
-                       continue;
-
-               p = ltrim(p);
-               if (strlen(p) && p[strlen(p) - 1] == '\n')
-                       p[strlen(p) - 1] = '\0';
-
-               if (!strncmp(p, "description:", strlen("description:"))) {
-                       p += strlen("description:");
-                       desc->half_liner = strdup(ltrim(p));
-                       continue;
-               }
-
-               if (!strncmp(p, "args:", strlen("args:"))) {
-                       p += strlen("args:");
-                       desc->args = strdup(ltrim(p));
-                       continue;
-               }
-       }
-
-       fclose(fp);
-
-       return 0;
-}
-
-static int list_available_scripts(const struct option *opt __used,
-                                 const char *s __used, int unset __used)
-{
-       struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
-       char scripts_path[MAXPATHLEN];
-       DIR *scripts_dir, *lang_dir;
-       char script_path[MAXPATHLEN];
-       char lang_path[MAXPATHLEN];
-       struct script_desc *desc;
-       char first_half[BUFSIZ];
-       char *script_root;
-       char *str;
-
-       snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
-
-       scripts_dir = opendir(scripts_path);
-       if (!scripts_dir)
-               return -1;
-
-       for_each_lang(scripts_dir, lang_dirent, lang_next) {
-               snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
-                        lang_dirent.d_name);
-               lang_dir = opendir(lang_path);
-               if (!lang_dir)
-                       continue;
-
-               for_each_script(lang_dir, script_dirent, script_next) {
-                       script_root = strdup(script_dirent.d_name);
-                       str = ends_with(script_root, REPORT_SUFFIX);
-                       if (str) {
-                               *str = '\0';
-                               desc = script_desc__findnew(script_root);
-                               snprintf(script_path, MAXPATHLEN, "%s/%s",
-                                        lang_path, script_dirent.d_name);
-                               read_script_info(desc, script_path);
-                       }
-                       free(script_root);
-               }
-       }
-
-       fprintf(stdout, "List of available trace scripts:\n");
-       list_for_each_entry(desc, &script_descs, node) {
-               sprintf(first_half, "%s %s", desc->name,
-                       desc->args ? desc->args : "");
-               fprintf(stdout, "  %-36s %s\n", first_half,
-                       desc->half_liner ? desc->half_liner : "");
-       }
-
-       exit(0);
-}
-
-static char *get_script_path(const char *script_root, const char *suffix)
-{
-       struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
-       char scripts_path[MAXPATHLEN];
-       char script_path[MAXPATHLEN];
-       DIR *scripts_dir, *lang_dir;
-       char lang_path[MAXPATHLEN];
-       char *str, *__script_root;
-       char *path = NULL;
-
-       snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
-
-       scripts_dir = opendir(scripts_path);
-       if (!scripts_dir)
-               return NULL;
-
-       for_each_lang(scripts_dir, lang_dirent, lang_next) {
-               snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
-                        lang_dirent.d_name);
-               lang_dir = opendir(lang_path);
-               if (!lang_dir)
-                       continue;
-
-               for_each_script(lang_dir, script_dirent, script_next) {
-                       __script_root = strdup(script_dirent.d_name);
-                       str = ends_with(__script_root, suffix);
-                       if (str) {
-                               *str = '\0';
-                               if (strcmp(__script_root, script_root))
-                                       continue;
-                               snprintf(script_path, MAXPATHLEN, "%s/%s",
-                                        lang_path, script_dirent.d_name);
-                               path = strdup(script_path);
-                               free(__script_root);
-                               break;
-                       }
-                       free(__script_root);
-               }
-       }
-
-       return path;
-}
-
-static bool is_top_script(const char *script_path)
-{
-       return ends_with((char *)script_path, "top") == NULL ? false : true;
-}
-
-static int has_required_arg(char *script_path)
-{
-       struct script_desc *desc;
-       int n_args = 0;
-       char *p;
-
-       desc = script_desc__new(NULL);
-
-       if (read_script_info(desc, script_path))
-               goto out;
-
-       if (!desc->args)
-               goto out;
-
-       for (p = desc->args; *p; p++)
-               if (*p == '<')
-                       n_args++;
-out:
-       script_desc__delete(desc);
-
-       return n_args;
-}
-
-static const char * const trace_usage[] = {
-       "perf trace [<options>]",
-       "perf trace [<options>] record <script> [<record-options>] <command>",
-       "perf trace [<options>] report <script> [script-args]",
-       "perf trace [<options>] <script> [<record-options>] <command>",
-       "perf trace [<options>] <top-script> [script-args]",
-       NULL
-};
-
-static const struct option options[] = {
-       OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
-                   "dump raw trace in ASCII"),
-       OPT_INCR('v', "verbose", &verbose,
-                   "be more verbose (show symbol address, etc)"),
-       OPT_BOOLEAN('L', "Latency", &latency_format,
-                   "show latency attributes (irqs/preemption disabled, etc)"),
-       OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
-                          list_available_scripts),
-       OPT_CALLBACK('s', "script", NULL, "name",
-                    "script file name (lang:script name, script name, or *)",
-                    parse_scriptname),
-       OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
-                  "generate perf-trace.xx script in specified language"),
-       OPT_STRING('i', "input", &input_name, "file",
-                   "input file name"),
-       OPT_BOOLEAN('d', "debug-mode", &debug_mode,
-                  "do various checks like samples ordering and lost events"),
-
-       OPT_END()
-};
-
-static bool have_cmd(int argc, const char **argv)
-{
-       char **__argv = malloc(sizeof(const char *) * argc);
-
-       if (!__argv)
-               die("malloc");
-       memcpy(__argv, argv, sizeof(const char *) * argc);
-       argc = parse_options(argc, (const char **)__argv, record_options,
-                            NULL, PARSE_OPT_STOP_AT_NON_OPTION);
-       free(__argv);
-
-       return argc != 0;
-}
-
-int cmd_trace(int argc, const char **argv, const char *prefix __used)
-{
-       char *rec_script_path = NULL;
-       char *rep_script_path = NULL;
-       struct perf_session *session;
-       char *script_path = NULL;
-       const char **__argv;
-       bool system_wide;
-       int i, j, err;
-
-       setup_scripting();
-
-       argc = parse_options(argc, argv, options, trace_usage,
-                            PARSE_OPT_STOP_AT_NON_OPTION);
-
-       if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
-               rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
-               if (!rec_script_path)
-                       return cmd_record(argc, argv, NULL);
-       }
-
-       if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
-               rep_script_path = get_script_path(argv[1], REPORT_SUFFIX);
-               if (!rep_script_path) {
-                       fprintf(stderr,
-                               "Please specify a valid report script"
-                               "(see 'perf trace -l' for listing)\n");
-                       return -1;
-               }
-       }
-
-       /* make sure PERF_EXEC_PATH is set for scripts */
-       perf_set_argv_exec_path(perf_exec_path());
-
-       if (argc && !script_name && !rec_script_path && !rep_script_path) {
-               int live_pipe[2];
-               int rep_args;
-               pid_t pid;
-
-               rec_script_path = get_script_path(argv[0], RECORD_SUFFIX);
-               rep_script_path = get_script_path(argv[0], REPORT_SUFFIX);
-
-               if (!rec_script_path && !rep_script_path) {
-                       fprintf(stderr, " Couldn't find script %s\n\n See perf"
-                               " trace -l for available scripts.\n", argv[0]);
-                       usage_with_options(trace_usage, options);
-               }
-
-               if (is_top_script(argv[0])) {
-                       rep_args = argc - 1;
-               } else {
-                       int rec_args;
-
-                       rep_args = has_required_arg(rep_script_path);
-                       rec_args = (argc - 1) - rep_args;
-                       if (rec_args < 0) {
-                               fprintf(stderr, " %s script requires options."
-                                       "\n\n See perf trace -l for available "
-                                       "scripts and options.\n", argv[0]);
-                               usage_with_options(trace_usage, options);
-                       }
-               }
-
-               if (pipe(live_pipe) < 0) {
-                       perror("failed to create pipe");
-                       exit(-1);
-               }
-
-               pid = fork();
-               if (pid < 0) {
-                       perror("failed to fork");
-                       exit(-1);
-               }
-
-               if (!pid) {
-                       system_wide = true;
-                       j = 0;
-
-                       dup2(live_pipe[1], 1);
-                       close(live_pipe[0]);
-
-                       if (!is_top_script(argv[0]))
-                               system_wide = !have_cmd(argc - rep_args,
-                                                       &argv[rep_args]);
-
-                       __argv = malloc((argc + 6) * sizeof(const char *));
-                       if (!__argv)
-                               die("malloc");
-
-                       __argv[j++] = "/bin/sh";
-                       __argv[j++] = rec_script_path;
-                       if (system_wide)
-                               __argv[j++] = "-a";
-                       __argv[j++] = "-q";
-                       __argv[j++] = "-o";
-                       __argv[j++] = "-";
-                       for (i = rep_args + 1; i < argc; i++)
-                               __argv[j++] = argv[i];
-                       __argv[j++] = NULL;
-
-                       execvp("/bin/sh", (char **)__argv);
-                       free(__argv);
-                       exit(-1);
-               }
-
-               dup2(live_pipe[0], 0);
-               close(live_pipe[1]);
-
-               __argv = malloc((argc + 4) * sizeof(const char *));
-               if (!__argv)
-                       die("malloc");
-               j = 0;
-               __argv[j++] = "/bin/sh";
-               __argv[j++] = rep_script_path;
-               for (i = 1; i < rep_args + 1; i++)
-                       __argv[j++] = argv[i];
-               __argv[j++] = "-i";
-               __argv[j++] = "-";
-               __argv[j++] = NULL;
-
-               execvp("/bin/sh", (char **)__argv);
-               free(__argv);
-               exit(-1);
-       }
-
-       if (rec_script_path)
-               script_path = rec_script_path;
-       if (rep_script_path)
-               script_path = rep_script_path;
-
-       if (script_path) {
-               system_wide = false;
-               j = 0;
-
-               if (rec_script_path)
-                       system_wide = !have_cmd(argc - 1, &argv[1]);
-
-               __argv = malloc((argc + 2) * sizeof(const char *));
-               if (!__argv)
-                       die("malloc");
-               __argv[j++] = "/bin/sh";
-               __argv[j++] = script_path;
-               if (system_wide)
-                       __argv[j++] = "-a";
-               for (i = 2; i < argc; i++)
-                       __argv[j++] = argv[i];
-               __argv[j++] = NULL;
-
-               execvp("/bin/sh", (char **)__argv);
-               free(__argv);
-               exit(-1);
-       }
-
-       if (symbol__init() < 0)
-               return -1;
-       if (!script_name)
-               setup_pager();
-
-       session = perf_session__new(input_name, O_RDONLY, 0, false);
-       if (session == NULL)
-               return -ENOMEM;
-
-       if (strcmp(input_name, "-") &&
-           !perf_session__has_traces(session, "record -R"))
-               return -EINVAL;
-
-       if (generate_script_lang) {
-               struct stat perf_stat;
-
-               int input = open(input_name, O_RDONLY);
-               if (input < 0) {
-                       perror("failed to open file");
-                       exit(-1);
-               }
-
-               err = fstat(input, &perf_stat);
-               if (err < 0) {
-                       perror("failed to stat file");
-                       exit(-1);
-               }
-
-               if (!perf_stat.st_size) {
-                       fprintf(stderr, "zero-sized file, nothing to do!\n");
-                       exit(0);
-               }
-
-               scripting_ops = script_spec__lookup(generate_script_lang);
-               if (!scripting_ops) {
-                       fprintf(stderr, "invalid language specifier");
-                       return -1;
-               }
-
-               err = scripting_ops->generate_script("perf-trace");
-               goto out;
-       }
-
-       if (script_name) {
-               err = scripting_ops->start_script(script_name, argc, argv);
-               if (err)
-                       goto out;
-               pr_debug("perf trace started with script %s\n\n", script_name);
-       }
-
-       err = __cmd_trace(session);
-
-       perf_session__delete(session);
-       cleanup_scripting();
-out:
-       return err;
-}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h

index 921245b28583e448cce49008b2482e0f59193454..c7798c7f24ed737f03a4376485be5a2e14b3aef4 100644 (file)
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -27,7 +27,7 @@ extern int cmd_report(int argc, const char **argv, const char *prefix);
  extern int cmd_stat(int argc, const char **argv, const char *prefix);
  extern int cmd_timechart(int argc, const char **argv, const char *prefix);
  extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_script(int argc, const char **argv, const char *prefix);
  extern int cmd_version(int argc, const char **argv, const char *prefix);
  extern int cmd_probe(int argc, const char **argv, const char *prefix);
  extern int cmd_kmem(int argc, const char **argv, const char *prefix);
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt

index 949d77fc0b9718d812a8906883b7a89ef99c49f0..16b5088cf8f4bbb2259aee8ad90483cd926a0935 100644 (file)
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -16,7 +16,7 @@ perf-report                   mainporcelain common
  perf-stat                      mainporcelain common
  perf-timechart                 mainporcelain common
  perf-top                       mainporcelain common
-perf-trace                     mainporcelain common
+perf-script                    mainporcelain common
  perf-probe                     mainporcelain common
  perf-kmem                      mainporcelain common
  perf-lock                      mainporcelain common
diff --git a/tools/perf/feature-tests.mak b/tools/perf/feature-tests.mak

index b253db634f04b7e8ddfddd1cc33bb3ce8343a49a..b041ca67a2cbdee01c87ff92ec21deda82ae1b5c 100644 (file)
--- a/tools/perf/feature-tests.mak
+++ b/tools/perf/feature-tests.mak
@@ -9,8 +9,8 @@ endef
  ifndef NO_DWARF
  define SOURCE_DWARF
  #include <dwarf.h>
-#include <libdw.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
  #ifndef _ELFUTILS_PREREQ
  #error
  #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c

index cdd6c03f1e14c132e550b85e07b22e7621a710d2..5b1ecd66bb36a053b6427020f9aa3361d4ec1265 100644 (file)
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -286,6 +286,8 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
         status = p->fn(argc, argv, prefix);
         exit_browser(status);
  
+       perf_evsel_list__delete();
+
         if (status)
                 return status & 0xff;
  
@@ -323,7 +325,7 @@ static void handle_internal_command(int argc, const char **argv)
                 { "top",        cmd_top,        0 },
                 { "annotate",   cmd_annotate,   0 },
                 { "version",    cmd_version,    0 },
-               { "trace",      cmd_trace,      0 },
+               { "script",     cmd_script,     0 },
                 { "sched",      cmd_sched,      0 },
                 { "probe",      cmd_probe,      0 },
                 { "kmem",       cmd_kmem,       0 },
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c

index 01a64ad693f2a7c4ac8600e8d37f9a90ffad1b00..790ceba6ad3f4a4102a1affa81a637ef774d7d43 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
@@ -8,7 +8,7 @@
  
  #line 1 "Context.xs"
  /*
- * Context.xs.  XS interfaces for perf trace.
+ * Context.xs.  XS interfaces for perf script.
   *
   * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
   *
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs

index 549cf0467d309eda5be9c0faee7897a5608371b4..c1e2ed1ed34e4e16e3398acd12f3e5723a81ddee 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
@@ -1,5 +1,5 @@
  /*
- * Context.xs.  XS interfaces for perf trace.
+ * Context.xs.  XS interfaces for perf script.
   *
   * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
   *
@@ -23,7 +23,7 @@
  #include "perl.h"
  #include "XSUB.h"
  #include "../../../perf.h"
-#include "../../../util/trace-event.h"
+#include "../../../util/script-event.h"
  
  MODULE = Perf::Trace::Context          PACKAGE = Perf::Trace::Context
  PROTOTYPES: ENABLE
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/README b/tools/perf/scripts/perl/Perf-Trace-Util/README

index 9a970763079144666b9bebb3f6d626bd7009f298..2f0c7f3043ee5d992727b50f11c404d33cfb009e 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/README
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/README
@@ -1,7 +1,7 @@
  Perf-Trace-Util version 0.01
  ============================
  
-This module contains utility functions for use with perf trace.
+This module contains utility functions for use with perf script.
  
  Core.pm and Util.pm are pure Perl modules; Core.pm contains routines
  that the core perf support for Perl calls on and should always be
@@ -33,7 +33,7 @@ After you do that:
  
  INSTALLATION
  
-Building perf with perf trace Perl scripting should install this
+Building perf with perf script Perl scripting should install this
  module in the right place.
  
  You should make sure libperl and ExtUtils/Embed.pm are installed first
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm

index 6c7f3659cb1769ca8d40bd19816d54be94dd47ca..4e2f6039ac920f60192a7cee033339afbdf26042 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
@@ -34,7 +34,7 @@ Perf::Trace::Context - Perl extension for accessing functions in perf.
  
  =head1 SEE ALSO
  
-Perf (trace) documentation
+Perf (script) documentation
  
  =head1 AUTHOR
  
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm

index 9df376a9f62971e355de96b14324c52fa9b469ea..9158458d3eeb118c35357a29af763f533e370247 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
@@ -163,7 +163,7 @@ sub dump_symbolic_fields
  __END__
  =head1 NAME
  
-Perf::Trace::Core - Perl extension for perf trace
+Perf::Trace::Core - Perl extension for perf script
  
  =head1 SYNOPSIS
  
@@ -171,7 +171,7 @@ Perf::Trace::Core - Perl extension for perf trace
  
  =head1 SEE ALSO
  
-Perf (trace) documentation
+Perf (script) documentation
  
  =head1 AUTHOR
  
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm

index d94b40c8ac857227516b6f408bf5b13a9fea03cd..053500114625515d7745757178274e097c0d6aa2 100644 (file)
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
@@ -65,7 +65,7 @@ sub clear_term
  __END__
  =head1 NAME
  
-Perf::Trace::Util - Perl extension for perf trace
+Perf::Trace::Util - Perl extension for perf script
  
  =head1 SYNOPSIS
  
@@ -73,7 +73,7 @@ Perf::Trace::Util - Perl extension for perf trace
  
  =head1 SEE ALSO
  
-Perf (trace) documentation
+Perf (script) documentation
  
  =head1 AUTHOR
  
diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-report b/tools/perf/scripts/perl/bin/failed-syscalls-report

index 4028d92dc4ae6602927d3c82c02f3973506d995e..9f83cc1ad8ba253acabff31a32987566976e6d7c 100644 (file)
--- a/tools/perf/scripts/perl/bin/failed-syscalls-report
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-report
@@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then
         shift
      fi
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-report b/tools/perf/scripts/perl/bin/rw-by-file-report

index ba25f4d41fb02a1d40303ebcdb8ba8f17ff8470d..77200b3f31003c7bd8c3a137106f461223b67a43 100644 (file)
--- a/tools/perf/scripts/perl/bin/rw-by-file-report
+++ b/tools/perf/scripts/perl/bin/rw-by-file-report
@@ -7,7 +7,4 @@ if [ $# -lt 1 ] ; then
  fi
  comm=$1
  shift
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-report b/tools/perf/scripts/perl/bin/rw-by-pid-report

index 641a3f5d085c6148e9437e10704f6953d7d1eca5..a27b9f311f959a626d9e481f39cbb65b060d743f 100644 (file)
--- a/tools/perf/scripts/perl/bin/rw-by-pid-report
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-report
@@ -1,6 +1,3 @@
  #!/bin/bash
  # description: system-wide r/w activity
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl
diff --git a/tools/perf/scripts/perl/bin/rwtop-report b/tools/perf/scripts/perl/bin/rwtop-report

index 4918dba77021e676fdfeaf6c7d3e90105979ccc8..83e11ec2e190988c142aab5d4644778b812b9447 100644 (file)
--- a/tools/perf/scripts/perl/bin/rwtop-report
+++ b/tools/perf/scripts/perl/bin/rwtop-report
@@ -17,7 +17,4 @@ if [ "$n_args" -gt 0 ] ; then
      interval=$1
      shift
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-report b/tools/perf/scripts/perl/bin/wakeup-latency-report

index 49052ebcb6326d8aa13ea309994d965ca045f58b..889e8130cca55c7235ae749c83c5a9aedb92d4fd 100644 (file)
--- a/tools/perf/scripts/perl/bin/wakeup-latency-report
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-report
@@ -1,6 +1,3 @@
  #!/bin/bash
  # description: system-wide min/max/avg wakeup latency
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report

index df0c65f4ca93de35b07bbe9e24f5a966d8b75ea8..6d91411d248caa1a0f6ade3ee39645cfb51b631d 100644 (file)
--- a/tools/perf/scripts/perl/bin/workqueue-stats-report
+++ b/tools/perf/scripts/perl/bin/workqueue-stats-report
@@ -1,7 +1,3 @@
  #!/bin/bash
  # description: workqueue stats (ins/exe/create/destroy)
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl
-
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl
diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl

index 4e7dc0a407a5fbf65d0bcfab434e6dfad23f0733..4e7076c2061610044f766dddff136bf785166256 100644 (file)
--- a/tools/perf/scripts/perl/check-perf-trace.pl
+++ b/tools/perf/scripts/perl/check-perf-trace.pl
@@ -1,4 +1,4 @@
-# perf trace event handlers, generated by perf trace -g perl
+# perf script event handlers, generated by perf script -g perl
  # (c) 2009, Tom Zanussi <tzanussi@gmail.com>
  # Licensed under the terms of the GNU GPL License version 2
  
diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl

index 2a39097687b9f70dffb2fe83f4477539ff8e98b4..74844ee2be3ef691ce9fd1c7129a158b5080a975 100644 (file)
--- a/tools/perf/scripts/perl/rw-by-file.pl
+++ b/tools/perf/scripts/perl/rw-by-file.pl
@@ -18,7 +18,7 @@ use lib "./Perf-Trace-Util/lib";
  use Perf::Trace::Core;
  use Perf::Trace::Util;
  
-my $usage = "perf trace -s rw-by-file.pl <comm>\n";
+my $usage = "perf script -s rw-by-file.pl <comm>\n";
  
  my $for_comm = shift or die $usage;
  
diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl

index b84b12699b70ba0731bdbfdd4115d62558990184..a8eaff5119e09fa953626d0ed58ddfabae18bb00 100644 (file)
--- a/tools/perf/scripts/perl/workqueue-stats.pl
+++ b/tools/perf/scripts/perl/workqueue-stats.pl
@@ -10,7 +10,7 @@
  #     workqueue:workqueue_destruction -e workqueue:workqueue_execution
  #     -e workqueue:workqueue_insertion
  #
-#   perf trace -p -s tools/perf/scripts/perl/workqueue-stats.pl
+#   perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl
  
  use 5.010000;
  use strict;
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c

index 957085dd5d8d1a2ff17ee4bb6ad3c3160f8df508..315067b8f5522ae9cafbef1df506aca814e93012 100644 (file)
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -1,5 +1,5 @@
  /*
- * Context.c.  Python interfaces for perf trace.
+ * Context.c.  Python interfaces for perf script.
   *
   * Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
   *
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py

index aad7525bca1dc5a45ca72cd4be79eec7411a2bb0..de7211e4fa471ac0a0475f15097c0e175fdf4da9 100644 (file)
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
@@ -1,4 +1,4 @@
-# Core.py - Python extension for perf trace, core functions
+# Core.py - Python extension for perf script, core functions
  #
  # Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
  #
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py

index ae9a56e43e05e37981774e9dc9ec3d0ed59f6ba2..fdd92f699055713e2d1fec1c99a61489e5812a64 100644 (file)
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py
@@ -1,4 +1,4 @@
-# SchedGui.py - Python extension for perf trace, basic GUI code for
+# SchedGui.py - Python extension for perf script, basic GUI code for
  #              traces drawing and overview.
  #
  # Copyright (C) 2010 by Frederic Weisbecker <fweisbec@gmail.com>
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py

index 13cc02b5893a7ee0a248b040eddc32cdc675a258..15c8400240fd9029ae34fca077304337d9c75ca6 100644 (file)
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -1,4 +1,4 @@
-# Util.py - Python extension for perf trace, miscellaneous utility code
+# Util.py - Python extension for perf script, miscellaneous utility code
  #
  # Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
  #
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report

index 03587021463d4ef6c7d25b4d0a852178fded5a86..fda5096d0cbf81a29792819c9648a43f89497d3c 100644 (file)
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
@@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then
         shift
      fi
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/futex-contention-report b/tools/perf/scripts/python/bin/futex-contention-report

index c8268138fb7e3e6d431be07bada51abed3022294..6c44271091abbb977b2a0ef725dea470626b1337 100644 (file)
--- a/tools/perf/scripts/python/bin/futex-contention-report
+++ b/tools/perf/scripts/python/bin/futex-contention-report
@@ -1,4 +1,4 @@
  #!/bin/bash
  # description: futext contention measurement
  
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report

index 4ad361b31249c03f10d424b842b92b6d9911acee..8f759291da86c07435a62e7fa044f8c75f9c2749 100644 (file)
--- a/tools/perf/scripts/python/bin/netdev-times-report
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -2,4 +2,4 @@
  # description: display a process of packet and processing time
  # args: [tx] [rx] [dev=] [debug]
  
-perf trace -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@
+perf script -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/bin/sched-migration-report b/tools/perf/scripts/python/bin/sched-migration-report

index df1791f07c24233c638e445d48ff3ab52955889c..68b037a1849b1aeb71ec86d21fb05af4e208fc87 100644 (file)
--- a/tools/perf/scripts/python/bin/sched-migration-report
+++ b/tools/perf/scripts/python/bin/sched-migration-report
@@ -1,3 +1,3 @@
  #!/bin/bash
  # description: sched migration overview
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py
diff --git a/tools/perf/scripts/python/bin/sctop-report b/tools/perf/scripts/python/bin/sctop-report

index 36b409c05e50ac5e6f80f7b82189aaa0d2ba096a..c32db294124da91d2654302c9d8e32673ec2bc4c 100644 (file)
--- a/tools/perf/scripts/python/bin/sctop-report
+++ b/tools/perf/scripts/python/bin/sctop-report
@@ -21,4 +21,4 @@ elif [ "$n_args" -gt 0 ] ; then
      interval=$1
      shift
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report

index 4eb88c9fc83ce7e99e14e8b004f71930b422d89a..16eb8d65c54335e08d1a95e7068e6d0df55be6b5 100644 (file)
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
@@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then
         shift
      fi
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/syscall-counts-report b/tools/perf/scripts/python/bin/syscall-counts-report

index cb2f9c5cf17e825972870c5c934500672e8bd15e..0f0e9d453bb48a606b3c6522a104bd16499fede8 100644 (file)
--- a/tools/perf/scripts/python/bin/syscall-counts-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-report
@@ -7,4 +7,4 @@ if [ $# -gt 0 ] ; then
         shift
      fi
  fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm
diff --git a/tools/perf/scripts/python/check-perf-trace.py b/tools/perf/scripts/python/check-perf-trace.py

index d9f7893e315c0d5aa064df04a8cc3ac870aebdad..4647a7694cf60a77835f3c80aeb79d54578df69d 100644 (file)
--- a/tools/perf/scripts/python/check-perf-trace.py
+++ b/tools/perf/scripts/python/check-perf-trace.py
@@ -1,4 +1,4 @@
-# perf trace event handlers, generated by perf trace -g python
+# perf script event handlers, generated by perf script -g python
  # (c) 2010, Tom Zanussi <tzanussi@gmail.com>
  # Licensed under the terms of the GNU GPL License version 2
  #
diff --git a/tools/perf/scripts/python/failed-syscalls-by-pid.py b/tools/perf/scripts/python/failed-syscalls-by-pid.py

index acd7848717b35ea7c0c46ae61dc01241673f936d..85805fac41167b9e531c09f51ca21595287e08aa 100644 (file)
--- a/tools/perf/scripts/python/failed-syscalls-by-pid.py
+++ b/tools/perf/scripts/python/failed-syscalls-by-pid.py
@@ -15,7 +15,7 @@ from perf_trace_context import *
  from Core import *
  from Util import *
  
-usage = "perf trace -s syscall-counts-by-pid.py [comm|pid]\n";
+usage = "perf script -s syscall-counts-by-pid.py [comm|pid]\n";
  
  for_comm = None
  for_pid = None
diff --git a/tools/perf/scripts/python/sched-migration.py b/tools/perf/scripts/python/sched-migration.py

index b934383c3364e63ed7bd6147bf5509867c65e7e2..74d55ec08aed5ec27867b1d74682a5a0bb320748 100644 (file)
--- a/tools/perf/scripts/python/sched-migration.py
+++ b/tools/perf/scripts/python/sched-migration.py
@@ -4,7 +4,7 @@
  #
  # Copyright (C) 2010 Frederic Weisbecker <fweisbec@gmail.com>
  #
-# perf trace event handlers have been generated by perf trace -g python
+# perf script event handlers have been generated by perf script -g python
  #
  # This software is distributed under the terms of the GNU General
  # Public License ("GPL") version 2 as published by the Free Software
diff --git a/tools/perf/scripts/python/sctop.py b/tools/perf/scripts/python/sctop.py

index 7a6ec2c7d8abe7bf01b660210811829f37eb572d..42c267e292fa36155f5d6b8270fb4b6943d9f6d0 100644 (file)
--- a/tools/perf/scripts/python/sctop.py
+++ b/tools/perf/scripts/python/sctop.py
@@ -17,7 +17,7 @@ from perf_trace_context import *
  from Core import *
  from Util import *
  
-usage = "perf trace -s sctop.py [comm] [interval]\n";
+usage = "perf script -s sctop.py [comm] [interval]\n";
  
  for_comm = None
  default_interval = 3
diff --git a/tools/perf/scripts/python/syscall-counts-by-pid.py b/tools/perf/scripts/python/syscall-counts-by-pid.py

index d1ee3ec10cf2b911776df81df7a5d4d66a5f5fc1..c64d1c55d745b7437e4e26f77446b87567c67ed1 100644 (file)
--- a/tools/perf/scripts/python/syscall-counts-by-pid.py
+++ b/tools/perf/scripts/python/syscall-counts-by-pid.py
@@ -14,7 +14,7 @@ from perf_trace_context import *
  from Core import *
  from Util import syscall_name
  
-usage = "perf trace -s syscall-counts-by-pid.py [comm]\n";
+usage = "perf script -s syscall-counts-by-pid.py [comm]\n";
  
  for_comm = None
  for_pid = None
diff --git a/tools/perf/scripts/python/syscall-counts.py b/tools/perf/scripts/python/syscall-counts.py

index ea183dc82d29e54a005f28648201a2219feac224..b435d3f188e84c421819802cb2efcefd62cff0d2 100644 (file)
--- a/tools/perf/scripts/python/syscall-counts.py
+++ b/tools/perf/scripts/python/syscall-counts.py
@@ -15,7 +15,7 @@ from perf_trace_context import *
  from Core import *
  from Util import syscall_name
  
-usage = "perf trace -s syscall-counts.py [comm]\n";
+usage = "perf script -s syscall-counts.py [comm]\n";
  
  for_comm = None
  
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c

index e437edb72417ba2f12e90b810f8851caef2c24fb..deffb8c960716213124b7fd36f56edf41bf8207d 100644 (file)
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -14,7 +14,9 @@
  #include <linux/kernel.h>
  #include "debug.h"
  
-static int build_id__mark_dso_hit(event_t *event, struct perf_session *session)
+static int build_id__mark_dso_hit(event_t *event,
+                                 struct sample_data *sample __used,
+                                 struct perf_session *session)
  {
         struct addr_location al;
         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
@@ -35,7 +37,8 @@ static int build_id__mark_dso_hit(event_t *event, struct perf_session *session)
         return 0;
  }
  
-static int event__exit_del_thread(event_t *self, struct perf_session *session)
+static int event__exit_del_thread(event_t *self, struct sample_data *sample __used,
+                                 struct perf_session *session)
  {
         struct thread *thread = perf_session__findnew(session, self->fork.tid);
  
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c

index 0f9b8d7a7d7e7d62f38c48ec8846251beb532918..3ccaa10433830503325bb7625527839f4586b93e 100644 (file)
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -4,32 +4,53 @@
  #include <assert.h>
  #include <stdio.h>
  
-int cpumap[MAX_NR_CPUS];
-
-static int default_cpu_map(void)
+static struct cpu_map *cpu_map__default_new(void)
  {
-       int nr_cpus, i;
+       struct cpu_map *cpus;
+       int nr_cpus;
  
         nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-       assert(nr_cpus <= MAX_NR_CPUS);
-       assert((int)nr_cpus >= 0);
+       if (nr_cpus < 0)
+               return NULL;
+
+       cpus = malloc(sizeof(*cpus) + nr_cpus * sizeof(int));
+       if (cpus != NULL) {
+               int i;
+               for (i = 0; i < nr_cpus; ++i)
+                       cpus->map[i] = i;
  
-       for (i = 0; i < nr_cpus; ++i)
-               cpumap[i] = i;
+               cpus->nr = nr_cpus;
+       }
  
-       return nr_cpus;
+       return cpus;
  }
  
-static int read_all_cpu_map(void)
+static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus)
  {
+       size_t payload_size = nr_cpus * sizeof(int);
+       struct cpu_map *cpus = malloc(sizeof(*cpus) + payload_size);
+
+       if (cpus != NULL) {
+               cpus->nr = nr_cpus;
+               memcpy(cpus->map, tmp_cpus, payload_size);
+       }
+
+       return cpus;
+}
+
+static struct cpu_map *cpu_map__read_all_cpu_map(void)
+{
+       struct cpu_map *cpus = NULL;
         FILE *onlnf;
         int nr_cpus = 0;
+       int *tmp_cpus = NULL, *tmp;
+       int max_entries = 0;
         int n, cpu, prev;
         char sep;
  
         onlnf = fopen("/sys/devices/system/cpu/online", "r");
         if (!onlnf)
-               return default_cpu_map();
+               return cpu_map__default_new();
  
         sep = 0;
         prev = -1;
@@ -38,12 +59,28 @@ static int read_all_cpu_map(void)
                 if (n <= 0)
                         break;
                 if (prev >= 0) {
-                       assert(nr_cpus + cpu - prev - 1 < MAX_NR_CPUS);
+                       int new_max = nr_cpus + cpu - prev - 1;
+
+                       if (new_max >= max_entries) {
+                               max_entries = new_max + MAX_NR_CPUS / 2;
+                               tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+                               if (tmp == NULL)
+                                       goto out_free_tmp;
+                               tmp_cpus = tmp;
+                       }
+
                         while (++prev < cpu)
-                               cpumap[nr_cpus++] = prev;
+                               tmp_cpus[nr_cpus++] = prev;
+               }
+               if (nr_cpus == max_entries) {
+                       max_entries += MAX_NR_CPUS;
+                       tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+                       if (tmp == NULL)
+                               goto out_free_tmp;
+                       tmp_cpus = tmp;
                 }
-               assert (nr_cpus < MAX_NR_CPUS);
-               cpumap[nr_cpus++] = cpu;
+
+               tmp_cpus[nr_cpus++] = cpu;
                 if (n == 2 && sep == '-')
                         prev = cpu;
                 else
@@ -51,24 +88,31 @@ static int read_all_cpu_map(void)
                 if (n == 1 || sep == '\n')
                         break;
         }
-       fclose(onlnf);
-       if (nr_cpus > 0)
-               return nr_cpus;
  
-       return default_cpu_map();
+       if (nr_cpus > 0)
+               cpus = cpu_map__trim_new(nr_cpus, tmp_cpus);
+       else
+               cpus = cpu_map__default_new();
+out_free_tmp:
+       free(tmp_cpus);
+       fclose(onlnf);
+       return cpus;
  }
  
-int read_cpu_map(const char *cpu_list)
+struct cpu_map *cpu_map__new(const char *cpu_list)
  {
+       struct cpu_map *cpus = NULL;
         unsigned long start_cpu, end_cpu = 0;
         char *p = NULL;
         int i, nr_cpus = 0;
+       int *tmp_cpus = NULL, *tmp;
+       int max_entries = 0;
  
         if (!cpu_list)
-               return read_all_cpu_map();
+               return cpu_map__read_all_cpu_map();
  
         if (!isdigit(*cpu_list))
-               goto invalid;
+               goto out;
  
         while (isdigit(*cpu_list)) {
                 p = NULL;
@@ -94,21 +138,42 @@ int read_cpu_map(const char *cpu_list)
                 for (; start_cpu <= end_cpu; start_cpu++) {
                         /* check for duplicates */
                         for (i = 0; i < nr_cpus; i++)
-                               if (cpumap[i] == (int)start_cpu)
+                               if (tmp_cpus[i] == (int)start_cpu)
                                         goto invalid;
  
-                       assert(nr_cpus < MAX_NR_CPUS);
-                       cpumap[nr_cpus++] = (int)start_cpu;
+                       if (nr_cpus == max_entries) {
+                               max_entries += MAX_NR_CPUS;
+                               tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+                               if (tmp == NULL)
+                                       goto invalid;
+                               tmp_cpus = tmp;
+                       }
+                       tmp_cpus[nr_cpus++] = (int)start_cpu;
                 }
                 if (*p)
                         ++p;
  
                 cpu_list = p;
         }
-       if (nr_cpus > 0)
-               return nr_cpus;
  
-       return default_cpu_map();
+       if (nr_cpus > 0)
+               cpus = cpu_map__trim_new(nr_cpus, tmp_cpus);
+       else
+               cpus = cpu_map__default_new();
  invalid:
-       return -1;
+       free(tmp_cpus);
+out:
+       return cpus;
+}
+
+struct cpu_map *cpu_map__dummy_new(void)
+{
+       struct cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int));
+
+       if (cpus != NULL) {
+               cpus->nr = 1;
+               cpus->map[0] = -1;
+       }
+
+       return cpus;
  }
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h

index 3e60f56e490eb10f8cf08981e703bf5699d6b20c..f7a4f42f6307fb522299ea48d1126e6d08ffda71 100644 (file)
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -1,7 +1,13 @@
  #ifndef __PERF_CPUMAP_H
  #define __PERF_CPUMAP_H
  
-extern int read_cpu_map(const char *cpu_list);
-extern int cpumap[];
+struct cpu_map {
+       int nr;
+       int map[];
+};
+
+struct cpu_map *cpu_map__new(const char *cpu_list);
+struct cpu_map *cpu_map__dummy_new(void);
+void *cpu_map__delete(struct cpu_map *map);
  
  #endif /* __PERF_CPUMAP_H */
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c

index c8d81b00089d6d9d26ca431aea91453a86ef2351..01bbe8ecec3f7eda9088e9e59ad78ea53b7b5ce6 100644 (file)
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -46,20 +46,16 @@ int dump_printf(const char *fmt, ...)
         return ret;
  }
  
-static int dump_printf_color(const char *fmt, const char *color, ...)
+#ifdef NO_NEWT_SUPPORT
+void ui__warning(const char *format, ...)
  {
         va_list args;
-       int ret = 0;
  
-       if (dump_trace) {
-               va_start(args, color);
-               ret = color_vfprintf(stdout, color, fmt, args);
-               va_end(args);
-       }
-
-       return ret;
+       va_start(args, format);
+       vfprintf(stderr, format, args);
+       va_end(args);
  }
-
+#endif
  
  void trace_event(event_t *event)
  {
@@ -70,29 +66,29 @@ void trace_event(event_t *event)
         if (!dump_trace)
                 return;
  
-       dump_printf(".");
-       dump_printf_color("\n. ... raw event: size %d bytes\n", color,
-                         event->header.size);
+       printf(".");
+       color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+                     event->header.size);
  
         for (i = 0; i < event->header.size; i++) {
                 if ((i & 15) == 0) {
-                       dump_printf(".");
-                       dump_printf_color("  %04x: ", color, i);
+                       printf(".");
+                       color_fprintf(stdout, color, "  %04x: ", i);
                 }
  
-               dump_printf_color(" %02x", color, raw_event[i]);
+               color_fprintf(stdout, color, " %02x", raw_event[i]);
  
                 if (((i & 15) == 15) || i == event->header.size-1) {
-                       dump_printf_color("  ", color);
+                       color_fprintf(stdout, color, "  ");
                         for (j = 0; j < 15-(i & 15); j++)
-                               dump_printf_color("   ", color);
+                               color_fprintf(stdout, color, "   ");
                         for (j = i & ~15; j <= i; j++) {
-                               dump_printf_color("%c", color,
-                                               isprint(raw_event[j]) ?
-                                               raw_event[j] : '.');
+                               color_fprintf(stdout, color, "%c",
+                                             isprint(raw_event[j]) ?
+                                             raw_event[j] : '.');
                         }
-                       dump_printf_color("\n", color);
+                       color_fprintf(stdout, color, "\n");
                 }
         }
-       dump_printf(".\n");
+       printf(".\n");
  }
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h

index 7b514082bbaff4992c31c590cd6d5b3153149f85..ca35fd66b5dfc8c238f5a4be3cb28ce402cf0bf6 100644 (file)
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -35,4 +35,6 @@ int ui_helpline__show_help(const char *format, va_list ap);
  #include "ui/progress.h"
  #endif
  
+void ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
+
  #endif /* __PERF_DEBUG_H */
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c

index dab9e754a28103b1727d6dee29669aaa2d7f89da..2302ec051bb4f1b5171bb543b33284b29bf6f8b6 100644 (file)
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -7,7 +7,7 @@
  #include "strlist.h"
  #include "thread.h"
  
-const char *event__name[] = {
+static const char *event__name[] = {
         [0]                      = "TOTAL",
         [PERF_RECORD_MMAP]       = "MMAP",
         [PERF_RECORD_LOST]       = "LOST",
@@ -22,13 +22,31 @@ const char *event__name[] = {
         [PERF_RECORD_HEADER_EVENT_TYPE]  = "EVENT_TYPE",
         [PERF_RECORD_HEADER_TRACING_DATA]        = "TRACING_DATA",
         [PERF_RECORD_HEADER_BUILD_ID]    = "BUILD_ID",
+       [PERF_RECORD_FINISHED_ROUND]     = "FINISHED_ROUND",
  };
  
-static pid_t event__synthesize_comm(pid_t pid, int full,
+const char *event__get_event_name(unsigned int id)
+{
+       if (id >= ARRAY_SIZE(event__name))
+               return "INVALID";
+       if (!event__name[id])
+               return "UNKNOWN";
+       return event__name[id];
+}
+
+static struct sample_data synth_sample = {
+       .pid       = -1,
+       .tid       = -1,
+       .time      = -1,
+       .stream_id = -1,
+       .cpu       = -1,
+       .period    = 1,
+};
+
+static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full,
                                     event__handler_t process,
                                     struct perf_session *session)
  {
-       event_t ev;
         char filename[PATH_MAX];
         char bf[BUFSIZ];
         FILE *fp;
@@ -49,34 +67,39 @@ out_race:
                 return 0;
         }
  
-       memset(&ev.comm, 0, sizeof(ev.comm));
-       while (!ev.comm.comm[0] || !ev.comm.pid) {
-               if (fgets(bf, sizeof(bf), fp) == NULL)
-                       goto out_failure;
+       memset(&event->comm, 0, sizeof(event->comm));
+
+       while (!event->comm.comm[0] || !event->comm.pid) {
+               if (fgets(bf, sizeof(bf), fp) == NULL) {
+                       pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
+                       goto out;
+               }
  
                 if (memcmp(bf, "Name:", 5) == 0) {
                         char *name = bf + 5;
                         while (*name && isspace(*name))
                                 ++name;
                         size = strlen(name) - 1;
-                       memcpy(ev.comm.comm, name, size++);
+                       memcpy(event->comm.comm, name, size++);
                 } else if (memcmp(bf, "Tgid:", 5) == 0) {
                         char *tgids = bf + 5;
                         while (*tgids && isspace(*tgids))
                                 ++tgids;
-                       tgid = ev.comm.pid = atoi(tgids);
+                       tgid = event->comm.pid = atoi(tgids);
                 }
         }
  
-       ev.comm.header.type = PERF_RECORD_COMM;
+       event->comm.header.type = PERF_RECORD_COMM;
         size = ALIGN(size, sizeof(u64));
-       ev.comm.header.size = sizeof(ev.comm) - (sizeof(ev.comm.comm) - size);
-
+       memset(event->comm.comm + size, 0, session->id_hdr_size);
+       event->comm.header.size = (sizeof(event->comm) -
+                               (sizeof(event->comm.comm) - size) +
+                               session->id_hdr_size);
         if (!full) {
-               ev.comm.tid = pid;
+               event->comm.tid = pid;
  
-               process(&ev, session);
-               goto out_fclose;
+               process(event, &synth_sample, session);
+               goto out;
         }
  
         snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
@@ -91,22 +114,19 @@ out_race:
                 if (*end)
                         continue;
  
-               ev.comm.tid = pid;
+               event->comm.tid = pid;
  
-               process(&ev, session);
+               process(event, &synth_sample, session);
         }
-       closedir(tasks);
  
-out_fclose:
+       closedir(tasks);
+out:
         fclose(fp);
-       return tgid;
  
-out_failure:
-       pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
-       return -1;
+       return tgid;
  }
  
-static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
+static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid,
                                          event__handler_t process,
                                          struct perf_session *session)
  {
@@ -124,29 +144,25 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
                 return -1;
         }
  
+       event->header.type = PERF_RECORD_MMAP;
+       /*
+        * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
+        */
+       event->header.misc = PERF_RECORD_MISC_USER;
+
         while (1) {
                 char bf[BUFSIZ], *pbf = bf;
-               event_t ev = {
-                       .header = {
-                               .type = PERF_RECORD_MMAP,
-                               /*
-                                * Just like the kernel, see __perf_event_mmap
-                                * in kernel/perf_event.c
-                                */
-                               .misc = PERF_RECORD_MISC_USER,
-                        },
-               };
                 int n;
                 size_t size;
                 if (fgets(bf, sizeof(bf), fp) == NULL)
                         break;
  
                 /* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-               n = hex2u64(pbf, &ev.mmap.start);
+               n = hex2u64(pbf, &event->mmap.start);
                 if (n < 0)
                         continue;
                 pbf += n + 1;
-               n = hex2u64(pbf, &ev.mmap.len);
+               n = hex2u64(pbf, &event->mmap.len);
                 if (n < 0)
                         continue;
                 pbf += n + 3;
@@ -161,19 +177,21 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
                                 continue;
  
                         pbf += 3;
-                       n = hex2u64(pbf, &ev.mmap.pgoff);
+                       n = hex2u64(pbf, &event->mmap.pgoff);
  
                         size = strlen(execname);
                         execname[size - 1] = '\0'; /* Remove \n */
-                       memcpy(ev.mmap.filename, execname, size);
+                       memcpy(event->mmap.filename, execname, size);
                         size = ALIGN(size, sizeof(u64));
-                       ev.mmap.len -= ev.mmap.start;
-                       ev.mmap.header.size = (sizeof(ev.mmap) -
-                                              (sizeof(ev.mmap.filename) - size));
-                       ev.mmap.pid = tgid;
-                       ev.mmap.tid = pid;
-
-                       process(&ev, session);
+                       event->mmap.len -= event->mmap.start;
+                       event->mmap.header.size = (sizeof(event->mmap) -
+                                               (sizeof(event->mmap.filename) - size));
+                       memset(event->mmap.filename + size, 0, session->id_hdr_size);
+                       event->mmap.header.size += session->id_hdr_size;
+                       event->mmap.pid = tgid;
+                       event->mmap.tid = pid;
+
+                       process(event, &synth_sample, session);
                 }
         }
  
@@ -187,20 +205,27 @@ int event__synthesize_modules(event__handler_t process,
  {
         struct rb_node *nd;
         struct map_groups *kmaps = &machine->kmaps;
-       u16 misc;
+       event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
+
+       if (event == NULL) {
+               pr_debug("Not enough memory synthesizing mmap event "
+                        "for kernel modules\n");
+               return -1;
+       }
+
+       event->header.type = PERF_RECORD_MMAP;
  
         /*
          * kernel uses 0 for user space maps, see kernel/perf_event.c
          * __perf_event_mmap
          */
         if (machine__is_host(machine))
-               misc = PERF_RECORD_MISC_KERNEL;
+               event->header.misc = PERF_RECORD_MISC_KERNEL;
         else
-               misc = PERF_RECORD_MISC_GUEST_KERNEL;
+               event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
  
         for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]);
              nd; nd = rb_next(nd)) {
-               event_t ev;
                 size_t size;
                 struct map *pos = rb_entry(nd, struct map, rb_node);
  
@@ -208,39 +233,78 @@ int event__synthesize_modules(event__handler_t process,
                         continue;
  
                 size = ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
-               memset(&ev, 0, sizeof(ev));
-               ev.mmap.header.misc = misc;
-               ev.mmap.header.type = PERF_RECORD_MMAP;
-               ev.mmap.header.size = (sizeof(ev.mmap) -
-                                       (sizeof(ev.mmap.filename) - size));
-               ev.mmap.start = pos->start;
-               ev.mmap.len   = pos->end - pos->start;
-               ev.mmap.pid   = machine->pid;
-
-               memcpy(ev.mmap.filename, pos->dso->long_name,
+               event->mmap.header.type = PERF_RECORD_MMAP;
+               event->mmap.header.size = (sizeof(event->mmap) -
+                                       (sizeof(event->mmap.filename) - size));
+               memset(event->mmap.filename + size, 0, session->id_hdr_size);
+               event->mmap.header.size += session->id_hdr_size;
+               event->mmap.start = pos->start;
+               event->mmap.len   = pos->end - pos->start;
+               event->mmap.pid   = machine->pid;
+
+               memcpy(event->mmap.filename, pos->dso->long_name,
                        pos->dso->long_name_len + 1);
-               process(&ev, session);
+               process(event, &synth_sample, session);
         }
  
+       free(event);
         return 0;
  }
  
-int event__synthesize_thread(pid_t pid, event__handler_t process,
-                            struct perf_session *session)
+static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event,
+                                     pid_t pid, event__handler_t process,
+                                     struct perf_session *session)
  {
-       pid_t tgid = event__synthesize_comm(pid, 1, process, session);
+       pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process,
+                                           session);
         if (tgid == -1)
                 return -1;
-       return event__synthesize_mmap_events(pid, tgid, process, session);
+       return event__synthesize_mmap_events(mmap_event, pid, tgid,
+                                            process, session);
+}
+
+int event__synthesize_thread(pid_t pid, event__handler_t process,
+                            struct perf_session *session)
+{
+       event_t *comm_event, *mmap_event;
+       int err = -1;
+
+       comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+       if (comm_event == NULL)
+               goto out;
+
+       mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+       if (mmap_event == NULL)
+               goto out_free_comm;
+
+       err = __event__synthesize_thread(comm_event, mmap_event, pid,
+                                        process, session);
+       free(mmap_event);
+out_free_comm:
+       free(comm_event);
+out:
+       return err;
  }
  
-void event__synthesize_threads(event__handler_t process,
-                              struct perf_session *session)
+int event__synthesize_threads(event__handler_t process,
+                             struct perf_session *session)
  {
         DIR *proc;
         struct dirent dirent, *next;
+       event_t *comm_event, *mmap_event;
+       int err = -1;
+
+       comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+       if (comm_event == NULL)
+               goto out;
+
+       mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+       if (mmap_event == NULL)
+               goto out_free_comm;
  
         proc = opendir("/proc");
+       if (proc == NULL)
+               goto out_free_mmap;
  
         while (!readdir_r(proc, &dirent, &next) && next) {
                 char *end;
@@ -249,10 +313,18 @@ void event__synthesize_threads(event__handler_t process,
                 if (*end) /* only interested in proper numerical dirents */
                         continue;
  
-               event__synthesize_thread(pid, process, session);
+               __event__synthesize_thread(comm_event, mmap_event, pid,
+                                          process, session);
         }
  
         closedir(proc);
+       err = 0;
+out_free_mmap:
+       free(mmap_event);
+out_free_comm:
+       free(comm_event);
+out:
+       return err;
  }
  
  struct process_symbol_args {
@@ -260,7 +332,8 @@ struct process_symbol_args {
         u64        start;
  };
  
-static int find_symbol_cb(void *arg, const char *name, char type, u64 start)
+static int find_symbol_cb(void *arg, const char *name, char type,
+                         u64 start, u64 end __used)
  {
         struct process_symbol_args *args = arg;
  
@@ -286,18 +359,20 @@ int event__synthesize_kernel_mmap(event__handler_t process,
         char path[PATH_MAX];
         char name_buff[PATH_MAX];
         struct map *map;
-
-       event_t ev = {
-               .header = {
-                       .type = PERF_RECORD_MMAP,
-               },
-       };
+       int err;
         /*
          * We should get this from /sys/kernel/sections/.text, but till that is
          * available use this, and after it is use this as a fallback for older
          * kernels.
          */
         struct process_symbol_args args = { .name = symbol_name, };
+       event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
+
+       if (event == NULL) {
+               pr_debug("Not enough memory synthesizing mmap event "
+                        "for kernel modules\n");
+               return -1;
+       }
  
         mmap_name = machine__mmap_name(machine, name_buff, sizeof(name_buff));
         if (machine__is_host(machine)) {
@@ -305,10 +380,10 @@ int event__synthesize_kernel_mmap(event__handler_t process,
                  * kernel uses PERF_RECORD_MISC_USER for user space maps,
                  * see kernel/perf_event.c __perf_event_mmap
                  */
-               ev.header.misc = PERF_RECORD_MISC_KERNEL;
+               event->header.misc = PERF_RECORD_MISC_KERNEL;
                 filename = "/proc/kallsyms";
         } else {
-               ev.header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+               event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
                 if (machine__is_default_guest(machine))
                         filename = (char *) symbol_conf.default_guest_kallsyms;
                 else {
@@ -321,17 +396,21 @@ int event__synthesize_kernel_mmap(event__handler_t process,
                 return -ENOENT;
  
         map = machine->vmlinux_maps[MAP__FUNCTION];
-       size = snprintf(ev.mmap.filename, sizeof(ev.mmap.filename),
+       size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),
                         "%s%s", mmap_name, symbol_name) + 1;
         size = ALIGN(size, sizeof(u64));
-       ev.mmap.header.size = (sizeof(ev.mmap) -
-                       (sizeof(ev.mmap.filename) - size));
-       ev.mmap.pgoff = args.start;
-       ev.mmap.start = map->start;
-       ev.mmap.len   = map->end - ev.mmap.start;
-       ev.mmap.pid   = machine->pid;
-
-       return process(&ev, session);
+       event->mmap.header.type = PERF_RECORD_MMAP;
+       event->mmap.header.size = (sizeof(event->mmap) -
+                       (sizeof(event->mmap.filename) - size) + session->id_hdr_size);
+       event->mmap.pgoff = args.start;
+       event->mmap.start = map->start;
+       event->mmap.len   = map->end - event->mmap.start;
+       event->mmap.pid   = machine->pid;
+
+       err = process(event, &synth_sample, session);
+       free(event);
+
+       return err;
  }
  
  static void thread__comm_adjust(struct thread *self, struct hists *hists)
@@ -361,7 +440,8 @@ static int thread__set_comm_adjust(struct thread *self, const char *comm,
         return 0;
  }
  
-int event__process_comm(event_t *self, struct perf_session *session)
+int event__process_comm(event_t *self, struct sample_data *sample __used,
+                       struct perf_session *session)
  {
         struct thread *thread = perf_session__findnew(session, self->comm.tid);
  
@@ -376,7 +456,8 @@ int event__process_comm(event_t *self, struct perf_session *session)
         return 0;
  }
  
-int event__process_lost(event_t *self, struct perf_session *session)
+int event__process_lost(event_t *self, struct sample_data *sample __used,
+                       struct perf_session *session)
  {
         dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost);
         session->hists.stats.total_lost += self->lost.lost;
@@ -392,7 +473,7 @@ static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
          * a zero sized synthesized MMAP event for the kernel.
          */
         if (maps[MAP__FUNCTION]->end == 0)
-               maps[MAP__FUNCTION]->end = ~0UL;
+               maps[MAP__FUNCTION]->end = ~0ULL;
  }
  
  static int event__process_kernel_mmap(event_t *self,
@@ -485,7 +566,8 @@ out_problem:
         return -1;
  }
  
-int event__process_mmap(event_t *self, struct perf_session *session)
+int event__process_mmap(event_t *self, struct sample_data *sample __used,
+                       struct perf_session *session)
  {
         struct machine *machine;
         struct thread *thread;
@@ -526,7 +608,8 @@ out_problem:
         return 0;
  }
  
-int event__process_task(event_t *self, struct perf_session *session)
+int event__process_task(event_t *self, struct sample_data *sample __used,
+                       struct perf_session *session)
  {
         struct thread *thread = perf_session__findnew(session, self->fork.tid);
         struct thread *parent = perf_session__findnew(session, self->fork.ptid);
@@ -548,18 +631,19 @@ int event__process_task(event_t *self, struct perf_session *session)
         return 0;
  }
  
-int event__process(event_t *event, struct perf_session *session)
+int event__process(event_t *event, struct sample_data *sample,
+                  struct perf_session *session)
  {
         switch (event->header.type) {
         case PERF_RECORD_COMM:
-               event__process_comm(event, session);
+               event__process_comm(event, sample, session);
                 break;
         case PERF_RECORD_MMAP:
-               event__process_mmap(event, session);
+               event__process_mmap(event, sample, session);
                 break;
         case PERF_RECORD_FORK:
         case PERF_RECORD_EXIT:
-               event__process_task(event, session);
+               event__process_task(event, sample, session);
                 break;
         default:
                 break;
@@ -674,32 +758,8 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
                              symbol_filter_t filter)
  {
         u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-       struct thread *thread;
-
-       event__parse_sample(self, session->sample_type, data);
-
-       dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld cpu:%d\n",
-                   self->header.misc, data->pid, data->tid, data->ip,
-                   data->period, data->cpu);
-
-       if (session->sample_type & PERF_SAMPLE_CALLCHAIN) {
-               unsigned int i;
-
-               dump_printf("... chain: nr:%Lu\n", data->callchain->nr);
+       struct thread *thread = perf_session__findnew(session, self->ip.pid);
  
-               if (!ip_callchain__valid(data->callchain, self)) {
-                       pr_debug("call-chain problem with event, "
-                                "skipping it.\n");
-                       goto out_filtered;
-               }
-
-               if (dump_trace) {
-                       for (i = 0; i < data->callchain->nr; i++)
-                               dump_printf("..... %2d: %016Lx\n",
-                                           i, data->callchain->ips[i]);
-               }
-       }
-       thread = perf_session__findnew(session, self->ip.pid);
         if (thread == NULL)
                 return -1;
  
@@ -766,9 +826,65 @@ out_filtered:
         return 0;
  }
  
-int event__parse_sample(const event_t *event, u64 type, struct sample_data *data)
+/*
+ * Fill @sample with the identification fields of a non-SAMPLE record.
+ *
+ * cpu, pid and tid default to -1 and stream_id, id and time to -1ULL; they
+ * keep those values when session->sample_id_all is not set.  Otherwise the
+ * fields selected by session->sample_type are read from the end of the
+ * record, walking backwards one u64 at a time in the order
+ * CPU, STREAM_ID, ID, TIME, TID.  Always returns 0.
+ */
+static int event__parse_id_sample(const event_t *event,
+                                 struct perf_session *session,
+                                 struct sample_data *sample)
  {
-       const u64 *array = event->sample.array;
+       const u64 *array;
+       u64 type;
+
+       sample->cpu = sample->pid = sample->tid = -1;
+       sample->stream_id = sample->id = sample->time = -1ULL;
+
+       if (!session->sample_id_all)
+               return 0;
+
+       /*
+        * Move to the last u64 of the record.
+        * NOTE(review): assumes sample.array starts right after the header.
+        */
+       array = event->sample.array;
+       array += ((event->header.size -
+                  sizeof(event->header)) / sizeof(u64)) - 1;
+       type = session->sample_type;
+
+       if (type & PERF_SAMPLE_CPU) {
+               /* only the first u32 of this slot is used */
+               u32 *p = (u32 *)array;
+               sample->cpu = *p;
+               array--;
+       }
+
+       if (type & PERF_SAMPLE_STREAM_ID) {
+               sample->stream_id = *array;
+               array--;
+       }
+
+       if (type & PERF_SAMPLE_ID) {
+               sample->id = *array;
+               array--;
+       }
+
+       if (type & PERF_SAMPLE_TIME) {
+               sample->time = *array;
+               array--;
+       }
+
+       if (type & PERF_SAMPLE_TID) {
+               /* one u64 slot holding two u32 values: pid, then tid */
+               u32 *p = (u32 *)array;
+               sample->pid = p[0];
+               sample->tid = p[1];
+       }
+
+       return 0;
+}
+
+int event__parse_sample(const event_t *event, struct perf_session *session,
+                       struct sample_data *data)
+{
+       const u64 *array;
+       u64 type;
+
+       if (event->header.type != PERF_RECORD_SAMPLE)
+               return event__parse_id_sample(event, session, data);
+
+       array = event->sample.array;
+       type = session->sample_type;
  
         if (type & PERF_SAMPLE_IP) {
                 data->ip = event->ip.ip;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h

index 8e790dae702625aa564594bee1dd52618b94f8d8..2b7e91902f105d5962a68254737ce21cfeef935c 100644 (file)
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -85,6 +85,7 @@ struct build_id_event {
  };
  
  enum perf_user_event_type { /* above any possible kernel type */
+       PERF_RECORD_USER_TYPE_START             = 64,
         PERF_RECORD_HEADER_ATTR                 = 64,
         PERF_RECORD_HEADER_EVENT_TYPE           = 65,
         PERF_RECORD_HEADER_TRACING_DATA         = 66,
@@ -135,12 +136,15 @@ void event__print_totals(void);
  
  struct perf_session;
  
-typedef int (*event__handler_t)(event_t *event, struct perf_session *session);
+typedef int (*event__handler_synth_t)(event_t *event, 
+                                     struct perf_session *session);
+typedef int (*event__handler_t)(event_t *event, struct sample_data *sample,
+                               struct perf_session *session);
  
  int event__synthesize_thread(pid_t pid, event__handler_t process,
                              struct perf_session *session);
-void event__synthesize_threads(event__handler_t process,
-                              struct perf_session *session);
+int event__synthesize_threads(event__handler_t process,
+                             struct perf_session *session);
  int event__synthesize_kernel_mmap(event__handler_t process,
                                 struct perf_session *session,
                                 struct machine *machine,
@@ -150,18 +154,24 @@ int event__synthesize_modules(event__handler_t process,
                               struct perf_session *session,
                               struct machine *machine);
  
-int event__process_comm(event_t *self, struct perf_session *session);
-int event__process_lost(event_t *self, struct perf_session *session);
-int event__process_mmap(event_t *self, struct perf_session *session);
-int event__process_task(event_t *self, struct perf_session *session);
-int event__process(event_t *event, struct perf_session *session);
+int event__process_comm(event_t *self, struct sample_data *sample,
+                       struct perf_session *session);
+int event__process_lost(event_t *self, struct sample_data *sample,
+                       struct perf_session *session);
+int event__process_mmap(event_t *self, struct sample_data *sample,
+                       struct perf_session *session);
+int event__process_task(event_t *self, struct sample_data *sample,
+                       struct perf_session *session);
+int event__process(event_t *event, struct sample_data *sample,
+                  struct perf_session *session);
  
  struct addr_location;
  int event__preprocess_sample(const event_t *self, struct perf_session *session,
                              struct addr_location *al, struct sample_data *data,
                              symbol_filter_t filter);
-int event__parse_sample(const event_t *event, u64 type, struct sample_data *data);
+int event__parse_sample(const event_t *event, struct perf_session *session,
+                       struct sample_data *sample);
  
-extern const char *event__name[];
+const char *event__get_event_name(unsigned int id);
  
  #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c

new file mode 100644 (file)

index 0000000..c95267e
--- /dev/null
+++ b/tools/perf/util/evsel.c
@@ -0,0 +1,186 @@
+#include "evsel.h"
+#include "../perf.h"
+#include "util.h"
+#include "cpumap.h"
+#include "thread.h"
+
+/*
+ * File descriptor slot of evsel @e at cpu index @x and thread index @y.
+ * @e is parenthesized so any pointer expression can be passed safely.
+ */
+#define FD(e, x, y) (*(int *)xyarray__entry((e)->fd, x, y))
+
+/*
+ * Allocate an event selector with zalloc() and record its attr type,
+ * attr config and index; the list node is initialised empty.
+ * Returns NULL when the allocation fails.
+ */
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx)
+{
+       struct perf_evsel *sel = zalloc(sizeof(*sel));
+
+       if (sel == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&sel->node);
+       sel->attr.config = config;
+       sel->attr.type   = type;
+       sel->idx         = idx;
+
+       return sel;
+}
+
+/*
+ * Create the ncpus x nthreads xyarray of ints that holds the event file
+ * descriptors.  Returns 0 on success, -ENOMEM if the array is not created.
+ */
+int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+       evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
+       if (evsel->fd == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Allocate evsel->counts with room for @ncpus per-cpu entries after the
+ * fixed part.  Returns 0 on success, -ENOMEM on allocation failure.
+ */
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
+{
+       size_t size = sizeof(*evsel->counts) +
+                     (ncpus * sizeof(struct perf_counts_values));
+
+       evsel->counts = zalloc(size);
+       if (evsel->counts == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Release the fd array and clear the pointer so a later
+ * perf_evsel__alloc_fd() or perf_evsel__delete() sees no stale array.
+ * The descriptors themselves are not closed here.
+ */
+void perf_evsel__free_fd(struct perf_evsel *evsel)
+{
+       xyarray__delete(evsel->fd);
+       evsel->fd = NULL;
+}
+
+/*
+ * Close every descriptor in the first @ncpus x @nthreads slots of the fd
+ * array and mark each slot as unused (-1).
+ */
+void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+       int i, j;
+
+       for (i = 0; i < ncpus; i++) {
+               for (j = 0; j < nthreads; j++) {
+                       int *slot = &FD(evsel, i, j);
+
+                       close(*slot);
+                       *slot = -1;
+               }
+       }
+}
+
+/*
+ * Free an event selector that is no longer linked on any list (asserted).
+ * Only the fd array and the evsel itself are released.
+ * NOTE(review): counts, filter and priv are not freed here - confirm that
+ * their owners release them before calling this.
+ */
+void perf_evsel__delete(struct perf_evsel *evsel)
+{
+       assert(list_empty(&evsel->node));
+       xyarray__delete(evsel->fd);
+       free(evsel);
+}
+
+/*
+ * Read the counter of one (cpu, thread) descriptor into
+ * evsel->counts->cpu[cpu].
+ *
+ * With @scale the descriptor is read as three u64 (val, ena, run) and val is
+ * zeroed when run is 0 or multiplied by ena/run (rounded) when run < ena;
+ * without it only val is read and ena/run are stored as 0.
+ *
+ * Returns 0 on success, -EINVAL if the descriptor is not open, -ENOMEM if
+ * counts cannot be allocated, or -errno if the read fails.
+ */
+int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+                             int cpu, int thread, bool scale)
+{
+       struct perf_counts_values count;
+       size_t nv = scale ? 3 : 1;
+
+       if (FD(evsel, cpu, thread) < 0)
+               return -EINVAL;
+
+       /*
+        * NOTE(review): the lazy allocation is sized by the first cpu seen
+        * (cpu + 1 entries); a later call with a larger cpu index would store
+        * past the end of counts->cpu[] - confirm callers preallocate or
+        * always start with the highest cpu.
+        */
+       if (evsel->counts == NULL && perf_evsel__alloc_counts(evsel, cpu + 1) < 0)
+               return -ENOMEM;
+
+       if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) < 0)
+               return -errno;
+
+       if (scale) {
+               if (count.run == 0)
+                       count.val = 0;
+               else if (count.run < count.ena)
+                       count.val = (u64)((double)count.val * count.ena / count.run + 0.5);
+       } else
+               count.ena = count.run = 0;
+
+       evsel->counts->cpu[cpu] = count;
+       return 0;
+}
+
+/*
+ * Sum the counters of every open (cpu, thread) descriptor into
+ * evsel->counts->aggr.
+ *
+ * With @scale each descriptor is read as three u64 (val, ena, run) and all
+ * three are summed; counts->scaled is then set to -1 (and val to 0) when the
+ * summed run is 0, to 1 when val was multiplied by ena/run because
+ * run < ena, and to 0 otherwise.  Without @scale only val is read and
+ * ena/run are left at 0.
+ *
+ * evsel->counts must already be allocated.  Returns 0 on success or -errno
+ * if a read fails.
+ */
+int __perf_evsel__read(struct perf_evsel *evsel,
+                      int ncpus, int nthreads, bool scale)
+{
+       size_t nv = scale ? 3 : 1;
+       int cpu, thread;
+       struct perf_counts_values *aggr = &evsel->counts->aggr, count;
+
+       /*
+        * Start every read from zero: ena and run are accumulated below as
+        * well, so leaving them untouched would add this read on top of the
+        * totals of a previous call and skew the scaling.
+        */
+       aggr->val = aggr->ena = aggr->run = 0;
+
+       for (cpu = 0; cpu < ncpus; cpu++) {
+               for (thread = 0; thread < nthreads; thread++) {
+                       /* slots without an open descriptor are skipped */
+                       if (FD(evsel, cpu, thread) < 0)
+                               continue;
+
+                       if (readn(FD(evsel, cpu, thread),
+                                 &count, nv * sizeof(u64)) < 0)
+                               return -errno;
+
+                       aggr->val += count.val;
+                       if (scale) {
+                               aggr->ena += count.ena;
+                               aggr->run += count.run;
+                       }
+               }
+       }
+
+       evsel->counts->scaled = 0;
+       if (scale) {
+               if (aggr->run == 0) {
+                       evsel->counts->scaled = -1;
+                       aggr->val = 0;
+                       return 0;
+               }
+
+               if (aggr->run < aggr->ena) {
+                       evsel->counts->scaled = 1;
+                       aggr->val = (u64)((double)aggr->val * aggr->ena / aggr->run + 0.5);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Open the event once per entry of cpus->map (pid -1, thread slot 0),
+ * allocating the fd array first if it does not exist yet.
+ * Returns 0 on success; on failure the descriptors opened so far are closed
+ * again and -1 is returned.
+ */
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+{
+       int cpu = 0;
+
+       if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, 1) < 0)
+               return -1;
+
+       while (cpu < cpus->nr) {
+               FD(evsel, cpu, 0) = sys_perf_event_open(&evsel->attr, -1,
+                                                       cpus->map[cpu], -1, 0);
+               if (FD(evsel, cpu, 0) < 0)
+                       break;
+               cpu++;
+       }
+
+       if (cpu >= cpus->nr)
+               return 0;
+
+       /* undo the opens that succeeded before the failing one */
+       for (cpu--; cpu >= 0; cpu--) {
+               close(FD(evsel, cpu, 0));
+               FD(evsel, cpu, 0) = -1;
+       }
+       return -1;
+}
+
+/*
+ * Open the event once per entry of threads->map (cpu -1, cpu slot 0),
+ * allocating the fd array first if it does not exist yet.
+ * Returns 0 on success; on failure the descriptors opened so far are closed
+ * again and -1 is returned.
+ */
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+{
+       int thread = 0;
+
+       if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, 1, threads->nr) != 0)
+               return -1;
+
+       while (thread < threads->nr) {
+               FD(evsel, 0, thread) = sys_perf_event_open(&evsel->attr,
+                                                          threads->map[thread], -1, -1, 0);
+               if (FD(evsel, 0, thread) < 0)
+                       break;
+               thread++;
+       }
+
+       if (thread >= threads->nr)
+               return 0;
+
+       /* undo the opens that succeeded before the failing one */
+       for (thread--; thread >= 0; thread--) {
+               close(FD(evsel, 0, thread));
+               FD(evsel, 0, thread) = -1;
+       }
+       return -1;
+}
+
+/*
+ * Open the event per thread when a thread map is given, otherwise per cpu.
+ * Returns whatever the chosen helper returns.
+ */
+int perf_evsel__open(struct perf_evsel *evsel,
+                    struct cpu_map *cpus, struct thread_map *threads)
+{
+       return threads != NULL ? perf_evsel__open_per_thread(evsel, threads) :
+                                perf_evsel__open_per_cpu(evsel, cpus);
+}
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h

new file mode 100644 (file)

index 0000000..a0ccd69
--- /dev/null
+++ b/tools/perf/util/evsel.h
@@ -0,0 +1,115 @@
+#ifndef __PERF_EVSEL_H
+#define __PERF_EVSEL_H 1
+
+#include <linux/list.h>
+#include <stdbool.h>
+#include "../../../include/linux/perf_event.h"
+#include "types.h"
+#include "xyarray.h"
+ 
+/*
+ * One counter reading.  The three u64 can be addressed by name or as
+ * values[0..2]; readers fill val only, or val, ena and run when scaling.
+ * NOTE(review): ena/run presumably mean time enabled / time running - the
+ * code only shows val being scaled by ena / run.
+ */
+struct perf_counts_values {
+       union {
+               struct {
+                       u64 val;
+                       u64 ena;
+                       u64 run;
+               };
+               u64 values[3];
+       };
+};
+
+/*
+ * Counter results of one event selector.
+ * scaled: set by __perf_evsel__read() to 0 (not scaled), 1 (aggr.val was
+ *         scaled by ena/run) or -1 (run was 0, aggr.val forced to 0).
+ * aggr:   sum over all cpus and threads.
+ * cpu[]:  per-cpu readings; sized by perf_evsel__alloc_counts().
+ */
+struct perf_counts {
+       s8                        scaled;
+       struct perf_counts_values aggr;
+       struct perf_counts_values cpu[];
+};
+
+/*
+ * Event selector: one perf_event_attr plus the state attached to it.
+ * node:   list linkage; must be empty when perf_evsel__delete() is called.
+ * attr:   attribute passed to sys_perf_event_open().
+ * filter: NOTE(review): presumably an event filter string - not used by
+ *         the evsel helpers themselves.
+ * fd:     xyarray of int file descriptors indexed by (cpu, thread).
+ * counts: results storage, see struct perf_counts.
+ * idx:    index given to perf_evsel__new().
+ * priv:   opaque pointer, not touched by the evsel helpers.
+ */
+struct perf_evsel {
+       struct list_head        node;
+       struct perf_event_attr  attr;
+       char                    *filter;
+       struct xyarray          *fd;
+       struct perf_counts      *counts;
+       int                     idx;
+       void                    *priv;
+};
+
+struct cpu_map;
+struct thread_map;
+
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx);
+void perf_evsel__delete(struct perf_evsel *evsel);
+
+int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
+void perf_evsel__free_fd(struct perf_evsel *evsel);
+void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus);
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads);
+int perf_evsel__open(struct perf_evsel *evsel, 
+                    struct cpu_map *cpus, struct thread_map *threads);
+
+/*
+ * True when @evsel has attr.type PERF_TYPE_<t> and attr.config
+ * PERF_COUNT_<c>.  @evsel is parenthesized so any pointer expression can be
+ * passed; note that it is evaluated twice.
+ */
+#define perf_evsel__match(evsel, t, c)         \
+       ((evsel)->attr.type == PERF_TYPE_##t && \
+        (evsel)->attr.config == PERF_COUNT_##c)
+
+int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+                             int cpu, int thread, bool scale);
+
+/**
+ * perf_evsel__read_on_cpu - Read out the results on a CPU and thread
+ *
+ * @evsel - event selector to read value
+ * @cpu - CPU of interest
+ * @thread - thread of interest
+ */
+static inline int perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+                                         int cpu, int thread)
+{
+       return __perf_evsel__read_on_cpu(evsel, cpu, thread, false);
+}
+
+/**
+ * perf_evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled
+ *
+ * @evsel - event selector to read value
+ * @cpu - CPU of interest
+ * @thread - thread of interest
+ */
+static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
+                                                int cpu, int thread)
+{
+       return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
+}
+
+int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
+                      bool scale);
+
+/**
+ * perf_evsel__read - Read the aggregate results on all CPUs
+ *
+ * @evsel - event selector to read value
+ * @ncpus - Number of cpus affected, from zero
+ * @nthreads - Number of threads affected, from zero
+ */
+static inline int perf_evsel__read(struct perf_evsel *evsel,
+                                   int ncpus, int nthreads)
+{
+       return __perf_evsel__read(evsel, ncpus, nthreads, false);
+}
+
+/**
+ * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
+ *
+ * @evsel - event selector to read value
+ * @ncpus - Number of cpus affected, from zero
+ * @nthreads - Number of threads affected, from zero
+ */
+static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
+                                         int ncpus, int nthreads)
+{
+       return __perf_evsel__read(evsel, ncpus, nthreads, true);
+}
+
+#endif /* __PERF_EVSEL_H */
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c

index 7cba0551a56550888c98ff1a933fd6bec8ab9ca0..989fa2dee2fd2ae441d98921b9e69a56c4fe50e2 100644 (file)
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -152,6 +152,11 @@ void perf_header__set_feat(struct perf_header *self, int feat)
         set_bit(feat, self->adds_features);
  }
  
+/* Clear feature bit @feat in the header's adds_features bitmap. */
+void perf_header__clear_feat(struct perf_header *self, int feat)
+{
+       clear_bit(feat, self->adds_features);
+}
+
  bool perf_header__has_feat(const struct perf_header *self, int feat)
  {
         return test_bit(feat, self->adds_features);
@@ -433,8 +438,10 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
         int idx = 0, err;
  
         session = container_of(self, struct perf_session, header);
-       if (perf_session__read_build_ids(session, true))
-               perf_header__set_feat(self, HEADER_BUILD_ID);
+
+       /*
+        * Drop the build-id feature only when it is set and reading the
+        * build-ids from the session reports none.  The feature test has to
+        * be closed before the '&&': otherwise the whole boolean expression
+        * is passed as the feature number and the wrong bit is tested.
+        */
+       if (perf_header__has_feat(self, HEADER_BUILD_ID) &&
+           !perf_session__read_build_ids(session, true))
+               perf_header__clear_feat(self, HEADER_BUILD_ID);
  
         nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
         if (!nr_sections)
@@ -456,7 +463,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
  
                 /* Write trace info */
                 trace_sec->offset = lseek(fd, 0, SEEK_CUR);
-               read_tracing_data(fd, attrs, nr_counters);
+               read_tracing_data(fd, &evsel_list);
                 trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
         }
  
@@ -599,7 +606,7 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
  static int perf_header__getbuffer64(struct perf_header *self,
                                     int fd, void *buf, size_t size)
  {
-       if (do_read(fd, buf, size) <= 0)
+       if (readn(fd, buf, size) <= 0)
                 return -1;
  
         if (self->needs_swap)
@@ -655,7 +662,7 @@ int perf_file_header__read(struct perf_file_header *self,
  {
         lseek(fd, 0, SEEK_SET);
  
-       if (do_read(fd, self, sizeof(*self)) <= 0 ||
+       if (readn(fd, self, sizeof(*self)) <= 0 ||
             memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
                 return -1;
  
@@ -816,7 +823,7 @@ static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
                                        struct perf_header *ph, int fd,
                                        bool repipe)
  {
-       if (do_read(fd, self, sizeof(*self)) <= 0 ||
+       if (readn(fd, self, sizeof(*self)) <= 0 ||
             memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
                 return -1;
  
@@ -941,6 +948,24 @@ u64 perf_header__sample_type(struct perf_header *header)
         return type;
  }
  
+/*
+ * Return the sample_id_all setting shared by every attr in @header, or
+ * false when the header has no attrs.  Calls die() if the attrs disagree.
+ */
+bool perf_header__sample_id_all(const struct perf_header *header)
+{
+       bool value = false;
+       int i;
+
+       for (i = 0; i < header->attrs; i++) {
+               bool cur = header->attr[i]->attr.sample_id_all;
+
+               if (i == 0)
+                       value = cur;
+               else if (cur != value)
+                       die("non matching sample_id_all");
+       }
+
+       return value;
+}
+
  struct perf_event_attr *
  perf_header__find_attr(u64 id, struct perf_header *header)
  {
@@ -987,21 +1012,23 @@ int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
  
         ev = malloc(size);
  
+       if (ev == NULL)
+               return -ENOMEM;
+
         ev->attr.attr = *attr;
         memcpy(ev->attr.id, id, ids * sizeof(u64));
  
         ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
         ev->attr.header.size = size;
  
-       err = process(ev, session);
+       err = process(ev, NULL, session);
  
         free(ev);
  
         return err;
  }
  
-int event__synthesize_attrs(struct perf_header *self,
-                           event__handler_t process,
+int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
                             struct perf_session *session)
  {
         struct perf_header_attr *attr;
@@ -1071,7 +1098,7 @@ int event__synthesize_event_type(u64 event_id, char *name,
         ev.event_type.header.size = sizeof(ev.event_type) -
                 (sizeof(ev.event_type.event_type.name) - size);
  
-       err = process(&ev, session);
+       err = process(&ev, NULL, session);
  
         return err;
  }
@@ -1106,8 +1133,7 @@ int event__process_event_type(event_t *self,
         return 0;
  }
  
-int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
-                                  int nb_events,
+int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
                                    event__handler_t process,
                                    struct perf_session *session __unused)
  {
@@ -1118,7 +1144,7 @@ int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
         memset(&ev, 0, sizeof(ev));
  
         ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
-       size = read_tracing_data_size(fd, pattrs, nb_events);
+       size = read_tracing_data_size(fd, pattrs);
         if (size <= 0)
                 return size;
         aligned_size = ALIGN(size, sizeof(u64));
@@ -1126,9 +1152,9 @@ int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
         ev.tracing_data.header.size = sizeof(ev.tracing_data);
         ev.tracing_data.size = aligned_size;
  
-       process(&ev, session);
+       process(&ev, NULL, session);
  
-       err = read_tracing_data(fd, pattrs, nb_events);
+       err = read_tracing_data(fd, pattrs);
         write_padded(fd, NULL, 0, padding);
  
         return aligned_size;
@@ -1186,7 +1212,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
         ev.build_id.header.size = sizeof(ev.build_id) + len;
         memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
  
-       err = process(&ev, session);
+       err = process(&ev, NULL, session);
  
         return err;
  }
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h

index 402ac2454cf8bcc664c3daf193119f470c77d7c3..33f16be7b72fdad5a3757acbfbceb52b3966c593 100644 (file)
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -81,9 +81,11 @@ void perf_header_attr__delete(struct perf_header_attr *self);
  int perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
  
  u64 perf_header__sample_type(struct perf_header *header);
+bool perf_header__sample_id_all(const struct perf_header *header);
  struct perf_event_attr *
  perf_header__find_attr(u64 id, struct perf_header *header);
  void perf_header__set_feat(struct perf_header *self, int feat);
+void perf_header__clear_feat(struct perf_header *self, int feat);
  bool perf_header__has_feat(const struct perf_header *self, int feat);
  
  int perf_header__process_sections(struct perf_header *self, int fd,
@@ -111,8 +113,7 @@ int event__synthesize_event_types(event__handler_t process,
  int event__process_event_type(event_t *self,
                               struct perf_session *session);
  
-int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
-                                  int nb_events,
+int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
                                    event__handler_t process,
                                    struct perf_session *session);
  int event__process_tracing_data(event_t *self,
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c

index 2022e87409942ca4b0d133c3f889e41178a663d1..c749ba6136a0ac33d7cdae8bf5b6604f1ee524af 100644 (file)
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -356,7 +356,7 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
  
  static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
                                      int depth, int depth_mask, int period,
-                                    u64 total_samples, int hits,
+                                    u64 total_samples, u64 hits,
                                      int left_margin)
  {
         int i;
@@ -1092,6 +1092,12 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head,
         FILE *file;
         int err = 0;
         u64 len;
+       char symfs_filename[PATH_MAX];
+
+       if (filename) {
+               snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+                        symbol_conf.symfs, filename);
+       }
  
         if (filename == NULL) {
                 if (dso->has_build_id) {
@@ -1100,9 +1106,9 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head,
                         return -ENOMEM;
                 }
                 goto fallback;
-       } else if (readlink(filename, command, sizeof(command)) < 0 ||
+       } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
                    strstr(command, "[kernel.kallsyms]") ||
-                  access(filename, R_OK)) {
+                  access(symfs_filename, R_OK)) {
                 free(filename);
  fallback:
                 /*
@@ -1111,6 +1117,8 @@ fallback:
                  * DSO is the same as when 'perf record' ran.
                  */
                 filename = dso->long_name;
+               snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+                        symbol_conf.symfs, filename);
                 free_filename = false;
         }
  
@@ -1137,7 +1145,7 @@ fallback:
                  "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS -C %s|grep -v %s|expand",
                  map__rip_2objdump(map, sym->start),
                  map__rip_2objdump(map, sym->end),
-                filename, filename);
+                symfs_filename, filename);
  
         pr_debug("Executing: %s\n", command);
  
@@ -1168,10 +1176,13 @@ size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
         size_t ret = 0;
  
         for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
-               if (!event__name[i])
+               const char *name = event__get_event_name(i);
+
+               if (!strcmp(name, "UNKNOWN"))
                         continue;
-               ret += fprintf(fp, "%10s events: %10d\n",
-                              event__name[i], self->stats.nr_events[i]);
+
+               ret += fprintf(fp, "%16s events: %10d\n", name,
+                              self->stats.nr_events[i]);
         }
  
         return ret;
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h

index 587d375d34300daa09948c1a0d95407b4f8439fa..ee789856a8c94644e189f0dc8a7be7933469a6cb 100644 (file)
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -52,8 +52,10 @@ struct sym_priv {
  struct events_stats {
         u64 total_period;
         u64 total_lost;
+       u64 total_invalid_chains;
         u32 nr_events[PERF_RECORD_HEADER_MAX];
         u32 nr_unknown_events;
+       u32 nr_invalid_chains;
  };
  
  enum hist_column {
diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h

new file mode 100644 (file)

index 0000000..acffd5e
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h

new file mode 100644 (file)

index 0000000..bb4198e
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h

index bb4ac2e053859482f98933b278a8d0adda71aa5a..8be0b968ca0bcfa44c95248e14e3878481887b21 100644 (file)
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -13,6 +13,11 @@ static inline void set_bit(int nr, unsigned long *addr)
         addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
  }
  
+/* Clear bit @nr in the bitmap at @addr (counterpart of set_bit()). */
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+       unsigned long mask = 1UL << (nr % BITS_PER_LONG);
+
+       addr[nr / BITS_PER_LONG] &= ~mask;
+}
+
  static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
  {
         return ((1UL << (nr % BITS_PER_LONG)) &
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h

new file mode 100644 (file)

index 0000000..06387cf
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name)                            \
+       .globl name;                            \
+       name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c

index 4af5bd59cfd14b475d0f2fa60e15f1b4b4e908de..649083f27e08bfbfa952b10bb68fc8639f4d1195 100644 (file)
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1,6 +1,7 @@
  #include "../../../include/linux/hw_breakpoint.h"
  #include "util.h"
  #include "../perf.h"
+#include "evsel.h"
  #include "parse-options.h"
  #include "parse-events.h"
  #include "exec_cmd.h"
@@ -12,8 +13,7 @@
  
  int                            nr_counters;
  
-struct perf_event_attr         attrs[MAX_COUNTERS];
-char                           *filters[MAX_COUNTERS];
+LIST_HEAD(evsel_list);
  
  struct event_symbol {
         u8              type;
@@ -266,10 +266,10 @@ static char *event_cache_name(u8 cache_type, u8 cache_op, u8 cache_result)
         return name;
  }
  
-const char *event_name(int counter)
+const char *event_name(struct perf_evsel *evsel)
  {
-       u64 config = attrs[counter].config;
-       int type = attrs[counter].type;
+       u64 config = evsel->attr.config;
+       int type = evsel->attr.type;
  
         return __event_name(type, config);
  }
@@ -434,7 +434,7 @@ parse_single_tracepoint_event(char *sys_name,
         id = atoll(id_buf);
         attr->config = id;
         attr->type = PERF_TYPE_TRACEPOINT;
-       *strp = evt_name + evt_length;
+       *strp += strlen(sys_name) + evt_length + 1; /* + 1 for the ':' */
  
         attr->sample_type |= PERF_SAMPLE_RAW;
         attr->sample_type |= PERF_SAMPLE_TIME;
@@ -495,7 +495,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
                                     struct perf_event_attr *attr)
  {
         const char *evt_name;
-       char *flags;
+       char *flags = NULL, *comma_loc;
         char sys_name[MAX_EVENT_LENGTH];
         unsigned int sys_length, evt_length;
  
@@ -514,6 +514,11 @@ static enum event_result parse_tracepoint_event(const char **strp,
         sys_name[sys_length] = '\0';
         evt_name = evt_name + 1;
  
+       comma_loc = strchr(evt_name, ',');
+       if (comma_loc) {
+               /* take the event name up to the comma */
+               evt_name = strndup(evt_name, comma_loc - evt_name);
+       }
         flags = strchr(evt_name, ':');
         if (flags) {
                 /* split it out: */
@@ -524,9 +529,8 @@ static enum event_result parse_tracepoint_event(const char **strp,
         evt_length = strlen(evt_name);
         if (evt_length >= MAX_EVENT_LENGTH)
                 return EVT_FAILED;
-
         if (strpbrk(evt_name, "*?")) {
-               *strp = evt_name + evt_length;
+               *strp += strlen(sys_name) + evt_length;
                 return parse_multiple_tracepoint_event(sys_name, evt_name,
                                                        flags);
         } else
@@ -810,9 +814,6 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
                         return -1;
  
         for (;;) {
-               if (nr_counters == MAX_COUNTERS)
-                       return -1;
-
                 memset(&attr, 0, sizeof(attr));
                 ret = parse_event_symbols(&str, &attr);
                 if (ret == EVT_FAILED)
@@ -822,8 +823,13 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
                         return -1;
  
                 if (ret != EVT_HANDLED_ALL) {
-                       attrs[nr_counters] = attr;
-                       nr_counters++;
+                       struct perf_evsel *evsel;
+                       evsel = perf_evsel__new(attr.type, attr.config,
+                                               nr_counters);
+                       if (evsel == NULL)
+                               return -1;
+                       list_add_tail(&evsel->node, &evsel_list);
+                       ++nr_counters;
                 }
  
                 if (*str == 0)
@@ -840,21 +846,22 @@ int parse_events(const struct option *opt __used, const char *str, int unset __u
  int parse_filter(const struct option *opt __used, const char *str,
                  int unset __used)
  {
-       int i = nr_counters - 1;
-       int len = strlen(str);
+       struct perf_evsel *last = NULL;
  
-       if (i < 0 || attrs[i].type != PERF_TYPE_TRACEPOINT) {
+       if (!list_empty(&evsel_list))
+               last = list_entry(evsel_list.prev, struct perf_evsel, node);
+
+       if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
                 fprintf(stderr,
                         "-F option should follow a -e tracepoint option\n");
                 return -1;
         }
  
-       filters[i] = malloc(len + 1);
-       if (!filters[i]) {
+       last->filter = strdup(str);
+       if (last->filter == NULL) {
                 fprintf(stderr, "not enough memory to hold filter string\n");
                 return -1;
         }
-       strcpy(filters[i], str);
  
         return 0;
  }
@@ -905,6 +912,47 @@ static void print_tracepoint_events(void)
         closedir(sys_dir);
  }
  
+/*
+ * Check whether event is in <debugfs_mount_point>/tracing/events
+ */
+
+int is_valid_tracepoint(const char *event_string)
+{
+       DIR *sys_dir, *evt_dir;
+       struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+       char evt_path[MAXPATHLEN];
+       char dir_path[MAXPATHLEN];
+
+       if (debugfs_valid_mountpoint(debugfs_path))
+               return 0;
+
+       sys_dir = opendir(debugfs_path);
+       if (!sys_dir)
+               return 0;
+
+       for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+
+               snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+                        sys_dirent.d_name);
+               evt_dir = opendir(dir_path);
+               if (!evt_dir)
+                       continue;
+
+               for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+                       snprintf(evt_path, MAXPATHLEN, "%s:%s",
+                                sys_dirent.d_name, evt_dirent.d_name);
+                       if (!strcmp(evt_path, event_string)) {
+                               closedir(evt_dir);
+                               closedir(sys_dir);
+                               return 1;
+                       }
+               }
+               closedir(evt_dir);
+       }
+       closedir(sys_dir);
+       return 0;
+}
+
  /*
   * Print the help text for the event symbols:
   */
@@ -963,3 +1011,26 @@ void print_events(void)
  
         exit(129);
  }
+
+int perf_evsel_list__create_default(void)
+{
+       struct perf_evsel *evsel = perf_evsel__new(PERF_TYPE_HARDWARE,
+                                                  PERF_COUNT_HW_CPU_CYCLES, 0);
+       if (evsel == NULL)
+               return -ENOMEM;
+
+       list_add(&evsel->node, &evsel_list);
+       ++nr_counters;
+       return 0;
+}
+
+void perf_evsel_list__delete(void)
+{
+       struct perf_evsel *pos, *n;
+
+       list_for_each_entry_safe(pos, n, &evsel_list, node) {
+               list_del_init(&pos->node);
+               perf_evsel__delete(pos);
+       }
+       nr_counters = 0;
+}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h

index fc4ab3fe877a22d191450a6bc4c75118cf71901d..b82cafb8377202462d16ef7ed36b8e546201c76b 100644 (file)
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -4,6 +4,16 @@
   * Parse symbolic events/counts passed in as options:
   */
  
+#include "../../../include/linux/perf_event.h"
+
+struct list_head;
+struct perf_evsel;
+
+extern struct list_head evsel_list;
+
+int perf_evsel_list__create_default(void);
+void perf_evsel_list__delete(void);
+
  struct option;
  
  struct tracepoint_path {
@@ -13,14 +23,11 @@ struct tracepoint_path {
  };
  
  extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
-extern bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events);
+extern bool have_tracepoints(struct list_head *evsel_list);
  
  extern int                     nr_counters;
  
-extern struct perf_event_attr attrs[MAX_COUNTERS];
-extern char *filters[MAX_COUNTERS];
-
-extern const char *event_name(int ctr);
+const char *event_name(struct perf_evsel *event);
  extern const char *__event_name(int type, u64 config);
  
  extern int parse_events(const struct option *opt, const char *str, int unset);
@@ -29,9 +36,9 @@ extern int parse_filter(const struct option *opt, const char *str, int unset);
  #define EVENTS_HELP_MAX (128*1024)
  
  extern void print_events(void);
+extern int is_valid_tracepoint(const char *event_string);
  
  extern char debugfs_path[];
  extern int valid_debugfs_mount(const char *debugfs);
  
-
  #endif /* __PERF_PARSE_EVENTS_H */
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h

index c7d72dce54b2cf7c3f46042bd6ce6a68c941b4d6..abc31a1dac1a738c5791512032c5a9337af84b9a 100644 (file)
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -119,6 +119,10 @@ struct option {
         { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG }
  #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
         { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
+#define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \
+       { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\
+       .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\
+       .flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG}
  
  /* parse_options() will filter out the processed options and leave the
   * non-option argments in argv[].
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c

index 61191c6cbe7a8d04af83e1af5426029a73bc7b99..128aaab0aedad86403a0c722211e4d57cb982d8d 100644 (file)
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -95,7 +95,7 @@ static int init_vmlinux(void)
                 goto out;
  
         if (machine__create_kernel_maps(&machine) < 0) {
-               pr_debug("machine__create_kernel_maps ");
+               pr_debug("machine__create_kernel_maps() failed.\n");
                 goto out;
         }
  out:
@@ -149,7 +149,8 @@ static int open_vmlinux(const char *module)
  {
         const char *path = kernel_get_module_path(module);
         if (!path) {
-               pr_err("Failed to find path of %s module", module ?: "kernel");
+               pr_err("Failed to find path of %s module.\n",
+                      module ?: "kernel");
                 return -ENOENT;
         }
         pr_debug("Try to open %s\n", path);
@@ -226,7 +227,7 @@ static int try_to_find_probe_trace_events(struct perf_probe_event *pev,
                 pr_warning("Warning: No dwarf info found in the vmlinux - "
                         "please rebuild kernel with CONFIG_DEBUG_INFO=y.\n");
                 if (!need_dwarf) {
-                       pr_debug("Trying to use symbols.\nn");
+                       pr_debug("Trying to use symbols.\n");
                         return 0;
                 }
         }
@@ -295,42 +296,49 @@ static int get_real_path(const char *raw_path, const char *comp_dir,
  #define LINEBUF_SIZE 256
  #define NR_ADDITIONAL_LINES 2
  
-static int show_one_line(FILE *fp, int l, bool skip, bool show_num)
+static int __show_one_line(FILE *fp, int l, bool skip, bool show_num)
  {
         char buf[LINEBUF_SIZE];
-       const char *color = PERF_COLOR_BLUE;
+       const char *color = show_num ? "" : PERF_COLOR_BLUE;
+       const char *prefix = NULL;
  
-       if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
-               goto error;
-       if (!skip) {
-               if (show_num)
-                       fprintf(stdout, "%7d  %s", l, buf);
-               else
-                       color_fprintf(stdout, color, "         %s", buf);
-       }
-
-       while (strlen(buf) == LINEBUF_SIZE - 1 &&
-              buf[LINEBUF_SIZE - 2] != '\n') {
+       do {
                 if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
                         goto error;
-               if (!skip) {
-                       if (show_num)
-                               fprintf(stdout, "%s", buf);
-                       else
-                               color_fprintf(stdout, color, "%s", buf);
+               if (skip)
+                       continue;
+               if (!prefix) {
+                       prefix = show_num ? "%7d  " : "         ";
+                       color_fprintf(stdout, color, prefix, l);
                 }
-       }
+               color_fprintf(stdout, color, "%s", buf);
  
-       return 0;
+       } while (strchr(buf, '\n') == NULL);
+
+       return 1;
  error:
-       if (feof(fp))
-               pr_warning("Source file is shorter than expected.\n");
-       else
+       if (ferror(fp)) {
                 pr_warning("File read error: %s\n", strerror(errno));
+               return -1;
+       }
+       return 0;
+}
  
-       return -1;
+static int _show_one_line(FILE *fp, int l, bool skip, bool show_num)
+{
+       int rv = __show_one_line(fp, l, skip, show_num);
+       if (rv == 0) {
+               pr_warning("Source file is shorter than expected.\n");
+               rv = -1;
+       }
+       return rv;
  }
  
+#define show_one_line_with_num(f,l)    _show_one_line(f,l,false,true)
+#define show_one_line(f,l)             _show_one_line(f,l,false,false)
+#define skip_one_line(f,l)             _show_one_line(f,l,true,false)
+#define show_one_line_or_eof(f,l)      __show_one_line(f,l,false,false)
+
  /*
   * Show line-range always requires debuginfo to find source file and
   * line number.
@@ -379,7 +387,7 @@ int show_line_range(struct line_range *lr, const char *module)
                 fprintf(stdout, "<%s:%d>\n", lr->function,
                         lr->start - lr->offset);
         else
-               fprintf(stdout, "<%s:%d>\n", lr->file, lr->start);
+               fprintf(stdout, "<%s:%d>\n", lr->path, lr->start);
  
         fp = fopen(lr->path, "r");
         if (fp == NULL) {
@@ -388,26 +396,30 @@ int show_line_range(struct line_range *lr, const char *module)
                 return -errno;
         }
         /* Skip to starting line number */
-       while (l < lr->start && ret >= 0)
-               ret = show_one_line(fp, l++, true, false);
-       if (ret < 0)
-               goto end;
+       while (l < lr->start) {
+               ret = skip_one_line(fp, l++);
+               if (ret < 0)
+                       goto end;
+       }
  
         list_for_each_entry(ln, &lr->line_list, list) {
-               while (ln->line > l && ret >= 0)
-                       ret = show_one_line(fp, (l++) - lr->offset,
-                                           false, false);
-               if (ret >= 0)
-                       ret = show_one_line(fp, (l++) - lr->offset,
-                                           false, true);
+               for (; ln->line > l; l++) {
+                       ret = show_one_line(fp, l - lr->offset);
+                       if (ret < 0)
+                               goto end;
+               }
+               ret = show_one_line_with_num(fp, l++ - lr->offset);
                 if (ret < 0)
                         goto end;
         }
  
         if (lr->end == INT_MAX)
                 lr->end = l + NR_ADDITIONAL_LINES;
-       while (l <= lr->end && !feof(fp) && ret >= 0)
-               ret = show_one_line(fp, (l++) - lr->offset, false, false);
+       while (l <= lr->end) {
+               ret = show_one_line_or_eof(fp, l++ - lr->offset);
+               if (ret <= 0)
+                       break;
+       }
  end:
         fclose(fp);
         return ret;
@@ -466,7 +478,7 @@ int show_available_vars(struct perf_probe_event *pevs, int npevs,
  
         fd = open_vmlinux(module);
         if (fd < 0) {
-               pr_warning("Failed to open debuginfo file.\n");
+               pr_warning("Failed to open debug information file.\n");
                 return fd;
         }
  
@@ -526,56 +538,87 @@ int show_available_vars(struct perf_probe_event *pevs __unused,
  }
  #endif
  
+static int parse_line_num(char **ptr, int *val, const char *what)
+{
+       const char *start = *ptr;
+
+       errno = 0;
+       *val = strtol(*ptr, ptr, 0);
+       if (errno || *ptr == start) {
+               semantic_error("'%s' is not a valid number.\n", what);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Stuff 'lr' according to the line range described by 'arg'.
+ * The line range syntax is described by:
+ *
+ *         SRC[:SLN[+NUM|-ELN]]
+ *         FNC[:SLN[+NUM|-ELN]]
+ */
  int parse_line_range_desc(const char *arg, struct line_range *lr)
  {
-       const char *ptr;
-       char *tmp;
-       /*
-        * <Syntax>
-        * SRC:SLN[+NUM|-ELN]
-        * FUNC[:SLN[+NUM|-ELN]]
-        */
-       ptr = strchr(arg, ':');
-       if (ptr) {
-               lr->start = (int)strtoul(ptr + 1, &tmp, 0);
-               if (*tmp == '+') {
-                       lr->end = lr->start + (int)strtoul(tmp + 1, &tmp, 0);
-                       lr->end--;      /*
-                                        * Adjust the number of lines here.
-                                        * If the number of lines == 1, the
-                                        * the end of line should be equal to
-                                        * the start of line.
-                                        */
-               } else if (*tmp == '-')
-                       lr->end = (int)strtoul(tmp + 1, &tmp, 0);
-               else
-                       lr->end = INT_MAX;
+       char *range, *name = strdup(arg);
+       int err;
+
+       if (!name)
+               return -ENOMEM;
+
+       lr->start = 0;
+       lr->end = INT_MAX;
+
+       range = strchr(name, ':');
+       if (range) {
+               *range++ = '\0';
+
+               err = parse_line_num(&range, &lr->start, "start line");
+               if (err)
+                       goto err;
+
+               if (*range == '+' || *range == '-') {
+                       const char c = *range++;
+
+                       err = parse_line_num(&range, &lr->end, "end line");
+                       if (err)
+                               goto err;
+
+                       if (c == '+') {
+                               lr->end += lr->start;
+                               /*
+                                * Adjust the number of lines here.
+                                * If the number of lines == 1, the
+                                * end of line should be equal to
+                                * the start of line.
+                                */
+                               lr->end--;
+                       }
+               }
+
                 pr_debug("Line range is %d to %d\n", lr->start, lr->end);
+
+               err = -EINVAL;
                 if (lr->start > lr->end) {
                         semantic_error("Start line must be smaller"
                                        " than end line.\n");
-                       return -EINVAL;
+                       goto err;
                 }
-               if (*tmp != '\0') {
-                       semantic_error("Tailing with invalid character '%d'.\n",
-                                      *tmp);
-                       return -EINVAL;
+               if (*range != '\0') {
+                       semantic_error("Tailing with invalid str '%s'.\n", range);
+                       goto err;
                 }
-               tmp = strndup(arg, (ptr - arg));
-       } else {
-               tmp = strdup(arg);
-               lr->end = INT_MAX;
         }
  
-       if (tmp == NULL)
-               return -ENOMEM;
-
-       if (strchr(tmp, '.'))
-               lr->file = tmp;
+       if (strchr(name, '.'))
+               lr->file = name;
         else
-               lr->function = tmp;
+               lr->function = name;
  
         return 0;
+err:
+       free(name);
+       return err;
  }
  
  /* Check the name is good for event/group */
@@ -699,39 +742,40 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
  
         /* Exclusion check */
         if (pp->lazy_line && pp->line) {
-               semantic_error("Lazy pattern can't be used with line number.");
+               semantic_error("Lazy pattern can't be used with"
+                              " line number.\n");
                 return -EINVAL;
         }
  
         if (pp->lazy_line && pp->offset) {
-               semantic_error("Lazy pattern can't be used with offset.");
+               semantic_error("Lazy pattern can't be used with offset.\n");
                 return -EINVAL;
         }
  
         if (pp->line && pp->offset) {
-               semantic_error("Offset can't be used with line number.");
+               semantic_error("Offset can't be used with line number.\n");
                 return -EINVAL;
         }
  
         if (!pp->line && !pp->lazy_line && pp->file && !pp->function) {
                 semantic_error("File always requires line number or "
-                              "lazy pattern.");
+                              "lazy pattern.\n");
                 return -EINVAL;
         }
  
         if (pp->offset && !pp->function) {
-               semantic_error("Offset requires an entry function.");
+               semantic_error("Offset requires an entry function.\n");
                 return -EINVAL;
         }
  
         if (pp->retprobe && !pp->function) {
-               semantic_error("Return probe requires an entry function.");
+               semantic_error("Return probe requires an entry function.\n");
                 return -EINVAL;
         }
  
         if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) {
                 semantic_error("Offset/Line/Lazy pattern can't be used with "
-                              "return probe.");
+                              "return probe.\n");
                 return -EINVAL;
         }
  
@@ -1005,7 +1049,7 @@ int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
  
         return tmp - buf;
  error:
-       pr_debug("Failed to synthesize perf probe argument: %s",
+       pr_debug("Failed to synthesize perf probe argument: %s\n",
                  strerror(-ret));
         return ret;
  }
@@ -1033,13 +1077,13 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
                         goto error;
         }
         if (pp->file) {
-               len = strlen(pp->file) - 31;
-               if (len < 0)
-                       len = 0;
-               tmp = strchr(pp->file + len, '/');
-               if (!tmp)
-                       tmp = pp->file + len;
-               ret = e_snprintf(file, 32, "@%s", tmp + 1);
+               tmp = pp->file;
+               len = strlen(tmp);
+               if (len > 30) {
+                       tmp = strchr(pp->file + len - 30, '/');
+                       tmp = tmp ? tmp + 1 : pp->file + len - 30;
+               }
+               ret = e_snprintf(file, 32, "@%s", tmp);
                 if (ret <= 0)
                         goto error;
         }
@@ -1055,7 +1099,7 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
  
         return buf;
  error:
-       pr_debug("Failed to synthesize perf probe point: %s",
+       pr_debug("Failed to synthesize perf probe point: %s\n",
                  strerror(-ret));
         if (buf)
                 free(buf);
@@ -1796,7 +1840,7 @@ static int del_trace_probe_event(int fd, const char *group,
  
         ret = e_snprintf(buf, 128, "%s:%s", group, event);
         if (ret < 0) {
-               pr_err("Failed to copy event.");
+               pr_err("Failed to copy event.\n");
                 return ret;
         }
  
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c

index ddf4d45563218ad9e2d8971d32f2a8f25097c502..ab83b6ac5d657c80af1e67790c1ace15d7592578 100644 (file)
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -652,8 +652,8 @@ static_var:
         regs = get_arch_regstr(regn);
         if (!regs) {
                 /* This should be a bug in DWARF or this tool */
-               pr_warning("Mapping for DWARF register number %u "
-                          "missing on this architecture.", regn);
+               pr_warning("Mapping for the register number %u "
+                          "missing on this architecture.\n", regn);
                 return -ERANGE;
         }
  
@@ -699,13 +699,14 @@ static int convert_variable_type(Dwarf_Die *vr_die,
                 if (ret != DW_TAG_pointer_type &&
                     ret != DW_TAG_array_type) {
                         pr_warning("Failed to cast into string: "
-                                  "%s(%s) is not a pointer nor array.",
+                                  "%s(%s) is not a pointer nor array.\n",
                                    dwarf_diename(vr_die), dwarf_diename(&type));
                         return -EINVAL;
                 }
                 if (ret == DW_TAG_pointer_type) {
                         if (die_get_real_type(&type, &type) == NULL) {
-                               pr_warning("Failed to get a type information.");
+                               pr_warning("Failed to get a type"
+                                          " information.\n");
                                 return -ENOENT;
                         }
                         while (*ref_ptr)
@@ -720,7 +721,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
                 if (!die_compare_name(&type, "char") &&
                     !die_compare_name(&type, "unsigned char")) {
                         pr_warning("Failed to cast into string: "
-                                  "%s is not (unsigned) char *.",
+                                  "%s is not (unsigned) char *.\n",
                                    dwarf_diename(vr_die));
                         return -EINVAL;
                 }
@@ -830,8 +831,8 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
                         return -EINVAL;
                 }
                 if (field->name[0] == '[') {
-                       pr_err("Semantic error: %s is not a pointor nor array.",
-                              varname);
+                       pr_err("Semantic error: %s is not a pointor"
+                              " nor array.\n", varname);
                         return -EINVAL;
                 }
                 if (field->ref) {
@@ -978,7 +979,7 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwarf_Addr paddr,
         name = dwarf_diename(sp_die);
         if (name) {
                 if (dwarf_entrypc(sp_die, &eaddr) != 0) {
-                       pr_warning("Failed to get entry pc of %s\n",
+                       pr_warning("Failed to get entry address of %s\n",
                                    dwarf_diename(sp_die));
                         return -ENOENT;
                 }
@@ -994,7 +995,7 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwarf_Addr paddr,
         if (retprobe) {
                 if (eaddr != paddr) {
                         pr_warning("Return probe must be on the head of"
-                                  " a real function\n");
+                                  " a real function.\n");
                         return -EINVAL;
                 }
                 tp->retprobe = true;
@@ -1033,7 +1034,7 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
                 Dwarf_Frame *frame;
                 if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 ||
                     dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) {
-                       pr_warning("Failed to get CFA on 0x%jx\n",
+                       pr_warning("Failed to get call frame on 0x%jx\n",
                                    (uintmax_t)pf->addr);
                         return -ENOENT;
                 }
@@ -1060,7 +1061,7 @@ static int find_probe_point_by_line(struct probe_finder *pf)
         int ret = 0;
  
         if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-               pr_warning("No source lines found in this CU.\n");
+               pr_warning("No source lines found.\n");
                 return -ENOENT;
         }
  
@@ -1162,7 +1163,7 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
         }
  
         if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
-               pr_warning("No source lines found in this CU.\n");
+               pr_warning("No source lines found.\n");
                 return -ENOENT;
         }
  
@@ -1220,7 +1221,7 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
         else {
                 /* Get probe address */
                 if (dwarf_entrypc(in_die, &addr) != 0) {
-                       pr_warning("Failed to get entry pc of %s.\n",
+                       pr_warning("Failed to get entry address of %s.\n",
                                    dwarf_diename(in_die));
                         param->retval = -ENOENT;
                         return DWARF_CB_ABORT;
@@ -1261,8 +1262,8 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
                         param->retval = find_probe_point_lazy(sp_die, pf);
                 else {
                         if (dwarf_entrypc(sp_die, &pf->addr) != 0) {
-                               pr_warning("Failed to get entry pc of %s.\n",
-                                          dwarf_diename(sp_die));
+                               pr_warning("Failed to get entry address of "
+                                          "%s.\n", dwarf_diename(sp_die));
                                 param->retval = -ENOENT;
                                 return DWARF_CB_ABORT;
                         }
@@ -1304,7 +1305,7 @@ static int find_probes(int fd, struct probe_finder *pf)
  
         dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias);
         if (!dbg) {
-               pr_warning("No dwarf info found in the vmlinux - "
+               pr_warning("No debug information found in the vmlinux - "
                         "please rebuild with CONFIG_DEBUG_INFO=y.\n");
                 return -EBADF;
         }
@@ -1549,7 +1550,7 @@ int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt)
         /* Open the live linux kernel */
         dbg = dwfl_init_live_kernel_dwarf(addr, &dwfl, &bias);
         if (!dbg) {
-               pr_warning("No dwarf info found in the vmlinux - "
+               pr_warning("No debug information found in the vmlinux - "
                         "please rebuild with CONFIG_DEBUG_INFO=y.\n");
                 ret = -EINVAL;
                 goto end;
@@ -1559,7 +1560,8 @@ int find_perf_probe_point(unsigned long addr, struct perf_probe_point *ppt)
         addr += bias;
         /* Find cu die */
         if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr - bias, &cudie)) {
-               pr_warning("No CU DIE is found at %lx\n", addr);
+               pr_warning("Failed to find debug information for address %lx\n",
+                          addr);
                 ret = -EINVAL;
                 goto end;
         }
@@ -1684,7 +1686,7 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
  
         line_list__init(&lf->lr->line_list);
         if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
-               pr_warning("No source lines found in this CU.\n");
+               pr_warning("No source lines found.\n");
                 return -ENOENT;
         }
  
@@ -1809,7 +1811,7 @@ int find_line_range(int fd, struct line_range *lr)
  
         dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias);
         if (!dbg) {
-               pr_warning("No dwarf info found in the vmlinux - "
+               pr_warning("No debug information found in the vmlinux - "
                         "please rebuild with CONFIG_DEBUG_INFO=y.\n");
                 return -EBADF;
         }
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h

index bba69d4556999e5081b018857acd230de6740eea..beaefc3c1223df16423cbd084f700b6ee5a5fbf2 100644 (file)
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -34,9 +34,9 @@ extern int find_available_vars_at(int fd, struct perf_probe_event *pev,
                                   bool externs);
  
  #include <dwarf.h>
-#include <libdw.h>
-#include <libdwfl.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/libdwfl.h>
+#include <elfutils/version.h>
  
  struct probe_finder {
         struct perf_probe_event *pev;           /* Target probe event */
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c

index b059dc50cc2db9021b75435e9aac132174c6dbec..93680818e244ca8a2e58f49e59af0ca729eeb50e 100644 (file)
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -1,5 +1,5 @@
  /*
- * trace-event-perl.  Feed perf trace events to an embedded Perl interpreter.
+ * trace-event-perl.  Feed perf script events to an embedded Perl interpreter.
   *
   * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
   *
@@ -411,8 +411,8 @@ static int perl_generate_script(const char *outfile)
                 return -1;
         }
  
-       fprintf(ofp, "# perf trace event handlers, "
-               "generated by perf trace -g perl\n");
+       fprintf(ofp, "# perf script event handlers, "
+               "generated by perf script -g perl\n");
  
         fprintf(ofp, "# Licensed under the terms of the GNU GPL"
                 " License version 2\n\n");
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c

index 33a632523743deff80cb9e3636bd9144b84b2f76..c6d99334bdfa836c1adba2d613b658fadf20797b 100644 (file)
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -442,8 +442,8 @@ static int python_generate_script(const char *outfile)
                 fprintf(stderr, "couldn't open %s\n", fname);
                 return -1;
         }
-       fprintf(ofp, "# perf trace event handlers, "
-               "generated by perf trace -g python\n");
+       fprintf(ofp, "# perf script event handlers, "
+               "generated by perf script -g python\n");
  
         fprintf(ofp, "# Licensed under the terms of the GNU GPL"
                 " License version 2\n\n");
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c

index fa9d652c2dc3c07182028d4a196293333be75d78..6fb4694d05fa1e2c89e5ac91207c025214031f38 100644 (file)
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -65,9 +65,49 @@ out_close:
         return -1;
  }
  
+static void perf_session__id_header_size(struct perf_session *session)
+{
+       struct sample_data *data;
+       u64 sample_type = session->sample_type;
+       u16 size = 0;
+
+       if (!session->sample_id_all)
+               goto out;
+
+       if (sample_type & PERF_SAMPLE_TID)
+               size += sizeof(data->tid) * 2;
+
+       if (sample_type & PERF_SAMPLE_TIME)
+               size += sizeof(data->time);
+
+       if (sample_type & PERF_SAMPLE_ID)
+               size += sizeof(data->id);
+
+       if (sample_type & PERF_SAMPLE_STREAM_ID)
+               size += sizeof(data->stream_id);
+
+       if (sample_type & PERF_SAMPLE_CPU)
+               size += sizeof(data->cpu) * 2;
+out:
+       session->id_hdr_size = size;
+}
+
+void perf_session__set_sample_id_all(struct perf_session *session, bool value)
+{
+       session->sample_id_all = value;
+       perf_session__id_header_size(session);
+}
+
+void perf_session__set_sample_type(struct perf_session *session, u64 type)
+{
+       session->sample_type = type;
+}
+
  void perf_session__update_sample_type(struct perf_session *self)
  {
         self->sample_type = perf_header__sample_type(&self->header);
+       self->sample_id_all = perf_header__sample_id_all(&self->header);
+       perf_session__id_header_size(self);
  }
  
  int perf_session__create_kernel_maps(struct perf_session *self)
@@ -85,7 +125,9 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self)
         machines__destroy_guest_kernel_maps(&self->machines);
  }
  
-struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe)
+struct perf_session *perf_session__new(const char *filename, int mode,
+                                      bool force, bool repipe,
+                                      struct perf_event_ops *ops)
  {
         size_t len = filename ? strlen(filename) + 1 : 0;
         struct perf_session *self = zalloc(sizeof(*self) + len);
@@ -101,10 +143,20 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
         INIT_LIST_HEAD(&self->dead_threads);
         self->hists_tree = RB_ROOT;
         self->last_match = NULL;
-       self->mmap_window = 32;
+       /*
+        * On 64bit we can mmap the data file in one go. No need for tiny mmap
+        * slices. On 32bit we use 32MB.
+        */
+#if BITS_PER_LONG == 64
+       self->mmap_window = ULLONG_MAX;
+#else
+       self->mmap_window = 32 * 1024 * 1024ULL;
+#endif
         self->machines = RB_ROOT;
         self->repipe = repipe;
-       INIT_LIST_HEAD(&self->ordered_samples.samples_head);
+       INIT_LIST_HEAD(&self->ordered_samples.samples);
+       INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
+       INIT_LIST_HEAD(&self->ordered_samples.to_free);
         machine__init(&self->host_machine, "", HOST_KERNEL_ID);
  
         if (mode == O_RDONLY) {
@@ -120,6 +172,13 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
         }
  
         perf_session__update_sample_type(self);
+
+       if (ops && ops->ordering_requires_timestamps &&
+           ops->ordered_samples && !self->sample_id_all) {
+               dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
+               ops->ordered_samples = false;
+       }
+
  out:
         return self;
  out_free:
@@ -230,7 +289,15 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
         return syms;
  }
  
+static int process_event_synth_stub(event_t *event __used,
+                                   struct perf_session *session __used)
+{
+       dump_printf(": unhandled!\n");
+       return 0;
+}
+
  static int process_event_stub(event_t *event __used,
+                             struct sample_data *sample __used,
                               struct perf_session *session __used)
  {
         dump_printf(": unhandled!\n");
@@ -262,7 +329,7 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
         if (handler->exit == NULL)
                 handler->exit = process_event_stub;
         if (handler->lost == NULL)
-               handler->lost = process_event_stub;
+               handler->lost = event__process_lost;
         if (handler->read == NULL)
                 handler->read = process_event_stub;
         if (handler->throttle == NULL)
@@ -270,13 +337,13 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
         if (handler->unthrottle == NULL)
                 handler->unthrottle = process_event_stub;
         if (handler->attr == NULL)
-               handler->attr = process_event_stub;
+               handler->attr = process_event_synth_stub;
         if (handler->event_type == NULL)
-               handler->event_type = process_event_stub;
+               handler->event_type = process_event_synth_stub;
         if (handler->tracing_data == NULL)
-               handler->tracing_data = process_event_stub;
+               handler->tracing_data = process_event_synth_stub;
         if (handler->build_id == NULL)
-               handler->build_id = process_event_stub;
+               handler->build_id = process_event_synth_stub;
         if (handler->finished_round == NULL) {
                 if (handler->ordered_samples)
                         handler->finished_round = process_finished_round;
@@ -386,33 +453,61 @@ static event__swap_op event__swap_ops[] = {
  
  struct sample_queue {
         u64                     timestamp;
-       struct sample_event     *event;
+       u64                     file_offset;
+       event_t                 *event;
         struct list_head        list;
  };
  
+static void perf_session_free_sample_buffers(struct perf_session *session)
+{
+       struct ordered_samples *os = &session->ordered_samples;
+
+       while (!list_empty(&os->to_free)) {
+               struct sample_queue *sq;
+
+               sq = list_entry(os->to_free.next, struct sample_queue, list);
+               list_del(&sq->list);
+               free(sq);
+       }
+}
+
+static int perf_session_deliver_event(struct perf_session *session,
+                                     event_t *event,
+                                     struct sample_data *sample,
+                                     struct perf_event_ops *ops,
+                                     u64 file_offset);
+
  static void flush_sample_queue(struct perf_session *s,
                                struct perf_event_ops *ops)
  {
-       struct list_head *head = &s->ordered_samples.samples_head;
-       u64 limit = s->ordered_samples.next_flush;
+       struct ordered_samples *os = &s->ordered_samples;
+       struct list_head *head = &os->samples;
         struct sample_queue *tmp, *iter;
+       struct sample_data sample;
+       u64 limit = os->next_flush;
+       u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
  
         if (!ops->ordered_samples || !limit)
                 return;
  
         list_for_each_entry_safe(iter, tmp, head, list) {
                 if (iter->timestamp > limit)
-                       return;
+                       break;
  
-               if (iter == s->ordered_samples.last_inserted)
-                       s->ordered_samples.last_inserted = NULL;
+               event__parse_sample(iter->event, s, &sample);
+               perf_session_deliver_event(s, iter->event, &sample, ops,
+                                          iter->file_offset);
  
-               ops->sample((event_t *)iter->event, s);
-
-               s->ordered_samples.last_flush = iter->timestamp;
+               os->last_flush = iter->timestamp;
                 list_del(&iter->list);
-               free(iter->event);
-               free(iter);
+               list_add(&iter->list, &os->sample_cache);
+       }
+
+       if (list_empty(head)) {
+               os->last_sample = NULL;
+       } else if (last_ts <= limit) {
+               os->last_sample =
+                       list_entry(head->prev, struct sample_queue, list);
         }
  }
  
@@ -465,178 +560,265 @@ static int process_finished_round(event_t *event __used,
         return 0;
  }
  
-static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
-{
-       struct sample_queue *iter;
-
-       list_for_each_entry_reverse(iter, head, list) {
-               if (iter->timestamp < new->timestamp) {
-                       list_add(&new->list, &iter->list);
-                       return;
-               }
-       }
-
-       list_add(&new->list, head);
-}
-
-static void __queue_sample_before(struct sample_queue *new,
-                                 struct sample_queue *iter,
-                                 struct list_head *head)
-{
-       list_for_each_entry_continue_reverse(iter, head, list) {
-               if (iter->timestamp < new->timestamp) {
-                       list_add(&new->list, &iter->list);
-                       return;
-               }
-       }
-
-       list_add(&new->list, head);
-}
-
-static void __queue_sample_after(struct sample_queue *new,
-                                struct sample_queue *iter,
-                                struct list_head *head)
-{
-       list_for_each_entry_continue(iter, head, list) {
-               if (iter->timestamp > new->timestamp) {
-                       list_add_tail(&new->list, &iter->list);
-                       return;
-               }
-       }
-       list_add_tail(&new->list, head);
-}
-
  /* The queue is ordered by time */
-static void __queue_sample_event(struct sample_queue *new,
-                                struct perf_session *s)
+static void __queue_event(struct sample_queue *new, struct perf_session *s)
  {
-       struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
-       struct list_head *head = &s->ordered_samples.samples_head;
+       struct ordered_samples *os = &s->ordered_samples;
+       struct sample_queue *sample = os->last_sample;
+       u64 timestamp = new->timestamp;
+       struct list_head *p;
  
+       os->last_sample = new;
  
-       if (!last_inserted) {
-               __queue_sample_end(new, head);
+       if (!sample) {
+               list_add(&new->list, &os->samples);
+               os->max_timestamp = timestamp;
                 return;
         }
  
         /*
-        * Most of the time the current event has a timestamp
-        * very close to the last event inserted, unless we just switched
-        * to another event buffer. Having a sorting based on a list and
-        * on the last inserted event that is close to the current one is
-        * probably more efficient than an rbtree based sorting.
+        * last_sample might point to some random place in the list as it's
+        * the last queued event. We expect that the new event is close to
+        * this.
          */
-       if (last_inserted->timestamp >= new->timestamp)
-               __queue_sample_before(new, last_inserted, head);
-       else
-               __queue_sample_after(new, last_inserted, head);
+       if (sample->timestamp <= timestamp) {
+               while (sample->timestamp <= timestamp) {
+                       p = sample->list.next;
+                       if (p == &os->samples) {
+                               list_add_tail(&new->list, &os->samples);
+                               os->max_timestamp = timestamp;
+                               return;
+                       }
+                       sample = list_entry(p, struct sample_queue, list);
+               }
+               list_add_tail(&new->list, &sample->list);
+       } else {
+               while (sample->timestamp > timestamp) {
+                       p = sample->list.prev;
+                       if (p == &os->samples) {
+                               list_add(&new->list, &os->samples);
+                               return;
+                       }
+                       sample = list_entry(p, struct sample_queue, list);
+               }
+               list_add(&new->list, &sample->list);
+       }
  }
  
-static int queue_sample_event(event_t *event, struct sample_data *data,
-                             struct perf_session *s)
+#define MAX_SAMPLE_BUFFER      (64 * 1024 / sizeof(struct sample_queue))
+
+static int perf_session_queue_event(struct perf_session *s, event_t *event,
+                                   struct sample_data *data, u64 file_offset)
  {
+       struct ordered_samples *os = &s->ordered_samples;
+       struct list_head *sc = &os->sample_cache;
         u64 timestamp = data->time;
         struct sample_queue *new;
  
+       if (!timestamp || timestamp == ~0ULL)
+               return -ETIME;
  
         if (timestamp < s->ordered_samples.last_flush) {
                 printf("Warning: Timestamp below last timeslice flush\n");
                 return -EINVAL;
         }
  
-       new = malloc(sizeof(*new));
-       if (!new)
-               return -ENOMEM;
+       if (!list_empty(sc)) {
+               new = list_entry(sc->next, struct sample_queue, list);
+               list_del(&new->list);
+       } else if (os->sample_buffer) {
+               new = os->sample_buffer + os->sample_buffer_idx;
+               if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
+                       os->sample_buffer = NULL;
+       } else {
+               os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
+               if (!os->sample_buffer)
+                       return -ENOMEM;
+               list_add(&os->sample_buffer->list, &os->to_free);
+               os->sample_buffer_idx = 2;
+               new = os->sample_buffer + 1;
+       }
  
         new->timestamp = timestamp;
+       new->file_offset = file_offset;
+       new->event = event;
  
-       new->event = malloc(event->header.size);
-       if (!new->event) {
-               free(new);
-               return -ENOMEM;
-       }
+       __queue_event(new, s);
  
-       memcpy(new->event, event, event->header.size);
+       return 0;
+}
  
-       __queue_sample_event(new, s);
-       s->ordered_samples.last_inserted = new;
+static void callchain__printf(struct sample_data *sample)
+{
+       unsigned int i;
  
-       if (new->timestamp > s->ordered_samples.max_timestamp)
-               s->ordered_samples.max_timestamp = new->timestamp;
+       printf("... chain: nr:%Lu\n", sample->callchain->nr);
  
-       return 0;
+       for (i = 0; i < sample->callchain->nr; i++)
+               printf("..... %2d: %016Lx\n", i, sample->callchain->ips[i]);
  }
  
-static int perf_session__process_sample(event_t *event, struct perf_session *s,
-                                       struct perf_event_ops *ops)
+static void perf_session__print_tstamp(struct perf_session *session,
+                                      event_t *event,
+                                      struct sample_data *sample)
  {
-       struct sample_data data;
+       if (event->header.type != PERF_RECORD_SAMPLE &&
+           !session->sample_id_all) {
+               fputs("-1 -1 ", stdout);
+               return;
+       }
  
-       if (!ops->ordered_samples)
-               return ops->sample(event, s);
+       if ((session->sample_type & PERF_SAMPLE_CPU))
+               printf("%u ", sample->cpu);
  
-       bzero(&data, sizeof(struct sample_data));
-       event__parse_sample(event, s->sample_type, &data);
+       if (session->sample_type & PERF_SAMPLE_TIME)
+               printf("%Lu ", sample->time);
+}
  
-       queue_sample_event(event, &data, s);
+static void dump_event(struct perf_session *session, event_t *event,
+                      u64 file_offset, struct sample_data *sample)
+{
+       if (!dump_trace)
+               return;
  
-       return 0;
+       printf("\n%#Lx [%#x]: event: %d\n", file_offset, event->header.size,
+              event->header.type);
+
+       trace_event(event);
+
+       if (sample)
+               perf_session__print_tstamp(session, event, sample);
+
+       printf("%#Lx [%#x]: PERF_RECORD_%s", file_offset, event->header.size,
+              event__get_event_name(event->header.type));
  }
  
-static int perf_session__process_event(struct perf_session *self,
-                                      event_t *event,
-                                      struct perf_event_ops *ops,
-                                      u64 offset, u64 head)
+static void dump_sample(struct perf_session *session, event_t *event,
+                       struct sample_data *sample)
  {
-       trace_event(event);
+       if (!dump_trace)
+               return;
  
-       if (event->header.type < PERF_RECORD_HEADER_MAX) {
-               dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
-                           offset + head, event->header.size,
-                           event__name[event->header.type]);
-               hists__inc_nr_events(&self->hists, event->header.type);
-       }
+       printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
+              sample->pid, sample->tid, sample->ip, sample->period);
  
-       if (self->header.needs_swap && event__swap_ops[event->header.type])
-               event__swap_ops[event->header.type](event);
+       if (session->sample_type & PERF_SAMPLE_CALLCHAIN)
+               callchain__printf(sample);
+}
+
+static int perf_session_deliver_event(struct perf_session *session,
+                                     event_t *event,
+                                     struct sample_data *sample,
+                                     struct perf_event_ops *ops,
+                                     u64 file_offset)
+{
+       dump_event(session, event, file_offset, sample);
  
         switch (event->header.type) {
         case PERF_RECORD_SAMPLE:
-               return perf_session__process_sample(event, self, ops);
+               dump_sample(session, event, sample);
+               return ops->sample(event, sample, session);
         case PERF_RECORD_MMAP:
-               return ops->mmap(event, self);
+               return ops->mmap(event, sample, session);
         case PERF_RECORD_COMM:
-               return ops->comm(event, self);
+               return ops->comm(event, sample, session);
         case PERF_RECORD_FORK:
-               return ops->fork(event, self);
+               return ops->fork(event, sample, session);
         case PERF_RECORD_EXIT:
-               return ops->exit(event, self);
+               return ops->exit(event, sample, session);
         case PERF_RECORD_LOST:
-               return ops->lost(event, self);
+               return ops->lost(event, sample, session);
         case PERF_RECORD_READ:
-               return ops->read(event, self);
+               return ops->read(event, sample, session);
         case PERF_RECORD_THROTTLE:
-               return ops->throttle(event, self);
+               return ops->throttle(event, sample, session);
         case PERF_RECORD_UNTHROTTLE:
-               return ops->unthrottle(event, self);
+               return ops->unthrottle(event, sample, session);
+       default:
+               ++session->hists.stats.nr_unknown_events;
+               return -1;
+       }
+}
+
+static int perf_session__preprocess_sample(struct perf_session *session,
+                                          event_t *event, struct sample_data *sample)
+{
+       if (event->header.type != PERF_RECORD_SAMPLE ||
+           !(session->sample_type & PERF_SAMPLE_CALLCHAIN))
+               return 0;
+
+       if (!ip_callchain__valid(sample->callchain, event)) {
+               pr_debug("call-chain problem with event, skipping it.\n");
+               ++session->hists.stats.nr_invalid_chains;
+               session->hists.stats.total_invalid_chains += sample->period;
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int perf_session__process_user_event(struct perf_session *session, event_t *event,
+                                           struct perf_event_ops *ops, u64 file_offset)
+{
+       dump_event(session, event, file_offset, NULL);
+
+       /* These events are processed right away */
+       switch (event->header.type) {
         case PERF_RECORD_HEADER_ATTR:
-               return ops->attr(event, self);
+               return ops->attr(event, session);
         case PERF_RECORD_HEADER_EVENT_TYPE:
-               return ops->event_type(event, self);
+               return ops->event_type(event, session);
         case PERF_RECORD_HEADER_TRACING_DATA:
                 /* setup for reading amidst mmap */
-               lseek(self->fd, offset + head, SEEK_SET);
-               return ops->tracing_data(event, self);
+               lseek(session->fd, file_offset, SEEK_SET);
+               return ops->tracing_data(event, session);
         case PERF_RECORD_HEADER_BUILD_ID:
-               return ops->build_id(event, self);
+               return ops->build_id(event, session);
         case PERF_RECORD_FINISHED_ROUND:
-               return ops->finished_round(event, self, ops);
+               return ops->finished_round(event, session, ops);
         default:
-               ++self->hists.stats.nr_unknown_events;
-               return -1;
+               return -EINVAL;
         }
  }
  
+static int perf_session__process_event(struct perf_session *session,
+                                      event_t *event,
+                                      struct perf_event_ops *ops,
+                                      u64 file_offset)
+{
+       struct sample_data sample;
+       int ret;
+
+       if (session->header.needs_swap && event__swap_ops[event->header.type])
+               event__swap_ops[event->header.type](event);
+
+       if (event->header.type >= PERF_RECORD_HEADER_MAX)
+               return -EINVAL;
+
+       hists__inc_nr_events(&session->hists, event->header.type);
+
+       if (event->header.type >= PERF_RECORD_USER_TYPE_START)
+               return perf_session__process_user_event(session, event, ops, file_offset);
+
+       /*
+        * For all kernel events we get the sample data
+        */
+       event__parse_sample(event, session, &sample);
+
+       /* Preprocess sample records - precheck callchains */
+       if (perf_session__preprocess_sample(session, event, &sample))
+               return 0;
+
+       if (ops->ordered_samples) {
+               ret = perf_session_queue_event(session, event, &sample,
+                                              file_offset);
+               if (ret != -ETIME)
+                       return ret;
+       }
+
+       return perf_session_deliver_event(session, event, &sample, ops,
+                                         file_offset);
+}
+
  void perf_event_header__bswap(struct perf_event_header *self)
  {
         self->type = bswap_32(self->type);
@@ -656,21 +838,33 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
         return thread;
  }
  
-int do_read(int fd, void *buf, size_t size)
+static void perf_session__warn_about_errors(const struct perf_session *session,
+                                           const struct perf_event_ops *ops)
  {
-       void *buf_start = buf;
-
-       while (size) {
-               int ret = read(fd, buf, size);
-
-               if (ret <= 0)
-                       return ret;
+       if (ops->lost == event__process_lost &&
+           session->hists.stats.total_lost != 0) {
+               ui__warning("Processed %Lu events and LOST %Lu!\n\n"
+                           "Check IO/CPU overload!\n\n",
+                           session->hists.stats.total_period,
+                           session->hists.stats.total_lost);
+       }
  
-               size -= ret;
-               buf += ret;
+       if (session->hists.stats.nr_unknown_events != 0) {
+               ui__warning("Found %u unknown events!\n\n"
+                           "Is this an older tool processing a perf.data "
+                           "file generated by a more recent tool?\n\n"
+                           "If that is not the case, consider "
+                           "reporting to linux-kernel@vger.kernel.org.\n\n",
+                           session->hists.stats.nr_unknown_events);
         }
  
-       return buf - buf_start;
+       if (session->hists.stats.nr_invalid_chains != 0) {
+               ui__warning("Found invalid callchains!\n\n"
+                           "%u out of %u events were discarded for this reason.\n\n"
+                           "Consider reporting to linux-kernel@vger.kernel.org.\n\n",
+                           session->hists.stats.nr_invalid_chains,
+                           session->hists.stats.nr_events[PERF_RECORD_SAMPLE]);
+       }
  }
  
  #define session_done() (*(volatile int *)(&session_done))
@@ -690,7 +884,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self,
  
         head = 0;
  more:
-       err = do_read(self->fd, &event, sizeof(struct perf_event_header));
+       err = readn(self->fd, &event, sizeof(struct perf_event_header));
         if (err <= 0) {
                 if (err == 0)
                         goto done;
@@ -710,8 +904,7 @@ more:
         p += sizeof(struct perf_event_header);
  
         if (size - sizeof(struct perf_event_header)) {
-               err = do_read(self->fd, p,
-                             size - sizeof(struct perf_event_header));
+               err = readn(self->fd, p, size - sizeof(struct perf_event_header));
                 if (err <= 0) {
                         if (err == 0) {
                                 pr_err("unexpected end of event stream\n");
@@ -724,8 +917,7 @@ more:
         }
  
         if (size == 0 ||
-           (skip = perf_session__process_event(self, &event, ops,
-                                               0, head)) < 0) {
+           (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
                 dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
                             head, event.header.size, event.header.type);
                 /*
@@ -740,9 +932,6 @@ more:
  
         head += size;
  
-       dump_printf("\n%#Lx [%#x]: event: %d\n",
-                   head, event.header.size, event.header.type);
-
         if (skip > 0)
                 head += skip;
  
@@ -751,82 +940,91 @@ more:
  done:
         err = 0;
  out_err:
+       perf_session__warn_about_errors(self, ops);
+       perf_session_free_sample_buffers(self);
         return err;
  }
  
-int __perf_session__process_events(struct perf_session *self,
+int __perf_session__process_events(struct perf_session *session,
                                    u64 data_offset, u64 data_size,
                                    u64 file_size, struct perf_event_ops *ops)
  {
-       int err, mmap_prot, mmap_flags;
-       u64 head, shift;
-       u64 offset = 0;
-       size_t  page_size;
+       u64 head, page_offset, file_offset, file_pos, progress_next;
+       int err, mmap_prot, mmap_flags, map_idx = 0;
+       struct ui_progress *progress;
+       size_t  page_size, mmap_size;
+       char *buf, *mmaps[8];
         event_t *event;
         uint32_t size;
-       char *buf;
-       struct ui_progress *progress = ui_progress__new("Processing events...",
-                                                       self->size);
-       if (progress == NULL)
-               return -1;
  
         perf_event_ops__fill_defaults(ops);
  
         page_size = sysconf(_SC_PAGESIZE);
  
-       head = data_offset;
-       shift = page_size * (head / page_size);
-       offset += shift;
-       head -= shift;
+       page_offset = page_size * (data_offset / page_size);
+       file_offset = page_offset;
+       head = data_offset - page_offset;
+
+       if (data_offset + data_size < file_size)
+               file_size = data_offset + data_size;
+
+       progress_next = file_size / 16;
+       progress = ui_progress__new("Processing events...", file_size);
+       if (progress == NULL)
+               return -1;
+
+       mmap_size = session->mmap_window;
+       if (mmap_size > file_size)
+               mmap_size = file_size;
+
+       memset(mmaps, 0, sizeof(mmaps));
  
         mmap_prot  = PROT_READ;
         mmap_flags = MAP_SHARED;
  
-       if (self->header.needs_swap) {
+       if (session->header.needs_swap) {
                 mmap_prot  |= PROT_WRITE;
                 mmap_flags = MAP_PRIVATE;
         }
  remap:
-       buf = mmap(NULL, page_size * self->mmap_window, mmap_prot,
-                  mmap_flags, self->fd, offset);
+       buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd,
+                  file_offset);
         if (buf == MAP_FAILED) {
                 pr_err("failed to mmap file\n");
                 err = -errno;
                 goto out_err;
         }
+       mmaps[map_idx] = buf;
+       map_idx = (map_idx + 1) & (ARRAY_SIZE(mmaps) - 1);
+       file_pos = file_offset + head;
  
  more:
         event = (event_t *)(buf + head);
-       ui_progress__update(progress, offset);
  
-       if (self->header.needs_swap)
+       if (session->header.needs_swap)
                 perf_event_header__bswap(&event->header);
         size = event->header.size;
         if (size == 0)
                 size = 8;
  
-       if (head + event->header.size >= page_size * self->mmap_window) {
-               int munmap_ret;
-
-               shift = page_size * (head / page_size);
-
-               munmap_ret = munmap(buf, page_size * self->mmap_window);
-               assert(munmap_ret == 0);
+       if (head + event->header.size >= mmap_size) {
+               if (mmaps[map_idx]) {
+                       munmap(mmaps[map_idx], mmap_size);
+                       mmaps[map_idx] = NULL;
+               }
  
-               offset += shift;
-               head -= shift;
+               page_offset = page_size * (head / page_size);
+               file_offset += page_offset;
+               head -= page_offset;
                 goto remap;
         }
  
         size = event->header.size;
  
-       dump_printf("\n%#Lx [%#x]: event: %d\n",
-                   offset + head, event->header.size, event->header.type);
-
         if (size == 0 ||
-           perf_session__process_event(self, event, ops, offset, head) < 0) {
+           perf_session__process_event(session, event, ops, file_pos) < 0) {
                 dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
-                           offset + head, event->header.size,
+                           file_offset + head, event->header.size,
                             event->header.type);
                 /*
                  * assume we lost track of the stream, check alignment, and
@@ -839,19 +1037,24 @@ more:
         }
  
         head += size;
+       file_pos += size;
  
-       if (offset + head >= data_offset + data_size)
-               goto done;
+       if (file_pos >= progress_next) {
+               progress_next += file_size / 16;
+               ui_progress__update(progress, file_pos);
+       }
  
-       if (offset + head < file_size)
+       if (file_pos < file_size)
                 goto more;
-done:
+
         err = 0;
         /* do the final flush for ordered samples */
-       self->ordered_samples.next_flush = ULLONG_MAX;
-       flush_sample_queue(self, ops);
+       session->ordered_samples.next_flush = ULLONG_MAX;
+       flush_sample_queue(session, ops);
  out_err:
         ui_progress__delete(progress);
+       perf_session__warn_about_errors(session, ops);
+       perf_session_free_sample_buffers(session);
         return err;
  }
  
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h

index 9fa0fc2a863f1259caf8caf7494f867d0737a297..decd83f274fd3642fecf039740e75a015431df4d 100644 (file)
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -17,8 +17,12 @@ struct ordered_samples {
         u64                     last_flush;
         u64                     next_flush;
         u64                     max_timestamp;
-       struct list_head        samples_head;
-       struct sample_queue     *last_inserted;
+       struct list_head        samples;
+       struct list_head        sample_cache;
+       struct list_head        to_free;
+       struct sample_queue     *sample_buffer;
+       struct sample_queue     *last_sample;
+       int                     sample_buffer_idx;
  };
  
  struct perf_session {
@@ -42,6 +46,8 @@ struct perf_session {
         int                     fd;
         bool                    fd_pipe;
         bool                    repipe;
+       bool                    sample_id_all;
+       u16                     id_hdr_size;
         int                     cwdlen;
         char                    *cwd;
         struct ordered_samples  ordered_samples;
@@ -50,7 +56,9 @@ struct perf_session {
  
  struct perf_event_ops;
  
-typedef int (*event_op)(event_t *self, struct perf_session *session);
+typedef int (*event_op)(event_t *self, struct sample_data *sample,
+                       struct perf_session *session);
+typedef int (*event_synth_op)(event_t *self, struct perf_session *session);
  typedef int (*event_op2)(event_t *self, struct perf_session *session,
                          struct perf_event_ops *ops);
  
@@ -63,16 +71,19 @@ struct perf_event_ops {
                         lost,
                         read,
                         throttle,
-                       unthrottle,
-                       attr,
+                       unthrottle;
+       event_synth_op  attr,
                         event_type,
                         tracing_data,
                         build_id;
         event_op2       finished_round;
         bool            ordered_samples;
+       bool            ordering_requires_timestamps;
  };
  
-struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
+struct perf_session *perf_session__new(const char *filename, int mode,
+                                      bool force, bool repipe,
+                                      struct perf_event_ops *ops);
  void perf_session__delete(struct perf_session *self);
  
  void perf_event_header__bswap(struct perf_event_header *self);
@@ -98,8 +109,9 @@ void mem_bswap_64(void *src, int byte_size);
  
  int perf_session__create_kernel_maps(struct perf_session *self);
  
-int do_read(int fd, void *buf, size_t size);
  void perf_session__update_sample_type(struct perf_session *self);
+void perf_session__set_sample_id_all(struct perf_session *session, bool value);
+void perf_session__set_sample_type(struct perf_session *session, u64 type);
  void perf_session__remove_thread(struct perf_session *self, struct thread *th);
  
  static inline
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c

index b62a553cc67d969c104638b9bf33922f351f06bb..f44fa541d56e67c6bb6c976e78123e99657ffbee 100644 (file)
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -170,7 +170,7 @@ static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf,
                 return repsep_snprintf(bf, size, "%-*s", width, dso_name);
         }
  
-       return repsep_snprintf(bf, size, "%*Lx", width, self->ip);
+       return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
  }
  
  /* --sort symbol */
@@ -196,7 +196,7 @@ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
  
         if (verbose) {
                 char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
-               ret += repsep_snprintf(bf, size, "%*Lx %c ",
+               ret += repsep_snprintf(bf, size, "%-#*llx %c ",
                                        BITS_PER_LONG / 4, self->ip, o);
         }
  
@@ -205,7 +205,7 @@ static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
                 ret += repsep_snprintf(bf + ret, size - ret, "%s",
                                        self->ms.sym->name);
         else
-               ret += repsep_snprintf(bf + ret, size - ret, "%*Lx",
+               ret += repsep_snprintf(bf + ret, size - ret, "%-#*llx",
                                        BITS_PER_LONG / 4, self->ip);
  
         return ret;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c

index 439ab947daf4af9da352caac290d372ce23846ac..15ccfba8cdf805111d56b1e7f3bf71431ef2c1e4 100644 (file)
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -22,6 +22,10 @@
  #include <limits.h>
  #include <sys/utsname.h>
  
+#ifndef KSYM_NAME_LEN
+#define KSYM_NAME_LEN 128
+#endif
+
  #ifndef NT_GNU_BUILD_ID
  #define NT_GNU_BUILD_ID 3
  #endif
@@ -41,6 +45,7 @@ struct symbol_conf symbol_conf = {
         .exclude_other    = true,
         .use_modules      = true,
         .try_vmlinux_path = true,
+       .symfs            = "",
  };
  
  int dso__name_len(const struct dso *self)
@@ -92,7 +97,7 @@ static void symbols__fixup_end(struct rb_root *self)
                 prev = curr;
                 curr = rb_entry(nd, struct symbol, rb_node);
  
-               if (prev->end == prev->start)
+               if (prev->end == prev->start && prev->end != curr->start)
                         prev->end = curr->start - 1;
         }
  
@@ -121,7 +126,7 @@ static void __map_groups__fixup_end(struct map_groups *self, enum map_type type)
          * We still haven't the actual symbols, so guess the
          * last map final address.
          */
-       curr->end = ~0UL;
+       curr->end = ~0ULL;
  }
  
  static void map_groups__fixup_end(struct map_groups *self)
@@ -425,16 +430,25 @@ size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp)
  
  int kallsyms__parse(const char *filename, void *arg,
                     int (*process_symbol)(void *arg, const char *name,
-                                                    char type, u64 start))
+                                         char type, u64 start, u64 end))
  {
         char *line = NULL;
         size_t n;
-       int err = 0;
+       int err = -1;
+       u64 prev_start = 0;
+       char prev_symbol_type = 0;
+       char *prev_symbol_name;
         FILE *file = fopen(filename, "r");
  
         if (file == NULL)
                 goto out_failure;
  
+       prev_symbol_name = malloc(KSYM_NAME_LEN);
+       if (prev_symbol_name == NULL)
+               goto out_close;
+
+       err = 0;
+
         while (!feof(file)) {
                 u64 start;
                 int line_len, len;
@@ -454,14 +468,33 @@ int kallsyms__parse(const char *filename, void *arg,
                         continue;
  
                 symbol_type = toupper(line[len]);
-               symbol_name = line + len + 2;
+               len += 2;
+               symbol_name = line + len;
+               len = line_len - len;
  
-               err = process_symbol(arg, symbol_name, symbol_type, start);
-               if (err)
+               if (len >= KSYM_NAME_LEN) {
+                       err = -1;
                         break;
+               }
+
+               if (prev_symbol_type) {
+                       u64 end = start;
+                       if (end != prev_start)
+                               --end;
+                       err = process_symbol(arg, prev_symbol_name,
+                                            prev_symbol_type, prev_start, end);
+                       if (err)
+                               break;
+               }
+
+               memcpy(prev_symbol_name, symbol_name, len + 1);
+               prev_symbol_type = symbol_type;
+               prev_start = start;
         }
  
+       free(prev_symbol_name);
         free(line);
+out_close:
         fclose(file);
         return err;
  
@@ -483,7 +516,7 @@ static u8 kallsyms2elf_type(char type)
  }
  
  static int map__process_kallsym_symbol(void *arg, const char *name,
-                                      char type, u64 start)
+                                      char type, u64 start, u64 end)
  {
         struct symbol *sym;
         struct process_kallsyms_args *a = arg;
@@ -492,11 +525,8 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
         if (!symbol_type__is_a(type, a->map->type))
                 return 0;
  
-       /*
-        * Will fix up the end later, when we have all symbols sorted.
-        */
-       sym = symbol__new(start, 0, kallsyms2elf_type(type), name);
-
+       sym = symbol__new(start, end - start + 1,
+                         kallsyms2elf_type(type), name);
         if (sym == NULL)
                 return -ENOMEM;
         /*
@@ -649,7 +679,6 @@ int dso__load_kallsyms(struct dso *self, const char *filename,
         if (dso__load_all_kallsyms(self, filename, map) < 0)
                 return -1;
  
-       symbols__fixup_end(&self->symbols[map->type]);
         if (self->kernel == DSO_TYPE_GUEST_KERNEL)
                 self->origin = DSO__ORIG_GUEST_KERNEL;
         else
@@ -839,8 +868,11 @@ static int dso__synthesize_plt_symbols(struct  dso *self, struct map *map,
         char sympltname[1024];
         Elf *elf;
         int nr = 0, symidx, fd, err = 0;
+       char name[PATH_MAX];
  
-       fd = open(self->long_name, O_RDONLY);
+       snprintf(name, sizeof(name), "%s%s",
+                symbol_conf.symfs, self->long_name);
+       fd = open(name, O_RDONLY);
         if (fd < 0)
                 goto out;
  
@@ -1452,16 +1484,19 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
              self->origin++) {
                 switch (self->origin) {
                 case DSO__ORIG_BUILD_ID_CACHE:
-                       if (dso__build_id_filename(self, name, size) == NULL)
+                       /* skip the locally configured cache if a symfs is given */
+                       if (symbol_conf.symfs[0] ||
+                           (dso__build_id_filename(self, name, size) == NULL)) {
                                 continue;
+                       }
                         break;
                 case DSO__ORIG_FEDORA:
-                       snprintf(name, size, "/usr/lib/debug%s.debug",
-                                self->long_name);
+                       snprintf(name, size, "%s/usr/lib/debug%s.debug",
+                                symbol_conf.symfs, self->long_name);
                         break;
                 case DSO__ORIG_UBUNTU:
-                       snprintf(name, size, "/usr/lib/debug%s",
-                                self->long_name);
+                       snprintf(name, size, "%s/usr/lib/debug%s",
+                                symbol_conf.symfs, self->long_name);
                         break;
                 case DSO__ORIG_BUILDID: {
                         char build_id_hex[BUILD_ID_SIZE * 2 + 1];
@@ -1473,19 +1508,26 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
                                           sizeof(self->build_id),
                                           build_id_hex);
                         snprintf(name, size,
-                                "/usr/lib/debug/.build-id/%.2s/%s.debug",
-                                build_id_hex, build_id_hex + 2);
+                                "%s/usr/lib/debug/.build-id/%.2s/%s.debug",
+                                symbol_conf.symfs, build_id_hex, build_id_hex + 2);
                         }
                         break;
                 case DSO__ORIG_DSO:
-                       snprintf(name, size, "%s", self->long_name);
+                       snprintf(name, size, "%s%s",
+                            symbol_conf.symfs, self->long_name);
                         break;
                 case DSO__ORIG_GUEST_KMODULE:
                         if (map->groups && map->groups->machine)
                                 root_dir = map->groups->machine->root_dir;
                         else
                                 root_dir = "";
-                       snprintf(name, size, "%s%s", root_dir, self->long_name);
+                       snprintf(name, size, "%s%s%s", symbol_conf.symfs,
+                                root_dir, self->long_name);
+                       break;
+
+               case DSO__ORIG_KMODULE:
+                       snprintf(name, size, "%s%s", symbol_conf.symfs,
+                                self->long_name);
                         break;
  
                 default:
@@ -1784,17 +1826,20 @@ int dso__load_vmlinux(struct dso *self, struct map *map,
                       const char *vmlinux, symbol_filter_t filter)
  {
         int err = -1, fd;
+       char symfs_vmlinux[PATH_MAX];
  
-       fd = open(vmlinux, O_RDONLY);
+       snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s/%s",
+                symbol_conf.symfs, vmlinux);
+       fd = open(symfs_vmlinux, O_RDONLY);
         if (fd < 0)
                 return -1;
  
         dso__set_loaded(self, map->type);
-       err = dso__load_sym(self, map, vmlinux, fd, filter, 0, 0);
+       err = dso__load_sym(self, map, symfs_vmlinux, fd, filter, 0, 0);
         close(fd);
  
         if (err > 0)
-               pr_debug("Using %s for symbols\n", vmlinux);
+               pr_debug("Using %s for symbols\n", symfs_vmlinux);
  
         return err;
  }
@@ -1836,8 +1881,8 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map,
         const char *kallsyms_filename = NULL;
         char *kallsyms_allocated_filename = NULL;
         /*
-        * Step 1: if the user specified a vmlinux filename, use it and only
-        * it, reporting errors to the user if it cannot be used.
+        * Step 1: if the user specified a kallsyms or vmlinux filename, use
+        * it and only it, reporting errors to the user if it cannot be used.
          *
          * For instance, try to analyse an ARM perf.data file _without_ a
          * build-id, or if the user specifies the wrong path to the right
@@ -1850,6 +1895,11 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map,
          * validation in dso__load_vmlinux and will bail out if they don't
          * match.
          */
+       if (symbol_conf.kallsyms_name != NULL) {
+               kallsyms_filename = symbol_conf.kallsyms_name;
+               goto do_kallsyms;
+       }
+
         if (symbol_conf.vmlinux_name != NULL) {
                 err = dso__load_vmlinux(self, map,
                                         symbol_conf.vmlinux_name, filter);
@@ -1867,6 +1917,10 @@ static int dso__load_kernel_sym(struct dso *self, struct map *map,
                         goto out_fixup;
         }
  
+       /* do not try local files if a symfs was given */
+       if (symbol_conf.symfs[0] != 0)
+               return -1;
+
         /*
          * Say the kernel DSO was created when processing the build-id header table,
          * we have a build-id, so check if it is the same as the running kernel,
@@ -2136,7 +2190,7 @@ struct process_args {
  };
  
  static int symbol__in_kernel(void *arg, const char *name,
-                            char type __used, u64 start)
+                            char type __used, u64 start, u64 end __used)
  {
         struct process_args *args = arg;
  
@@ -2257,9 +2311,6 @@ static int vmlinux_path__init(void)
         struct utsname uts;
         char bf[PATH_MAX];
  
-       if (uname(&uts) < 0)
-               return -1;
-
         vmlinux_path = malloc(sizeof(char *) * 5);
         if (vmlinux_path == NULL)
                 return -1;
@@ -2272,6 +2323,14 @@ static int vmlinux_path__init(void)
         if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
                 goto out_fail;
         ++vmlinux_path__nr_entries;
+
+       /* only try running kernel version if no symfs was given */
+       if (symbol_conf.symfs[0] != 0)
+               return 0;
+
+       if (uname(&uts) < 0)
+               return -1;
+
         snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
         vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
         if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
@@ -2331,6 +2390,8 @@ static int setup_list(struct strlist **list, const char *list_str,
  
  int symbol__init(void)
  {
+       const char *symfs;
+
         if (symbol_conf.initialized)
                 return 0;
  
@@ -2359,6 +2420,18 @@ int symbol__init(void)
                        symbol_conf.sym_list_str, "symbol") < 0)
                 goto out_free_comm_list;
  
+       /*
+        * A path to symbols of "/" is identical to ""
+        * reset here for simplicity.
+        */
+       symfs = realpath(symbol_conf.symfs, NULL);
+       if (symfs == NULL)
+               symfs = symbol_conf.symfs;
+       if (strcmp(symfs, "/") == 0)
+               symbol_conf.symfs = "";
+       if (symfs != symbol_conf.symfs)
+               free((void *)symfs);
+
         symbol_conf.initialized = true;
         return 0;
  
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h

index 6c6eafdb932dacefd29c3ea21e5bef90a80cc9a9..670cd1c88f54dc932d18b7c79d609d61e7560295 100644 (file)
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -72,6 +72,7 @@ struct symbol_conf {
                         show_cpu_utilization,
                         initialized;
         const char      *vmlinux_name,
+                       *kallsyms_name,
                         *source_prefix,
                         *field_sep;
         const char      *default_guest_vmlinux_name,
@@ -85,6 +86,7 @@ struct symbol_conf {
         struct strlist  *dso_list,
                         *comm_list,
                         *sym_list;
+       const char      *symfs;
  };
  
  extern struct symbol_conf symbol_conf;
@@ -215,7 +217,7 @@ bool __dsos__read_build_ids(struct list_head *head, bool with_hits);
  int build_id__sprintf(const u8 *self, int len, char *bf);
  int kallsyms__parse(const char *filename, void *arg,
                     int (*process_symbol)(void *arg, const char *name,
-                                         char type, u64 start));
+                                         char type, u64 start, u64 end));
  
  void machine__destroy_kernel_maps(struct machine *self);
  int __machine__create_kernel_maps(struct machine *self, struct dso *kernel);
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c

index 8c72d888e449989dcdf1c349fdc4a97ead6b694a..00f4eade2e3e9c1fdd2e9c92c83222a10c1555fd 100644 (file)
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -16,35 +16,50 @@ static int filter(const struct dirent *dir)
                 return 1;
  }
  
-int find_all_tid(int pid, pid_t ** all_tid)
+struct thread_map *thread_map__new_by_pid(pid_t pid)
  {
+       struct thread_map *threads;
         char name[256];
         int items;
         struct dirent **namelist = NULL;
-       int ret = 0;
         int i;
  
         sprintf(name, "/proc/%d/task", pid);
         items = scandir(name, &namelist, filter, NULL);
         if (items <= 0)
-                return -ENOENT;
-       *all_tid = malloc(sizeof(pid_t) * items);
-       if (!*all_tid) {
-               ret = -ENOMEM;
-               goto failure;
-       }
-
-       for (i = 0; i < items; i++)
-               (*all_tid)[i] = atoi(namelist[i]->d_name);
+                return NULL;
  
-       ret = items;
+       threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
+       if (threads != NULL) {
+               for (i = 0; i < items; i++)
+                       threads->map[i] = atoi(namelist[i]->d_name);
+               threads->nr = items;
+       }
  
-failure:
         for (i=0; i<items; i++)
                 free(namelist[i]);
         free(namelist);
  
-       return ret;
+       return threads;
+}
+
+struct thread_map *thread_map__new_by_tid(pid_t tid)
+{
+       struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+
+       if (threads != NULL) {
+               threads->map[0] = tid;
+               threads->nr     = 1;
+       }
+
+       return threads;
+}
+
+struct thread_map *thread_map__new(pid_t pid, pid_t tid)
+{
+       if (pid != -1)
+               return thread_map__new_by_pid(pid);
+       return thread_map__new_by_tid(tid);
  }
  
  static struct thread *thread__new(pid_t pid)
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h

index 688500ff826f008a4c48cd5bf782ad60c1363325..d7574101054a8ae9b6ded40fd0a813c3cacf9965 100644 (file)
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -18,11 +18,24 @@ struct thread {
         int                     comm_len;
  };
  
+struct thread_map {
+       int nr;
+       int map[];
+};
+
  struct perf_session;
  
  void thread__delete(struct thread *self);
  
-int find_all_tid(int pid, pid_t ** all_tid);
+struct thread_map *thread_map__new_by_pid(pid_t pid);
+struct thread_map *thread_map__new_by_tid(pid_t tid);
+struct thread_map *thread_map__new(pid_t pid, pid_t tid);
+
+static inline void thread_map__delete(struct thread_map *threads)
+{
+       free(threads);
+}
+
  int thread__set_comm(struct thread *self, const char *comm);
  int thread__comm_len(struct thread *self);
  struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c

index b1572601286cad7020c323d175177d0445eb806f..35729f4c40cb7a98e891a2013d4b6e812a24237e 100644 (file)
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -34,11 +34,13 @@
  #include <ctype.h>
  #include <errno.h>
  #include <stdbool.h>
+#include <linux/list.h>
  #include <linux/kernel.h>
  
  #include "../perf.h"
  #include "trace-event.h"
  #include "debugfs.h"
+#include "evsel.h"
  
  #define VERSION "0.5"
  
@@ -469,16 +471,17 @@ out:
  }
  
  static struct tracepoint_path *
-get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
+get_tracepoints_path(struct list_head *pattrs)
  {
         struct tracepoint_path path, *ppath = &path;
-       int i, nr_tracepoints = 0;
+       struct perf_evsel *pos;
+       int nr_tracepoints = 0;
  
-       for (i = 0; i < nb_events; i++) {
-               if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
+       list_for_each_entry(pos, pattrs, node) {
+               if (pos->attr.type != PERF_TYPE_TRACEPOINT)
                         continue;
                 ++nr_tracepoints;
-               ppath->next = tracepoint_id_to_path(pattrs[i].config);
+               ppath->next = tracepoint_id_to_path(pos->attr.config);
                 if (!ppath->next)
                         die("%s\n", "No memory to alloc tracepoints list");
                 ppath = ppath->next;
@@ -487,21 +490,21 @@ get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
         return nr_tracepoints > 0 ? path.next : NULL;
  }
  
-bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events)
+bool have_tracepoints(struct list_head *pattrs)
  {
-       int i;
+       struct perf_evsel *pos;
  
-       for (i = 0; i < nb_events; i++)
-               if (pattrs[i].type == PERF_TYPE_TRACEPOINT)
+       list_for_each_entry(pos, pattrs, node)
+               if (pos->attr.type == PERF_TYPE_TRACEPOINT)
                         return true;
  
         return false;
  }
  
-int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
+int read_tracing_data(int fd, struct list_head *pattrs)
  {
         char buf[BUFSIZ];
-       struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events);
+       struct tracepoint_path *tps = get_tracepoints_path(pattrs);
  
         /*
          * What? No tracepoints? No sense writing anything here, bail out.
@@ -545,14 +548,13 @@ int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
         return 0;
  }
  
-ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
-                              int nb_events)
+ssize_t read_tracing_data_size(int fd, struct list_head *pattrs)
  {
         ssize_t size;
         int err = 0;
  
         calc_data_size = 1;
-       err = read_tracing_data(fd, pattrs, nb_events);
+       err = read_tracing_data(fd, pattrs);
         size = calc_data_size - 1;
         calc_data_size = 0;
  
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h

index b3e86b1e44443909e518573ab0a1d49f8167f151..b5f12ca24d99f877877b701ae6f8b0d7f6b3acca 100644 (file)
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -262,9 +262,8 @@ raw_field_value(struct event *event, const char *name, void *data);
  void *raw_field_ptr(struct event *event, const char *name, void *data);
  unsigned long long eval_flag(const char *flag);
  
-int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
-ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
-                              int nb_events);
+int read_tracing_data(int fd, struct list_head *pattrs);
+ssize_t read_tracing_data_size(int fd, struct list_head *pattrs);
  
  /* taken from kernel/trace/trace.h */
  enum trace_flag_type {
diff --git a/tools/perf/util/ui/util.c b/tools/perf/util/ui/util.c

index 056c69521a38098a8053d18f3e615d3a35390fd4..7b5a8926624e49be67d802fc91afc56451bead23 100644 (file)
--- a/tools/perf/util/ui/util.c
+++ b/tools/perf/util/ui/util.c
@@ -104,10 +104,24 @@ out_destroy_form:
         return rc;
  }
  
-static const char yes[] = "Yes", no[] = "No";
+static const char yes[] = "Yes", no[] = "No",
+                 warning_str[] = "Warning!", ok[] = "Ok";
  
  bool ui__dialog_yesno(const char *msg)
  {
         /* newtWinChoice should really be accepting const char pointers... */
         return newtWinChoice(NULL, (char *)yes, (char *)no, (char *)msg) == 1;
  }
+
+void ui__warning(const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       if (use_browser > 0)
+               newtWinMessagev((char *)warning_str, (char *)ok,
+                               (char *)format, args);
+       else
+               vfprintf(stderr, format, args);
+       va_end(args);
+}
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c

index 214265674ddda0a59488cc0871a0943b20fc6bd4..5b3ea49aa63ea1ff35b7d949b49a661187aa16e6 100644 (file)
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -114,3 +114,20 @@ unsigned long convert_unit(unsigned long value, char *unit)
  
         return value;
  }
+
+int readn(int fd, void *buf, size_t n)
+{
+       void *buf_start = buf;
+
+       while (n) {
+               int ret = read(fd, buf, n);
+
+               if (ret <= 0)
+                       return ret;
+
+               n -= ret;
+               buf += ret;
+       }
+
+       return buf - buf_start;
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h

index 7562707ddd1c491755dc8ea5121637918ba1b844..e833f26f3bfc7d702cd58ba55c7ae85657fe3801 100644 (file)
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -265,6 +265,7 @@ void argv_free(char **argv);
  bool strglobmatch(const char *str, const char *pat);
  bool strlazymatch(const char *str, const char *pat);
  unsigned long convert_unit(unsigned long value, char *unit);
+int readn(int fd, void *buf, size_t size);
  
  #define _STR(x) #x
  #define STR(x) _STR(x)
diff --git a/tools/perf/util/xyarray.c b/tools/perf/util/xyarray.c

new file mode 100644 (file)

index 0000000..22afbf6
--- /dev/null
+++ b/tools/perf/util/xyarray.c
@@ -0,0 +1,20 @@
+#include "xyarray.h"
+#include "util.h"
+
+struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size)
+{
+       size_t row_size = ylen * entry_size;
+       struct xyarray *xy = zalloc(sizeof(*xy) + xlen * row_size);
+
+       if (xy != NULL) {
+               xy->entry_size = entry_size;
+               xy->row_size   = row_size;
+       }
+
+       return xy;
+}
+
+void xyarray__delete(struct xyarray *xy)
+{
+       free(xy);
+}
diff --git a/tools/perf/util/xyarray.h b/tools/perf/util/xyarray.h

new file mode 100644 (file)

index 0000000..c488a07
--- /dev/null
+++ b/tools/perf/util/xyarray.h
@@ -0,0 +1,20 @@
+#ifndef _PERF_XYARRAY_H_
+#define _PERF_XYARRAY_H_ 1
+
+#include <sys/types.h>
+
+struct xyarray {
+       size_t row_size;
+       size_t entry_size;
+       char contents[];
+};
+
+struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size);
+void xyarray__delete(struct xyarray *xy);
+
+static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
+{
+       return &xy->contents[x * xy->row_size + y * xy->entry_size];
+}
+
+#endif /* _PERF_XYARRAY_H_ */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Jan 2011 19:11:50 +0000 (11:11 -0800)