* 'x86-alternatives-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, suspend: Avoid unnecessary smp alternatives switch during suspend/resume
* 'x86-fpu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86-64, asm: Use fxsaveq/fxrestorq in more places
* 'x86-hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, hwmon: Add core threshold notification to therm_throt.c
* 'x86-paravirt-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, paravirt: Use native_halt on a halt, not native_safe_halt
* 'core-locking-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
locking, lockdep: Convert sprintf_symbol to %pS
* 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
irq: Better struct irqaction layout
W: http://oops.ghostprotocols.net:81/blog/
P: 1024D/9224DF01 D5DF E3BB E3C8 BCBB F8AD 841A B6AB 4681 9224 DF01
D: IPX, LLC, DCCP, cyc2x, wl3501_cs, net/ hacks
-S: R. Brasílio Itiberê, 4270/1010 - Água Verde
-S: 80240-060 - Curitiba - Paraná
S: Brazil
N: Karsten Merker
CONFIG_RCU_TRACE debugfs Files and Formats
-The rcutree implementation of RCU provides debugfs trace output that
-summarizes counters and state. This information is useful for debugging
-RCU itself, and can sometimes also help to debug abuses of RCU.
-The following sections describe the debugfs files and formats.
+The rcutree and rcutiny implementations of RCU provide debugfs trace
+output that summarizes counters and state. This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+The following sections describe the debugfs files and formats, first
+for rcutree and next for rcutiny.
-Hierarchical RCU debugfs Files and Formats
+CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats
-This implementation of RCU provides three debugfs files under the
+These implementations of RCU provides five debugfs files under the
top-level directory RCU: rcu/rcudata (which displays fields in struct
-rcu_data), rcu/rcugp (which displays grace-period counters), and
-rcu/rcuhier (which displays the struct rcu_node hierarchy).
+rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of
+rcu/rcudata), rcu/rcugp (which displays grace-period counters),
+rcu/rcuhier (which displays the struct rcu_node hierarchy), and
+rcu/rcu_pending (which displays counts of the reasons that the
+rcu_pending() function decided that there was core RCU work to do).
The output of "cat rcu/rcudata" looks as follows:
been registered in absence of CPU-hotplug activity.
o "co" is the number of RCU callbacks that have been orphaned due to
- this CPU going offline.
+ this CPU going offline. These orphaned callbacks have been moved
+ to an arbitrarily chosen online CPU.
o "ca" is the number of RCU callbacks that have been adopted due to
other CPUs going offline. Note that ci+co-ca+ql is the number of
The output of "cat rcu/rcuhier" looks as follows, with very long lines:
-c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0
+c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6
1/1 .>. 0:127 ^0
3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
rcu_bh:
-c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0
+c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0
0/1 .>. 0:127 ^0
0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3
0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3
exited immediately (without even being counted in nfqs above)
due to contention on ->fqslock.
-o "oqlen" is the number of callbacks on the "orphan" callback
- list. RCU callbacks are placed on this list by CPUs going
- offline, and are "adopted" either by the CPU helping the outgoing
- CPU or by the next rcu_barrier*() call, whichever comes first.
-
o Each element of the form "1/1 0:127 ^0" represents one struct
rcu_node. Each line represents one level of the hierarchy, from
root to leaves. It is best to think of the rcu_data structures
readers will note that the rcu "nn" number for a given CPU very
closely matches the rcu_bh "np" number for that same CPU. This
is due to short-circuit evaluation in rcu_pending().
+
+
+CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats
+
+These implementations of RCU provides a single debugfs file under the
+top-level directory RCU, namely rcu/rcudata, which displays fields in
+rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU,
+rcu_preempt_ctrlblk.
+
+The output of "cat rcu/rcudata" is as follows:
+
+rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=...
+ ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274
+ normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0
+ exp balk: bt=0 nos=0
+rcu_sched: qlen: 0
+rcu_bh: qlen: 0
+
+This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the
+rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds.
+The last three lines of the rcu_preempt section appear only in
+CONFIG_RCU_BOOST kernel builds. The fields are as follows:
+
+o "qlen" is the number of RCU callbacks currently waiting either
+ for an RCU grace period or waiting to be invoked. This is the
+ only field present for rcu_sched and rcu_bh, due to the
+ short-circuiting of grace period in those two cases.
+
+o "gp" is the number of grace periods that have completed.
+
+o "g197/p197/c197" displays the grace-period state, with the
+ "g" number being the number of grace periods that have started
+ (mod 256), the "p" number being the number of grace periods
+ that the CPU has responded to (also mod 256), and the "c"
+ number being the number of grace periods that have completed
+ (once again mode 256).
+
+ Why have both "gp" and "g"? Because the data flowing into
+ "gp" is only present in a CONFIG_RCU_TRACE kernel.
+
+o "tasks" is a set of bits. The first bit is "T" if there are
+ currently tasks that have recently blocked within an RCU
+ read-side critical section, the second bit is "N" if any of the
+ aforementioned tasks are blocking the current RCU grace period,
+ and the third bit is "E" if any of the aforementioned tasks are
+ blocking the current expedited grace period. Each bit is "."
+ if the corresponding condition does not hold.
+
+o "ttb" is a single bit. It is "B" if any of the blocked tasks
+ need to be priority boosted and "." otherwise.
+
+o "btg" indicates whether boosting has been carried out during
+ the current grace period, with "exp" indicating that boosting
+ is in progress for an expedited grace period, "no" indicating
+ that boosting has not yet started for a normal grace period,
+ "begun" indicating that boosting has bebug for a normal grace
+ period, and "done" indicating that boosting has completed for
+ a normal grace period.
+
+o "ntb" is the total number of tasks subjected to RCU priority boosting
+ periods since boot.
+
+o "neb" is the number of expedited grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "nnb" is the number of normal grace periods that have had
+ to resort to RCU priority boosting since boot.
+
+o "j" is the low-order 12 bits of the jiffies counter in hexadecimal.
+
+o "bt" is the low-order 12 bits of the value that the jiffies counter
+ will have at the next time that boosting is scheduled to begin.
+
+o In the line beginning with "normal balk", the fields are as follows:
+
+ o "nt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+ Note that the system will balk from boosting even if the
+ grace period is overdue when the currently running task
+ is looping within an RCU read-side critical section.
+ There is no point in boosting in this case, because
+ boosting a running task won't make it run any faster.
+
+ o "gt" is the number of times that the system balked
+ from boosting because, although there were blocked tasks,
+ none of them were preventing the current grace period
+ from completing.
+
+ o "bt" is the number of times that the system balked
+ from boosting because boosting was already in progress.
+
+ o "b" is the number of times that the system balked from
+ boosting because boosting had already completed for
+ the grace period in question.
+
+ o "ny" is the number of times that the system balked from
+ boosting because it was not yet time to start boosting
+ the grace period in question.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons. This can actually happen due to races involving
+ increments of the jiffies counter.
+
+o In the line beginning with "exp balk", the fields are as follows:
+
+ o "bt" is the number of times that the system balked from
+ boosting because there were no blocked tasks to boost.
+
+ o "nos" is the number of times that the system balked from
+ boosting for inexplicable ("not otherwise specified")
+ reasons.
aic7*seq.h*
aicasm
aicdb.h*
+altivec1.c
+altivec2.c
+altivec4.c
+altivec8.c
asm-offsets.h
asm_offsets.h
autoconf.h*
build
bvmlinux
bzImage*
+capflags.c
classlist.h*
comp*.log
compile.h*
docproc
elf2ecoff
elfconfig.h*
+evergreen_reg_safe.h
fixdep
flask.h
fore200e_mkfirm
*_gray256.c
ihex2fw
ikconfig.h*
+inat-tables.c
initramfs_data.cpio
initramfs_data.cpio.gz
initramfs_list
+int16.c
+int1.c
+int2.c
+int32.c
+int4.c
+int8.c
kallsyms
kconfig
keywords.c
mktables
mktree
modpost
+modules.builtin
modules.order
modversions.h*
ncscope.*
pca200e_ecd.bin2
piggy.gz
piggyback
+piggy.S
pnmtologo
ppc_defs.h*
pss_boot.h
qconf
+r100_reg_safe.h
+r200_reg_safe.h
+r300_reg_safe.h
+r420_reg_safe.h
+r600_reg_safe.h
raid6altivec*.c
raid6int*.c
raid6tables.c
relocs
+rn50_reg_safe.h
+rs600_reg_safe.h
+rv515_reg_safe.h
series
setup
setup.bin
sm_tbl*
split-include
syscalltab.h
+tables.c
tags
tftpboot.img
timeconst.h
vmlinux-*
vmlinux.aout
vmlinux.lds
+voffset.h
vsyscall.lds
vsyscall_32.lds
wanxlfw.inc
wakeup.lds
zImage*
zconf.hash.c
+zoffset.h
char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen);
locking rules:
- none have BKL
dcache_lock rename_lock ->d_lock may block
d_revalidate: no no no yes
d_hash no no no yes
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*readlink) (struct dentry *, char __user *,int);
- int (*follow_link) (struct dentry *, struct nameidata *);
+ void * (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, struct nameidata *);
+ int (*check_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
+ void (*truncate_range)(struct inode *, loff_t, loff_t);
+ long (*fallocate)(struct inode *inode, int mode, loff_t offset, loff_t len);
+ int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
locking rules:
- all may block, none have BKL
+ all may block
i_mutex(inode)
lookup: yes
create: yes
rename: yes (all) (see below)
readlink: no
follow_link: no
+put_link: no
truncate: yes (see below)
setattr: yes
permission: no
+check_acl: no
getattr: no
setxattr: yes
getxattr: no
listxattr: no
removexattr: yes
+truncate_range: yes
+fallocate: no
+fiemap: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
->truncate() is never called directly - it's a callback, not a
-method. It's called by vmtruncate() - library function normally used by
+method. It's called by vmtruncate() - deprecated library function used by
->setattr(). Locking information above applies to that call (i.e. is
inherited from ->setattr() - vmtruncate() is used when ATTR_SIZE had been
passed).
struct inode *(*alloc_inode)(struct super_block *sb);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *);
- int (*write_inode) (struct inode *, int);
+ int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*show_options)(struct seq_file *, struct vfsmount *);
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
+ int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
locking rules:
All may block [not true, see below]
- None have BKL
s_umount
alloc_inode:
destroy_inode:
show_options: no (namespace_sem)
quota_read: no (see below)
quota_write: no (see below)
+bdev_try_to_free_page: no (see below)
->statfs() has s_umount (shared) when called by ustat(2) (native or
compat), but that's an accident of bad API; s_umount is used to pin
dqio_sem) (unless an admin really wants to screw up something and
writes to quota files with quotas on). For other details about locking
see also dquot_operations section.
+->bdev_try_to_free_page is called from the ->releasepage handler of
+the block device inode. See there for more details.
--------------------------- file_system_type ---------------------------
prototypes:
int (*get_sb) (struct file_system_type *, int,
const char *, void *, struct vfsmount *);
+ struct dentry *(*mount) (struct file_system_type *, int,
+ const char *, void *);
void (*kill_sb) (struct super_block *);
locking rules:
- may block BKL
-get_sb yes no
-kill_sb yes no
+ may block
+get_sb yes
+mount yes
+kill_sb yes
->get_sb() returns error or 0 with locked superblock attached to the vfsmount
(exclusive on ->s_umount).
+->mount() returns ERR_PTR or the root dentry.
->kill_sb() takes a write-locked superblock, does all shutdown work on it,
unlocks and drops the reference.
void (*freepage)(struct page *);
int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
- int (*launder_page) (struct page *);
+ int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
+ unsigned long *);
+ int (*migratepage)(struct address_space *, struct page *, struct page *);
+ int (*launder_page)(struct page *);
+ int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
+ int (*error_remove_page)(struct address_space *, struct page *);
locking rules:
All except set_page_dirty and freepage may block
- BKL PageLocked(page) i_mutex
-writepage: no yes, unlocks (see below)
-readpage: no yes, unlocks
-sync_page: no maybe
-writepages: no
-set_page_dirty no no
-readpages: no
-write_begin: no locks the page yes
-write_end: no yes, unlocks yes
-perform_write: no n/a yes
-bmap: no
-invalidatepage: no yes
-releasepage: no yes
-freepage: no yes
-direct_IO: no
-launder_page: no yes
+ PageLocked(page) i_mutex
+writepage: yes, unlocks (see below)
+readpage: yes, unlocks
+sync_page: maybe
+writepages:
+set_page_dirty no
+readpages:
+write_begin: locks the page yes
+write_end: yes, unlocks yes
+bmap:
+invalidatepage: yes
+releasepage: yes
+freepage: yes
+direct_IO:
+get_xip_mem: maybe
+migratepage: yes (both)
+launder_page: yes
+is_partially_uptodate: yes
+error_remove_page: yes
->write_begin(), ->write_end(), ->sync_page() and ->readpage()
may be called from the request handler (/dev/loop).
not locked.
->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
-filesystems and by the swapper. The latter will eventually go away. All
-instances do not actually need the BKL. Please, keep it that way and don't
-breed new callers.
+filesystems and by the swapper. The latter will eventually go away. Please,
+keep it that way and don't breed new callers.
->invalidatepage() is called when the filesystem must attempt to drop
some or all of the buffers from the page when it is being truncated. It
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.
- Note: currently almost all instances of address_space methods are
-using BKL for internal serialization and that's one of the worst sources
-of contention. Normally they are calling library functions (in fs/buffer.c)
-and pass foo_get_block() as a callback (on local block-based filesystems,
-indeed). BKL is not needed for library stuff and is usually taken by
-foo_get_block(). It's an overkill, since block bitmaps can be protected by
-internal fs locking and real critical areas are much smaller than the areas
-filesystems protect now.
-
----------------------- file_lock_operations ------------------------------
prototypes:
- void (*fl_insert)(struct file_lock *); /* lock insertion callback */
- void (*fl_remove)(struct file_lock *); /* lock removal callback */
void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
void (*fl_release_private)(struct file_lock *);
locking rules:
- BKL may block
-fl_insert: yes no
-fl_remove: yes no
-fl_copy_lock: yes no
-fl_release_private: yes yes
+ file_lock_lock may block
+fl_copy_lock: yes no
+fl_release_private: maybe no
----------------------- lock_manager_operations ---------------------------
prototypes:
int (*fl_compare_owner)(struct file_lock *, struct file_lock *);
void (*fl_notify)(struct file_lock *); /* unblock callback */
+ int (*fl_grant)(struct file_lock *, struct file_lock *, int);
void (*fl_release_private)(struct file_lock *);
void (*fl_break)(struct file_lock *); /* break_lease callback */
+ int (*fl_mylease)(struct file_lock *, struct file_lock *);
+ int (*fl_change)(struct file_lock **, int);
locking rules:
- BKL may block
-fl_compare_owner: yes no
-fl_notify: yes no
-fl_release_private: yes yes
-fl_break: yes no
-
- Currently only NFSD and NLM provide instances of this class. None of the
-them block. If you have out-of-tree instances - please, show up. Locking
-in that area will change.
+ file_lock_lock may block
+fl_compare_owner: yes no
+fl_notify: yes no
+fl_grant: no no
+fl_release_private: maybe no
+fl_break: yes no
+fl_mylease: yes no
+fl_change yes no
+
--------------------------- buffer_head -----------------------------------
prototypes:
void (*b_end_io)(struct buffer_head *bh, int uptodate);
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
locking rules:
- BKL bd_mutex
-open: no yes
-release: no yes
-ioctl: no no
-compat_ioctl: no no
-direct_access: no no
-media_changed: no no
-unlock_native_capacity: no no
-revalidate_disk: no no
-getgeo: no no
-swap_slot_free_notify: no no (see below)
+ bd_mutex
+open: yes
+release: yes
+ioctl: no
+compat_ioctl: no
+direct_access: no
+media_changed: no
+unlock_native_capacity: no
+revalidate_disk: no
+getgeo: no
+swap_slot_free_notify: no (see below)
media_changed, unlock_native_capacity and revalidate_disk are called only from
check_disk_change().
unsigned long (*get_unmapped_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
+ size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
+ size_t, unsigned int);
+ int (*setlease)(struct file *, long, struct file_lock **);
};
locking rules:
- All may block.
- BKL
-llseek: no (see below)
-read: no
-aio_read: no
-write: no
-aio_write: no
-readdir: no
-poll: no
-unlocked_ioctl: no
-compat_ioctl: no
-mmap: no
-open: no
-flush: no
-release: no
-fsync: no (see below)
-aio_fsync: no
-fasync: no
-lock: yes
-readv: no
-writev: no
-sendfile: no
-sendpage: no
-get_unmapped_area: no
-check_flags: no
+ All may block except for ->setlease.
+ No VFS locks held on entry except for ->fsync and ->setlease.
+
+->fsync() has i_mutex on inode.
+
+->setlease has the file_list_lock held and must not sleep.
->llseek() locking has moved from llseek to the individual llseek
implementations. If your fs is not using generic_file_llseek, you
Note: this does not protect the file->f_pos against concurrent modifications
since this is something the userspace has to take care about.
-Note: ext2_release() was *the* source of contention on fs-intensive
-loads and dropping BKL on ->release() helps to get rid of that (we still
-grab BKL for cases when we close a file that had been opened r/w, but that
-can and should be done using the internal locking with smaller critical areas).
-Current worst offender is ext2_get_block()...
-
-->fasync() is called without BKL protection, and is responsible for
-maintaining the FASYNC bit in filp->f_flags. Most instances call
-fasync_helper(), which does that maintenance, so it's not normally
-something one needs to worry about. Return values > 0 will be mapped to
-zero in the VFS layer.
+->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
+Most instances call fasync_helper(), which does that maintenance, so it's
+not normally something one needs to worry about. Return values > 0 will be
+mapped to zero in the VFS layer.
->readdir() and ->ioctl() on directories must be changed. Ideally we would
move ->readdir() to inode_operations and use a separate method for directory
->read on directories probably must go away - we should just enforce -EISDIR
in sys_read() and friends.
-->fsync() has i_mutex on inode.
-
--------------------------- dquot_operations -------------------------------
prototypes:
int (*write_dquot) (struct dquot *);
int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
locking rules:
- BKL mmap_sem PageLocked(page)
-open: no yes
-close: no yes
-fault: no yes can return with page locked
-page_mkwrite: no yes can return with page locked
-access: no yes
+ mmap_sem PageLocked(page)
+open: yes
+close: yes
+fault: yes can return with page locked
+page_mkwrite: yes can return with page locked
+access: yes
->fault() is called when a previously not present pte is about
to be faulted in. The filesystem must find and return the page associated
(if you break something or notice that it is broken and do not fix it yourself
- at least put it here)
-
-ipc/shm.c::shm_delete() - may need BKL.
-->read() and ->write() in many drivers are (probably) missing BKL.
Notes: Further information in
http://www.oreilly.com/catalog/linuxdrive2/
- * Title: "Linux Device Drivers, 3nd Edition"
+ * Title: "Linux Device Drivers, 3rd Edition"
Authors: Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman
Publisher: O'Reilly & Associates.
Date: 2005.
Pages: 600.
ISBN: 0-13-101908-2
- * Title: "The Design and Implementation of the 4.4 BSD UNIX
- Operating System"
- Author: Marshall Kirk McKusick, Keith Bostic, Michael J. Karels,
- John S. Quarterman.
- Publisher: Addison-Wesley.
- Date: 1996.
- ISBN: 0-201-54979-4
-
* Title: "Programming for the real world - POSIX.4"
Author: Bill O. Gallmeister.
Publisher: O'Reilly & Associates, Inc..
POSIX. Good reference.
* Title: "UNIX Systems for Modern Architectures: Symmetric
- Multiprocesssing and Caching for Kernel Programmers"
+ Multiprocessing and Caching for Kernel Programmers"
Author: Curt Schimmel.
Publisher: Addison Wesley.
Date: June, 1994.
Pages: 432.
ISBN: 0-201-63338-8
- * Title: "The Design and Implementation of the 4.3 BSD UNIX
- Operating System"
- Author: Samuel J. Leffler, Marshall Kirk McKusick, Michael J.
- Karels, John S. Quarterman.
- Publisher: Addison-Wesley.
- Date: 1989 (reprinted with corrections on October, 1990).
- ISBN: 0-201-06196-1
-
- * Title: "The Design of the UNIX Operating System"
- Author: Maurice J. Bach.
- Publisher: Prentice Hall.
- Date: 1986.
- Pages: 471.
- ISBN: 0-13-201757-1
-
MISCELLANEOUS:
* Name: linux/Documentation
nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels
Format: [panic,][num]
- Valid num: 0,1,2
+ Valid num: 0
0 - turn nmi_watchdog off
- 1 - use the IO-APIC timer for the NMI watchdog
- 2 - use the local APIC for the NMI watchdog using
- a performance counter. Note: This will use one
- performance counter and the local APIC's performance
- vector.
When panic is specified, panic when an NMI watchdog
timeout occurs.
This is useful when you use a panic=... timeout and
need the box quickly up again.
- Instead of 1 and 2 it is possible to use the following
- symbolic names: lapic and ioapic
- Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
netpoll.carrier_timeout=
[NET] Specifies amount of time (in seconds) that
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.
+ noautogroup Disable scheduler automatic task group creation.
+
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.
nousb [USB] Disable the USB subsystem
- nowatchdog [KNL] Disable the lockup detector.
+ nowatchdog [KNL] Disable the lockup detector (NMI watchdog).
nowb [ARM]
to facilitate early boot debugging.
See also Documentation/trace/events.txt
- tsc= Disable clocksource-must-verify flag for TSC.
+ tsc= Disable clocksource stability checks for TSC.
Format: <string>
[x86] reliable: mark tsc clocksource as reliable, this
- disables clocksource verification at runtime.
- Used to enable high-resolution timer mode on older
- hardware, and in virtualized environment.
+ disables clocksource verification at runtime, as well
+ as the stability checks done at bootup. Used to enable
+ high-resolution timer mode on older hardware, and in
+ virtualized environment.
[x86] noirqtime: Do not use TSC to do irq accounting.
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
--- /dev/null
+
+ Subsystem Trace Points: power
+
+The power tracing system captures events related to power transitions
+within the kernel. Broadly speaking there are three major subheadings:
+
+ o Power state switch which reports events related to suspend (S-states),
+ cpuidle (C-states) and cpufreq (P-states)
+ o System clock related changes
+ o Power domains related changes and transitions
+
+This document describes what each of the tracepoints is and why they
+might be useful.
+
+Cf. include/trace/events/power.h for the events definitions.
+
+1. Power state switch events
+============================
+
+1.1 New trace API
+-----------------
+
+A 'cpu' event class gathers the CPU-related events: cpuidle and
+cpufreq.
+
+cpu_idle "state=%lu cpu_id=%lu"
+cpu_frequency "state=%lu cpu_id=%lu"
+
+A suspend event is used to indicate the system going in and out of the
+suspend mode:
+
+machine_suspend "state=%lu"
+
+
+Note: the value of '-1' or '4294967295' for state means an exit from the current state,
+i.e. trace_cpu_idle(4, smp_processor_id()) means that the system
+enters the idle state 4, while trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id())
+means that the system exits the previous idle state.
+
+The event which has 'state=4294967295' in the trace is very important to the user
+space tools which are using it to detect the end of the current state, and so to
+correctly draw the states diagrams and to calculate accurate statistics etc.
+
+1.2 DEPRECATED trace API
+------------------------
+
+A new Kconfig option CONFIG_EVENT_POWER_TRACING_DEPRECATED with the default value of
+'y' has been created. This allows the legacy trace power API to be used conjointly
+with the new trace API.
+The Kconfig option, the old trace API (in include/trace/events/power.h) and the
+old trace points will disappear in a future release (namely 2.6.41).
+
+power_start "type=%lu state=%lu cpu_id=%lu"
+power_frequency "type=%lu state=%lu cpu_id=%lu"
+power_end "cpu_id=%lu"
+
+The 'type' parameter takes one of those macros:
+ . POWER_NONE = 0,
+ . POWER_CSTATE = 1, /* C-State */
+ . POWER_PSTATE = 2, /* Fequency change or DVFS */
+
+The 'state' parameter is set depending on the type:
+ . Target C-state for type=POWER_CSTATE,
+ . Target frequency for type=POWER_PSTATE,
+
+power_end is used to indicate the exit of a state, corresponding to the latest
+power_start event.
+
+2. Clocks events
+================
+The clock events are used for clock enable/disable and for
+clock rate change.
+
+clock_enable "%s state=%lu cpu_id=%lu"
+clock_disable "%s state=%lu cpu_id=%lu"
+clock_set_rate "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the clock name (e.g. "gpio1_iclk").
+The second parameter is '1' for enable, '0' for disable, the target
+clock rate for set_rate.
+
+3. Power domains events
+=======================
+The power domain events are used for power domains transitions
+
+power_domain_target "%s state=%lu cpu_id=%lu"
+
+The first parameter gives the power domain name (e.g. "mpu_pwrdm").
+The second parameter is the power domain target state.
+
0x00000001 lguest
0x00000002 Xen
0x00000003 Moorestown MID
+ 0x00000004 CE4100 TV Platform
Field name: hardware_subarch_data
Type: write (subarch-dependent)
ARM/NOMADIK ARCHITECTURE
M: Alessandro Rubini <rubini@unipv.it>
+M: Linus Walleij <linus.walleij@stericsson.com>
M: STEricsson <STEricsson_nomadik_linux@list.st.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: arch/arm/mach-nomadik/
F: arch/arm/plat-nomadik/
+F: drivers/i2c/busses/i2c-nomadik.c
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
M: Nelson Castillo <arhuaco@freaks-unidos.net>
F: drivers/rtc/rtc-coh901331.c
F: drivers/watchdog/coh901327_wdt.c
F: drivers/dma/coh901318*
+F: drivers/mfd/ab3100*
+F: drivers/rtc/rtc-ab3100.c
+F: drivers/rtc/rtc-coh901331.c
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
-ARM/U8500 ARM ARCHITECTURE
+ARM/Ux500 ARM ARCHITECTURE
M: Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
+M: Linus Walleij <linus.walleij@stericsson.com>
L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: arch/arm/mach-ux500/
+F: drivers/dma/ste_dma40*
+F: drivers/mfd/ab3550*
+F: drivers/mfd/abx500*
+F: drivers/mfd/ab8500*
+F: drivers/mfd/stmpe*
+F: drivers/rtc/rtc-ab8500.c
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-stericsson.git
ARM/VFP SUPPORT
M: Russell King <linux@arm.linux.org.uk>
S: Maintained
F: Documentation/timers/
F: kernel/hrtimer.c
+F: kernel/time/clockevents.c
+F: kernel/time/tick*.*
+F: kernel/time/timer_*.c
+F include/linux/clockevents.h
F: include/linux/hrtimer.h
HIGH-SPEED SCC DRIVER FOR AX.25
M: Peter Zijlstra <a.p.zijlstra@chello.nl>
M: Paul Mackerras <paulus@samba.org>
M: Ingo Molnar <mingo@elte.hu>
-M: Arnaldo Carvalho de Melo <acme@redhat.com>
+M: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
S: Supported
F: kernel/perf_event*.c
F: include/linux/perf_event.h
S: Supported
F: sound/soc/s3c24xx
+TIMEKEEPING, NTP
+M: John Stultz <johnstul@us.ibm.com>
+M: Thomas Gleixner <tglx@linutronix.de>
+S: Supported
+F: include/linux/clocksource.h
+F: include/linux/time.h
+F: include/linux/timex.h
+F: include/linux/timekeeping.h
+F: kernel/time/clocksource.c
+F: kernel/time/time*.c
+F: kernel/time/ntp.c
+
TLG2300 VIDEO4LINUX-2 DRIVER
M: Huang Shijie <shijie8@gmail.com>
M: Kang Yong <kangyong@telegent.com>
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 37
-EXTRAVERSION = -rc8
+EXTRAVERSION =
NAME = Flesh-Eating Bats with Fangs
# *DOCUMENTATION*
config HAVE_ARCH_JUMP_LABEL
bool
+config HAVE_ARCH_MUTEX_CPU_RELAX
+ bool
+
source "kernel/gcov/Kconfig"
#ifndef __ASM_ALPHA_PERF_EVENT_H
#define __ASM_ALPHA_PERF_EVENT_H
-#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
-#else
-static inline void init_hw_perf_events(void) { }
-#endif
-
#endif /* __ASM_ALPHA_PERF_EVENT_H */
wrent(entInt, 0);
alpha_mv.init_irq();
-
- init_hw_perf_events();
}
/*
#include <linux/kernel.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
+#include <linux/init.h>
#include <asm/hwrpb.h>
#include <asm/atomic.h>
/*
* Init call to initialise performance events at kernel startup.
*/
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
pr_info("Performance events: ");
if (!supported_cpu()) {
pr_cont("No support for your CPU.\n");
- return;
+ return 0;
}
pr_cont("Supported CPU type!\n");
/* And set up PMU specification */
alpha_pmu = &ev67_pmu;
- perf_pmu_register(&pmu);
-}
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+ return 0;
+}
+early_initcall(init_hw_perf_events);
IT8152_PD_IRQ(0) Audio controller (ACR)
*/
#define IT8152_IRQ(x) (IRQ_BOARD_START + (x))
+#define IT8152_LAST_IRQ (IRQ_BOARD_START + 40)
/* IRQ-sources in 3 groups - local devices, LPC (serial), and external PCI */
#define IT8152_LD_IRQ_COUNT 9
extern void *kmap_high_get(struct page *page);
extern void kunmap_high(struct page *page);
-extern void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte);
-extern void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte);
-
/*
* The following functions are already defined by <linux/highmem.h>
* when CONFIG_HIGHMEM is not set.
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-/* DO NOT EDIT!! - this file automatically generated
- * from .s file by awk -f s2h.awk
- */
/* Size definitions
* Copyright (C) ARM Limited 1998. All rights reserved.
*/
/* handy sizes */
#define SZ_16 0x00000010
+#define SZ_32 0x00000020
+#define SZ_64 0x00000040
+#define SZ_128 0x00000080
#define SZ_256 0x00000100
#define SZ_512 0x00000200
#define rmb() dmb()
#define wmb() mb()
#else
+#include <asm/memory.h>
#define mb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
#define rmb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
#define wmb() do { if (arch_is_coherent()) dmb(); else barrier(); } while (0)
ldr r1, [tsk, #TI_FLAGS]
tst r1, #_TIF_WORK_MASK
bne fast_work_pending
+#if defined(CONFIG_IRQSOFF_TRACER)
+ asm_trace_hardirqs_on
+#endif
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
tst r1, #_TIF_WORK_MASK
bne work_pending
no_work_pending:
+#if defined(CONFIG_IRQSOFF_TRACER)
+ asm_trace_hardirqs_on
+#endif
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
pr_info("no hardware support available\n");
}
- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
return 0;
}
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
/*
* Callchain handling code.
* All kernel threads share the same mm context; grab a
* reference and switch to it.
*/
- atomic_inc(&mm->mm_users);
atomic_inc(&mm->mm_count);
current->active_mm = mm;
cpumask_set_cpu(cpu, mm_cpumask(mm));
config ARCH_PXA_ESERIES
bool "PXA based Toshiba e-series PDAs"
select PXA25x
+ select FB_W100
config MACH_E330
bool "Toshiba e330"
@ Let us ensure we jump to resume_after_mmu only when the mcr above
@ actually took effect. They call it the "cpwait" operation.
- mrc p15, 0, r1, c2, c0, 0 @ queue a dependency on CP15
- sub pc, r2, r1, lsr #32 @ jump to virtual addr
+ mrc p15, 0, r0, c2, c0, 0 @ queue a dependency on CP15
+ sub pc, r2, r0, lsr #32 @ jump to virtual addr
nop
nop
nop
*/
#include <linux/init.h>
+#include <linux/highmem.h>
#include <asm/cacheflush.h>
-#include <asm/kmap_types.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
#include <plat/cache-feroceon-l2.h>
-#include "mm.h"
/*
* Low-level cache maintenance operations.
* between which we don't want to be preempted.
*/
-static inline unsigned long l2_start_va(unsigned long paddr)
+static inline unsigned long l2_get_va(unsigned long paddr)
{
#ifdef CONFIG_HIGHMEM
/*
- * Let's do our own fixmap stuff in a minimal way here.
* Because range ops can't be done on physical addresses,
* we simply install a virtual mapping for it only for the
* TLB lookup to occur, hence no need to flush the untouched
- * memory mapping. This is protected with the disabling of
- * interrupts by the caller.
+ * memory mapping afterwards (note: a cache flush may happen
+ * in some circumstances depending on the path taken in kunmap_atomic).
*/
- unsigned long idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id();
- unsigned long vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- set_pte_ext(TOP_PTE(vaddr), pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL), 0);
- local_flush_tlb_kernel_page(vaddr);
- return vaddr + (paddr & ~PAGE_MASK);
+ void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT);
+ return (unsigned long)vaddr + (paddr & ~PAGE_MASK);
#else
return __phys_to_virt(paddr);
#endif
}
+static inline void l2_put_va(unsigned long vaddr)
+{
+#ifdef CONFIG_HIGHMEM
+ kunmap_atomic((void *)vaddr);
+#endif
+}
+
static inline void l2_clean_pa(unsigned long addr)
{
__asm__("mcr p15, 1, %0, c15, c9, 3" : : "r" (addr));
*/
BUG_ON((start ^ end) >> PAGE_SHIFT);
- raw_local_irq_save(flags);
- va_start = l2_start_va(start);
+ va_start = l2_get_va(start);
va_end = va_start + (end - start);
+ raw_local_irq_save(flags);
__asm__("mcr p15, 1, %0, c15, c9, 4\n\t"
"mcr p15, 1, %1, c15, c9, 5"
: : "r" (va_start), "r" (va_end));
raw_local_irq_restore(flags);
+ l2_put_va(va_start);
}
static inline void l2_clean_inv_pa(unsigned long addr)
*/
BUG_ON((start ^ end) >> PAGE_SHIFT);
- raw_local_irq_save(flags);
- va_start = l2_start_va(start);
+ va_start = l2_get_va(start);
va_end = va_start + (end - start);
+ raw_local_irq_save(flags);
__asm__("mcr p15, 1, %0, c15, c11, 4\n\t"
"mcr p15, 1, %1, c15, c11, 5"
: : "r" (va_start), "r" (va_end));
raw_local_irq_restore(flags);
+ l2_put_va(va_start);
}
static inline void l2_inv_all(void)
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/init.h>
+#include <linux/highmem.h>
#include <asm/system.h>
#include <asm/cputype.h>
#include <asm/cacheflush.h>
-#include <asm/kmap_types.h>
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-#include "mm.h"
#define CR_L2 (1 << 26)
dsb();
}
+static inline void l2_unmap_va(unsigned long va)
+{
#ifdef CONFIG_HIGHMEM
-#define l2_map_save_flags(x) raw_local_save_flags(x)
-#define l2_map_restore_flags(x) raw_local_irq_restore(x)
-#else
-#define l2_map_save_flags(x) ((x) = 0)
-#define l2_map_restore_flags(x) ((void)(x))
+ if (va != -1)
+ kunmap_atomic((void *)va);
#endif
+}
-static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va,
- unsigned long flags)
+static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va)
{
#ifdef CONFIG_HIGHMEM
unsigned long va = prev_va & PAGE_MASK;
/*
* Switching to a new page. Because cache ops are
* using virtual addresses only, we must put a mapping
- * in place for it. We also enable interrupts for a
- * short while and disable them again to protect this
- * mapping.
+ * in place for it.
*/
- unsigned long idx;
- raw_local_irq_restore(flags);
- idx = KM_L2_CACHE + KM_TYPE_NR * smp_processor_id();
- va = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- raw_local_irq_restore(flags | PSR_I_BIT);
- set_pte_ext(TOP_PTE(va), pfn_pte(pa >> PAGE_SHIFT, PAGE_KERNEL), 0);
- local_flush_tlb_kernel_page(va);
+ l2_unmap_va(prev_va);
+ va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT);
}
return va + (pa_offset >> (32 - PAGE_SHIFT));
#else
static void xsc3_l2_inv_range(unsigned long start, unsigned long end)
{
- unsigned long vaddr, flags;
+ unsigned long vaddr;
if (start == 0 && end == -1ul) {
xsc3_l2_inv_all();
}
vaddr = -1; /* to force the first mapping */
- l2_map_save_flags(flags);
/*
* Clean and invalidate partial first cache line.
*/
if (start & (CACHE_LINE_SIZE - 1)) {
- vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr, flags);
+ vaddr = l2_map_va(start & ~(CACHE_LINE_SIZE - 1), vaddr);
xsc3_l2_clean_mva(vaddr);
xsc3_l2_inv_mva(vaddr);
start = (start | (CACHE_LINE_SIZE - 1)) + 1;
* Invalidate all full cache lines between 'start' and 'end'.
*/
while (start < (end & ~(CACHE_LINE_SIZE - 1))) {
- vaddr = l2_map_va(start, vaddr, flags);
+ vaddr = l2_map_va(start, vaddr);
xsc3_l2_inv_mva(vaddr);
start += CACHE_LINE_SIZE;
}
* Clean and invalidate partial last cache line.
*/
if (start < end) {
- vaddr = l2_map_va(start, vaddr, flags);
+ vaddr = l2_map_va(start, vaddr);
xsc3_l2_clean_mva(vaddr);
xsc3_l2_inv_mva(vaddr);
}
- l2_map_restore_flags(flags);
+ l2_unmap_va(vaddr);
dsb();
}
static void xsc3_l2_clean_range(unsigned long start, unsigned long end)
{
- unsigned long vaddr, flags;
+ unsigned long vaddr;
vaddr = -1; /* to force the first mapping */
- l2_map_save_flags(flags);
start &= ~(CACHE_LINE_SIZE - 1);
while (start < end) {
- vaddr = l2_map_va(start, vaddr, flags);
+ vaddr = l2_map_va(start, vaddr);
xsc3_l2_clean_mva(vaddr);
start += CACHE_LINE_SIZE;
}
- l2_map_restore_flags(flags);
+ l2_unmap_va(vaddr);
dsb();
}
static void xsc3_l2_flush_range(unsigned long start, unsigned long end)
{
- unsigned long vaddr, flags;
+ unsigned long vaddr;
if (start == 0 && end == -1ul) {
xsc3_l2_flush_all();
}
vaddr = -1; /* to force the first mapping */
- l2_map_save_flags(flags);
start &= ~(CACHE_LINE_SIZE - 1);
while (start < end) {
- vaddr = l2_map_va(start, vaddr, flags);
+ vaddr = l2_map_va(start, vaddr);
xsc3_l2_clean_mva(vaddr);
xsc3_l2_inv_mva(vaddr);
start += CACHE_LINE_SIZE;
}
- l2_map_restore_flags(flags);
+ l2_unmap_va(vaddr);
dsb();
}
#include <linux/init.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
#include <asm/memory.h>
#include <asm/highmem.h>
op(vaddr, len, dir);
kunmap_high(page);
} else if (cache_is_vipt()) {
- pte_t saved_pte;
- vaddr = kmap_high_l1_vipt(page, &saved_pte);
+ /* unmapped pages might still be cached */
+ vaddr = kmap_atomic(page);
op(vaddr + offset, len, dir);
- kunmap_high_l1_vipt(page, saved_pte);
+ kunmap_atomic(vaddr);
}
} else {
vaddr = page_address(page) + offset;
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/cachetype.h>
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
kunmap_high(page);
} else if (cache_is_vipt()) {
- pte_t saved_pte;
- addr = kmap_high_l1_vipt(page, &saved_pte);
+ /* unmapped pages might still be cached */
+ addr = kmap_atomic(page);
__cpuc_flush_dcache_area(addr, PAGE_SIZE);
- kunmap_high_l1_vipt(page, saved_pte);
+ kunmap_atomic(addr);
}
}
pte = TOP_PTE(vaddr);
return pte_page(*pte);
}
-
-#ifdef CONFIG_CPU_CACHE_VIPT
-
-#include <linux/percpu.h>
-
-/*
- * The VIVT cache of a highmem page is always flushed before the page
- * is unmapped. Hence unmapped highmem pages need no cache maintenance
- * in that case.
- *
- * However unmapped pages may still be cached with a VIPT cache, and
- * it is not possible to perform cache maintenance on them using physical
- * addresses unfortunately. So we have no choice but to set up a temporary
- * virtual mapping for that purpose.
- *
- * Yet this VIPT cache maintenance may be triggered from DMA support
- * functions which are possibly called from interrupt context. As we don't
- * want to keep interrupt disabled all the time when such maintenance is
- * taking place, we therefore allow for some reentrancy by preserving and
- * restoring the previous fixmap entry before the interrupted context is
- * resumed. If the reentrancy depth is 0 then there is no need to restore
- * the previous fixmap, and leaving the current one in place allow it to
- * be reused the next time without a TLB flush (common with DMA).
- */
-
-static DEFINE_PER_CPU(int, kmap_high_l1_vipt_depth);
-
-void *kmap_high_l1_vipt(struct page *page, pte_t *saved_pte)
-{
- unsigned int idx, cpu;
- int *depth;
- unsigned long vaddr, flags;
- pte_t pte, *ptep;
-
- if (!in_interrupt())
- preempt_disable();
-
- cpu = smp_processor_id();
- depth = &per_cpu(kmap_high_l1_vipt_depth, cpu);
-
- idx = KM_L1_CACHE + KM_TYPE_NR * cpu;
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- ptep = TOP_PTE(vaddr);
- pte = mk_pte(page, kmap_prot);
-
- raw_local_irq_save(flags);
- (*depth)++;
- if (pte_val(*ptep) == pte_val(pte)) {
- *saved_pte = pte;
- } else {
- *saved_pte = *ptep;
- set_pte_ext(ptep, pte, 0);
- local_flush_tlb_kernel_page(vaddr);
- }
- raw_local_irq_restore(flags);
-
- return (void *)vaddr;
-}
-
-void kunmap_high_l1_vipt(struct page *page, pte_t saved_pte)
-{
- unsigned int idx, cpu = smp_processor_id();
- int *depth = &per_cpu(kmap_high_l1_vipt_depth, cpu);
- unsigned long vaddr, flags;
- pte_t pte, *ptep;
-
- idx = KM_L1_CACHE + KM_TYPE_NR * cpu;
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
- ptep = TOP_PTE(vaddr);
- pte = mk_pte(page, kmap_prot);
-
- BUG_ON(pte_val(*ptep) != pte_val(pte));
- BUG_ON(*depth <= 0);
-
- raw_local_irq_save(flags);
- (*depth)--;
- if (*depth != 0 && pte_val(pte) != pte_val(saved_pte)) {
- set_pte_ext(ptep, saved_pte, 0);
- local_flush_tlb_kernel_page(vaddr);
- }
- raw_local_irq_restore(flags);
-
- if (!in_interrupt())
- preempt_enable();
-}
-
-#endif /* CONFIG_CPU_CACHE_VIPT */
return 0;
}
-arch_initcall(init_hw_perf_events);
+early_initcall(init_hw_perf_events);
#endif /* defined(CONFIG_CPU_MIPS32)... */
tmp = CROSS_GxICR(irq, new);
x &= GxICR_LEVEL | GxICR_ENABLE;
- if (GxICR(irq) & GxICR_REQUEST) {
+ if (GxICR(irq) & GxICR_REQUEST)
x |= GxICR_REQUEST | GxICR_DETECT;
CROSS_GxICR(irq, new) = x;
tmp = CROSS_GxICR(irq, new);
return register_fsl_emb_pmu(&e500_pmu);
}
-arch_initcall(init_e500_pmu);
+early_initcall(init_e500_pmu);
return register_power_pmu(&mpc7450_pmu);
}
-arch_initcall(init_mpc7450_pmu);
+early_initcall(init_mpc7450_pmu);
freeze_events_kernel = MMCR0_FCHV;
#endif /* CONFIG_PPC64 */
- perf_pmu_register(&power_pmu);
+ perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW);
perf_cpu_notifier(power_pmu_notifier);
return 0;
pr_info("%s performance monitor hardware support registered\n",
pmu->name);
- perf_pmu_register(&fsl_emb_pmu);
+ perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW);
return 0;
}
return register_power_pmu(&power4_pmu);
}
-arch_initcall(init_power4_pmu);
+early_initcall(init_power4_pmu);
return register_power_pmu(&power5p_pmu);
}
-arch_initcall(init_power5p_pmu);
+early_initcall(init_power5p_pmu);
return register_power_pmu(&power5_pmu);
}
-arch_initcall(init_power5_pmu);
+early_initcall(init_power5_pmu);
return register_power_pmu(&power6_pmu);
}
-arch_initcall(init_power6_pmu);
+early_initcall(init_power6_pmu);
return register_power_pmu(&power7_pmu);
}
-arch_initcall(init_power7_pmu);
+early_initcall(init_power7_pmu);
return register_power_pmu(&ppc970_pmu);
}
-arch_initcall(init_ppc970_pmu);
+early_initcall(init_ppc970_pmu);
select HAVE_KERNEL_LZMA
select HAVE_KERNEL_LZO
select HAVE_GET_USER_PAGES_FAST
+ select HAVE_ARCH_MUTEX_CPU_RELAX
select ARCH_INLINE_SPIN_TRYLOCK
select ARCH_INLINE_SPIN_TRYLOCK_BH
select ARCH_INLINE_SPIN_LOCK
*/
#include <asm-generic/mutex-dec.h>
+
+#define arch_mutex_cpu_relax() barrier()
return register_sh_pmu(&sh7750_pmu);
}
-arch_initcall(sh7750_pmu_init);
+early_initcall(sh7750_pmu_init);
return register_sh_pmu(&sh4a_pmu);
}
-arch_initcall(sh4a_pmu_init);
+early_initcall(sh4a_pmu_init);
WARN_ON(_pmu->num_events > MAX_HWEVENTS);
- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
perf_cpu_notifier(sh_pmu_notifier);
return 0;
}
#ifdef CONFIG_PERF_EVENTS
#include <asm/ptrace.h>
-extern void init_hw_perf_events(void);
-
#define perf_arch_fetch_caller_regs(regs, ip) \
do { \
unsigned long _pstate, _asi, _pil, _i7, _fp; \
(regs)->u_regs[UREG_I6] = _fp; \
(regs)->u_regs[UREG_I7] = _i7; \
} while (0)
-#else
-static inline void init_hw_perf_events(void) { }
#endif
#endif
atomic_set(&nmi_active, -1);
}
}
- if (!err)
- init_hw_perf_events();
return err;
}
return false;
}
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
pr_info("Performance events: ");
if (!supported_pmu()) {
pr_cont("No support for PMU type '%s'\n", sparc_pmu_type);
- return;
+ return 0;
}
pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type);
- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
register_die_notifier(&perf_event_nmi_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_events);
void perf_callchain_kernel(struct perf_callchain_entry *entry,
struct pt_regs *regs)
If unsure, choose "PC-compatible" instead.
+config X86_INTEL_CE
+ bool "CE4100 TV platform"
+ depends on PCI
+ depends on PCI_GODIRECT
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ select X86_REBOOTFIXUPS
+ ---help---
+ Select for the Intel CE media processor (CE4100) SOC.
+ This option compiles in support for the CE4100 SOC for settop
+ boxes and media devices.
+
config X86_MRST
bool "Moorestown MID platform"
depends on PCI
depends on X86_EXTENDED_PLATFORM
depends on X86_IO_APIC
select APB_TIMER
+ select I2C
+ select SPI
+ select INTEL_SCU_IPC
+ select X86_PLATFORM_DEVICES
---help---
Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
Internet Device(MID) platform. Moorestown consists of two chips:
Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
supposed to run on an IA32-based Unisys ES7000 system.
+config X86_32_IRIS
+ tristate "Eurobraille/Iris poweroff module"
+ depends on X86_32
+ ---help---
+ The Iris machines from EuroBraille do not have APM or ACPI support
+ to shut themselves down properly. A special I/O sequence is
+ needed to do so, which is what this module does at
+ kernel shutdown.
+
+ This is only for Iris machines from EuroBraille.
+
+ If unused, say N.
+
config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
-config K8_NUMA
+config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
depends on X86_64 && NUMA && PCI
---help---
- Enable K8 NUMA node topology detection. You should say Y here if
- you have a multi processor AMD K8 system. This uses an old
- method to read the NUMA configuration directly from the builtin
- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
- instead, which also takes priority if both are compiled in.
+ Enable AMD NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD system. This uses an old method to
+ read the NUMA configuration directly from the builtin Northbridge
+ of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+ which also takes priority if both are compiled in.
config X86_64_ACPI_NUMA
def_bool y
feature as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
+config DEBUG_SET_MODULE_RONX
+ bool "Set loadable kernel module data as NX and text as RO"
+ depends on MODULES
+ ---help---
+ This option helps catch unintended modifications to loadable
+ kernel module's text and read-only data. It also prevents execution
+ of module data. Such protection may interfere with run-time code
+ patching and dynamic kernel tracing - and they might also protect
+ against certain classes of kernel exploits.
+ If in doubt, say "N".
+
config DEBUG_NX_TEST
tristate "Testcase for the NX non-executable stack feature"
depends on DEBUG_KERNEL && m
hlt
jmp 1b
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
/*
* Be careful here startup_64 needs to be at a predictable
extern void alternatives_smp_module_del(struct module *mod);
extern void alternatives_smp_switch(int smp);
extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
#else
static inline void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
+struct text_poke_param {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#define IDEAL_NOP_SIZE_5 5
#include <linux/pci.h>
-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
struct bootnode;
-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);
-struct k8_northbridge_info {
+struct amd_northbridge {
+ struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
u16 num;
- u8 gart_supported;
- struct pci_dev **nb_misc;
+ u64 flags;
+ struct amd_northbridge *nb;
};
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART 0x1
+#define AMD_NB_L3_INDEX_DISABLE 0x2
#ifdef CONFIG_AMD_NB
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
{
- return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+ return amd_northbridges.num;
}
-#else
+static inline int amd_nb_has_feature(int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
{
- return NULL;
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
+
+#else
+
+#define amd_nb_num(x) 0
+#define amd_nb_has_feature(x) false
+#define node_to_amd_nb(x) NULL
+
#endif
extern void setup_secondary_APIC_clock(void);
extern int APIC_init_uniprocessor(void);
extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);
/*
* On 32bit this is mach-xxx local
#ifdef CONFIG_X86_32
# define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
#else
# define MAX_IO_APICS 128
# define MAX_LOCAL_APIC 32768
X86_SUBARCH_LGUEST,
X86_SUBARCH_XEN,
X86_SUBARCH_MRST,
+ X86_SUBARCH_CE4100,
X86_NR_SUBARCHS,
};
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
__end_of_permanent_fixed_addresses,
+
+#ifdef CONFIG_X86_MRST
+ FIX_LNW_VRTC,
+#endif
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
int err;
/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+ asm volatile("1: fxrstorq %[fx]\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : [fx] "m" (*fx), "0" (0));
+#else
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
: [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
return err;
}
return -EFAULT;
/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+ asm volatile("1: fxsaveq %[fx]\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err), [fx] "=m" (*fx)
+ : "0" (0));
+#else
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
: [fx] "R" (fx), "0" (0));
+#endif
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
extern int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr);
void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
extern void ioapic_insert_resources(void);
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void probe_nr_irqs_gsi(void);
extern int get_nr_irqs_gsi(void);
-
extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
struct mp_ioapic_gsi{
u32 gsi_base;
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_and_gsi_init(void) { }
static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void) { }
#define gsi_top (NR_IRQS_LEGACY)
static inline int mp_find_ioapic(u32 gsi) { return 0; }
return ((irq == 2) ? 9 : irq);
}
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
#ifdef CONFIG_X86_32
extern void irq_ctx_init(int cpu);
#else
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_registers(struct pt_regs *regs);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
+ unsigned long *sp);
extern void __show_regs(struct pt_regs *regs, int all);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
#ifdef CONFIG_MICROCODE_AMD
extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+ memcpy(to, from, n);
+}
+
#else
static inline struct microcode_ops * __init init_amd_microcode(void)
{
#include <asm/mpspec_def.h>
#include <asm/x86_init.h>
+#include <asm/apicdef.h>
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
extern int pic_mode;
#ifdef CONFIG_X86_32
int active_high_low);
#endif /* CONFIG_ACPI */
-#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
struct physid_mask {
unsigned long mask[PHYSID_ARRAY_SIZE];
test_and_set_bit(physid, (map).mask)
#define physids_and(dst, src1, src2) \
- bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+ bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
#define physids_or(dst, src1, src2) \
- bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
#define physids_clear(map) \
- bitmap_zero((map).mask, MAX_APICS)
+ bitmap_zero((map).mask, MAX_LOCAL_APIC)
#define physids_complement(dst, src) \
- bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+ bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
#define physids_empty(map) \
- bitmap_empty((map).mask, MAX_APICS)
+ bitmap_empty((map).mask, MAX_LOCAL_APIC)
#define physids_equal(map1, map2) \
- bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+ bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
#define physids_weight(map) \
- bitmap_weight((map).mask, MAX_APICS)
+ bitmap_weight((map).mask, MAX_LOCAL_APIC)
#define physids_shift_right(d, s, n) \
- bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+ bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
#define physids_shift_left(d, s, n) \
- bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+ bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
static inline unsigned long physids_coerce(physid_mask_t *map)
{
map->mask[0] = physids;
}
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- physid_set(physid, __physid_mask); \
- __physid_mask; \
- })
-
static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
{
physids_clear(*map);
#ifdef CONFIG_X86_32
# define MAX_MPC_ENTRY 1024
-# define MAX_APICS 256
-#else
-# if NR_CPUS <= 255
-# define MAX_APICS 255
-# else
-# define MAX_APICS 32768
-# endif
#endif
/* Intel MP Floating Pointer Structure */
--- /dev/null
+#ifndef _MRST_VRTC_H
+#define _MRST_VRTC_H
+
+extern unsigned char vrtc_cmos_read(unsigned char reg);
+extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
+extern unsigned long vrtc_get_time(void);
+extern int vrtc_set_mmss(unsigned long nowtime);
+
+#endif
#include <linux/sfi.h>
extern int pci_mrst_init(void);
-int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
+extern int sfi_mrtc_num;
+extern struct sfi_rtc_table_entry sfi_mrtc_array[];
/*
* Medfield is the follow-up of Moorestown, it combines two chip solution into
extern struct console early_hsu_console;
extern void hsu_early_console_init(void);
+
+extern void intel_scu_devices_create(void);
+extern void intel_scu_devices_destroy(void);
+
+/* VRTC timer */
+#define MRST_VRTC_MAP_SZ (1024)
+/*#define MRST_VRTC_PGOFFSET (0xc00) */
+
+extern void mrst_rtc_init(void);
+
#endif /* _ASM_X86_MRST_H */
#define MSR_AMD64_IBSCTL 0xc001103a
#define MSR_AMD64_IBSBRTARGET 0xc001103b
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTR 0xc0010201
+
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
#define FAM10H_MMIO_CONF_ENABLE (1<<0)
#include <asm/irq.h>
#include <asm/io.h>
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it. Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
extern void release_evntsel_nmi(unsigned int);
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE 0
-#define NMI_IO_APIC 1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID 3
-
struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
- if (nmi_watchdog == NMI_IO_APIC)
- nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
- /*
- * actually it should be:
- * return (nmi_watchdog == NMI_LOCAL_APIC ||
- * nmi_watchdog == NMI_IO_APIC)
- * but since they are power of two we could use a
- * cheaper way --cvg
- */
- return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
#endif
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
void restart_nmi(void);
static inline void halt(void)
{
- PVOP_VCALL0(pv_irq_ops.safe_halt);
+ PVOP_VCALL0(pv_irq_ops.halt);
}
static inline void wbinvd(void)
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+extern int pcibios_enabled;
void pcibios_config_init(void);
struct pci_bus *pcibios_scan_root(int bus);
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
extern void perf_events_lapic_init(void);
#define PERF_EVENT_INDEX_OFFSET 0
}
#else
-static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
#endif
};
/*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- * 0-6: metric from P4_PEBS_METRIC enum
- * 7 : reserved
- * 8 : reserved
- * 9-11 : reserved
- *
* Note we have UOP and PEBS bits reserved for now
* just in case if we will need them once
*/
P4_PEBS_METRIC__max
};
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ * 32 bits valuable, we pack them into a single 64 bit
+ * configuration. Low 32 bits of such config correspond
+ * to low 32 bits of CCCR register and high 32 bits
+ * correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ * be found in Intel SDM but it should be noted that
+ * we "borrow" some reserved bits for own usage and
+ * clean them or set to a proper value when we do
+ * a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ * and should be either 0 or set to some predefined
+ * values:
+ *
+ * Low 32 bits
+ * -----------
+ * 0-6: P4_PEBS_METRIC enum
+ * 7-11: reserved
+ * 12: reserved (Enable)
+ * 13-15: reserved (ESCR select)
+ * 16-17: Active Thread
+ * 18: Compare
+ * 19: Complement
+ * 20-23: Threshold
+ * 24: Edge
+ * 25: reserved (FORCE_OVF)
+ * 26: reserved (OVF_PMI_T0)
+ * 27: reserved (OVF_PMI_T1)
+ * 28-29: reserved
+ * 30: reserved (Cascade)
+ * 31: reserved (OVF)
+ *
+ * High 32 bits
+ * ------------
+ * 0: reserved (T1_USR)
+ * 1: reserved (T1_OS)
+ * 2: reserved (T0_USR)
+ * 3: reserved (T0_OS)
+ * 4: Tag Enable
+ * 5-8: Tag Value
+ * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ * 25-30: enum P4_EVENTS
+ * 31: reserved (HT thread)
+ */
+
#endif /* PERF_EVENT_P4_H */
static inline void x86_mrst_early_setup(void) { }
#endif
+#ifdef CONFIG_X86_INTEL_CE
+extern void x86_ce4100_early_setup(void);
+#else
+static inline void x86_ce4100_early_setup(void) { }
+#endif
+
#ifndef _SETUP
/*
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
#endif
}
#define _ASM_X86_STACKTRACE_H
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
extern int kstack_depth_to_print;
};
void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+ unsigned long *stack,
const struct stacktrace_ops *ops, void *data);
#ifdef CONFIG_X86_32
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ unsigned long bp;
+
+ if (regs)
+ return regs->bp;
+
+ if (task == current) {
+ /* Grab bp right from our regs */
+ get_bp(bp);
+ return bp;
+ }
+
+ /* bp is the last reg pushed by switch_to */
+ return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ return 0;
+}
+#endif
+
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+ unsigned long *stack, char *log_lvl);
extern void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+ unsigned long *sp, char *log_lvl);
extern unsigned int code_bytes;
unsigned long long native_sched_clock(void);
extern int recalibrate_cpu_khz(void);
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
extern int no_timer_check;
/* Accelerators for sched_clock()
* BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
* set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
*
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
* cpu's on the uvhub.
*
* TLB shootdown will use the first of the 8 descriptors of each set.
* Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
*/
+#define MAX_CPUS_PER_UVHUB 64
+#define MAX_CPUS_PER_SOCKET 32
+#define UV_ADP_SIZE 64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */
#define UV_ITEMS_PER_DESCRIPTOR 8
/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
-#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
-#define UV_ADP_SIZE 32
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
#define UV_NET_ENDPOINT_INTD 0x38
* number of destination side software ack resources
*/
#define DEST_NUM_RESOURCES 8
-#define MAX_CPUS_PER_NODE 32
/*
* completion statuses for sending a TLB flush message
*/
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
{
unsigned int ver = 0;
+ if (id >= (MAX_LOCAL_APIC-1)) {
+ printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ return;
+ }
+
if (!enabled) {
++disabled_cpus;
return;
acpi_register_lapic_address(acpi_lapic_addr);
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
- acpi_parse_sapic, MAX_APICS);
+ acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_APICS);
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_APICS);
+ acpi_parse_lapic, MAX_LOCAL_APIC);
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
mutex_unlock(&smp_alt);
}
+bool skip_smp_alternatives;
void alternatives_smp_switch(int smp)
{
struct smp_alt_module *mod;
printk("lockdep: fixing up alternatives.\n");
#endif
- if (noreplace_smp || smp_alt_once)
+ if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
static int wrote_text;
struct text_poke_params {
- void *addr;
- const void *opcode;
- size_t len;
+ struct text_poke_param *params;
+ int nparams;
};
static int __kprobes stop_machine_text_poke(void *data)
{
struct text_poke_params *tpp = data;
+ struct text_poke_param *p;
+ int i;
if (atomic_dec_and_test(&stop_machine_first)) {
- text_poke(tpp->addr, tpp->opcode, tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ text_poke(p->addr, p->opcode, p->len);
+ }
smp_wmb(); /* Make sure other cpus see that this has run */
wrote_text = 1;
} else {
smp_mb(); /* Load wrote_text before following execution */
}
- flush_icache_range((unsigned long)tpp->addr,
- (unsigned long)tpp->addr + tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ flush_icache_range((unsigned long)p->addr,
+ (unsigned long)p->addr + p->len);
+ }
+
return 0;
}
void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
{
struct text_poke_params tpp;
+ struct text_poke_param p;
- tpp.addr = addr;
- tpp.opcode = opcode;
- tpp.len = len;
+ p.addr = addr;
+ p.opcode = opcode;
+ p.len = len;
+ tpp.params = &p;
+ tpp.nparams = 1;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
/* Use __stop_machine() because the caller already got online_cpus. */
return addr;
}
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. Since the
+ * stop_machine() is heavy task, it is better to aggregate text_poke requests
+ * and do it once if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+ struct text_poke_params tpp = {.params = params, .nparams = n};
+
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#ifdef CONFIG_X86_64
static u32 *flush_words;
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+ struct pci_device_id *ids)
{
do {
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
if (!dev)
break;
- } while (!pci_match_id(&k8_nb_ids[0], dev));
+ } while (!pci_match_id(ids, dev));
return dev;
}
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
{
- int i;
- struct pci_dev *dev;
+ int i = 0;
+ struct amd_northbridge *nb;
+ struct pci_dev *misc;
- if (k8_northbridges.num)
+ if (amd_nb_num())
return 0;
- dev = NULL;
- while ((dev = next_k8_northbridge(dev)) != NULL)
- k8_northbridges.num++;
+ misc = NULL;
+ while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ i++;
- /* some CPU families (e.g. family 0x11) do not support GART */
- if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
- boot_cpu_data.x86 == 0x15)
- k8_northbridges.gart_supported = 1;
+ if (i == 0)
+ return 0;
- k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
- sizeof(void *), GFP_KERNEL);
- if (!k8_northbridges.nb_misc)
+ nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
return -ENOMEM;
- if (!k8_northbridges.num) {
- k8_northbridges.nb_misc[0] = NULL;
- return 0;
- }
+ amd_northbridges.nb = nb;
+ amd_northbridges.num = i;
- if (k8_northbridges.gart_supported) {
- flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
- GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges.nb_misc);
- return -ENOMEM;
- }
- }
+ misc = NULL;
+ for (i = 0; i != amd_nb_num(); i++) {
+ node_to_amd_nb(i)->misc = misc =
+ next_northbridge(misc, amd_nb_misc_ids);
+ }
+
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_mask >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
- dev = NULL;
- i = 0;
- while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges.nb_misc[i] = dev;
- if (k8_northbridges.gart_supported)
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
- }
- k8_northbridges.nb_misc[i] = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
/* Ignores subdevice/subvendor but as far as I can figure out
they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
{
struct pci_device_id *id;
u32 vendor = device & 0xffff;
device >>= 16;
- for (id = k8_nb_ids; id->vendor; id++)
+ for (id = amd_nb_misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return 1;
return 0;
}
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+ int i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+ flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != amd_nb_num(); i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ &flush_words[i]);
+
+ return 0;
+}
+
+void amd_flush_garts(void)
{
int flushed, i;
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
/* Avoid races between AGP and IOMMU. In theory it's not needed
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < k8_northbridges.num; i++) {
- pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
- flush_words[i]|1);
+ for (i = 0; i < amd_nb_num(); i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
flushed++;
}
- for (i = 0; i < k8_northbridges.num; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
- pci_read_config_dword(k8_northbridges.nb_misc[i],
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
0x9c, &w);
if (!(w & 1))
break;
if (!flushed)
printk("nothing to flush?\n");
}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
{
int err = 0;
- err = cache_k8_northbridges();
+ err = amd_cache_northbridges();
if (err < 0)
- printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+ printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+ if (amd_cache_gart() < 0)
+ printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+ "GART support disabled.\n");
return err;
}
/* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
if (system_state == SYSTEM_BOOTING) {
irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
/* APB timer irqs are set up as mp_irqs, timer is edge type */
__set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
if (request_irq(adev->irq, apbt_interrupt_handler,
* Do an PCI bus scan by hand because we're running before the PCI
* subsystem.
*
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
* generically. It's probably overkill to always scan all slots because
* the AGP bridges should be always an own bus on the HT hierarchy,
* but do it here for future safety.
dev_limit = bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
dev_limit = bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
dev_limit = bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
iommu_detected = 1;
dev_base = bus_dev_ranges[i].dev_base;
dev_limit = bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
#
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/dmi.h>
-#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
reserved = reserve_eilvt_offset(offset, new);
if (reserved != new) {
- pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
- "vector 0x%x was already reserved by another core, "
- "APIC%lX=0x%x\n",
- smp_processor_id(), new, reserved, reg, old);
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
return -EINVAL;
}
if (!eilvt_entry_is_changeable(old, new)) {
- pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
- "register already in use, APIC%lX=0x%x\n",
- smp_processor_id(), new, reg, old);
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
return -EBUSY;
}
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- pr_warning("APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
}
#endif
- setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
/*
return 0;
}
#else
+
+static int apic_verify(void)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warning("Could not enable APIC!\n");
+ return -1;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /* The BIOS may have set up the APIC at some other address */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+ pr_info("Found and enabled local APIC!\n");
+ return 0;
+}
+
+int apic_force_enable(void)
+{
+ u32 h, l;
+
+ if (disable_apic)
+ return -1;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ return apic_verify();
+}
+
/*
* Detect and initialize APIC
*/
static int __init detect_init_APIC(void)
{
- u32 h, l, features;
-
/* Disabled by kernel option? */
if (disable_apic)
return -1;
"you can enable it with \"lapic\"\n");
return -1;
}
- /*
- * Some BIOSes disable the local APIC in the APIC_BASE
- * MSR. This can only be done in software for Intel P6 or later
- * and AMD K7 (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
- }
- }
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
- return -1;
+ if (apic_force_enable())
+ return -1;
+ } else {
+ if (apic_verify())
+ return -1;
}
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
int __init APIC_init_uniprocessor(void)
{
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
-#else
- localise_nmi_watchdog();
#endif
x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
- check_nmi_watchdog();
-#endif
-
return 0;
}
#include <linux/nmi.h>
#include <linux/module.h>
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(void)
{
return (u64)(cpu_khz) * 1000 * 60;
}
+#endif
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
+#ifdef arch_trigger_all_cpu_backtrace
/* For reliability, we're prepared to waste bits here. */
static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
+
void arch_trigger_all_cpu_backtrace(void)
{
int i;
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+ * (backtrace_flag == 1), don't output double cpu dump infos.
+ */
+ return;
+
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
printk(KERN_INFO "sending NMI to all CPUs:\n");
break;
mdelay(1);
}
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_clear_bit();
}
static int __kprobes
{
struct die_args *args = __args;
struct pt_regs *regs;
- int cpu = smp_processor_id();
+ int cpu;
switch (cmd) {
case DIE_NMI:
}
regs = args->regs;
+ cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
}
early_initcall(register_trigger_all_cpu_backtrace);
#endif
-
-/* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
#include <asm/dma.h>
#include <asm/timer.h>
#include <asm/i8259.h>
-#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
#include <asm/setup.h>
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
unsigned char old_id;
unsigned long flags;
- if (acpi_ioapic)
- return;
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
-
/*
* We need to adjust the IRQ routing table
* if the ID changed.
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
#endif
int no_timer_check __initdata;
"edge");
}
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
/*
* This looks a bit hackish but it's about the only one way of sending
* a few INTA cycles to 8259As and any associated glue logic. ICR does
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
legacy_pic->init(1);
-#ifdef CONFIG_X86_32
- {
- unsigned int ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- }
-#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
unmask_ioapic(cfg);
}
if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- legacy_pic->unmask(0);
- }
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
goto out;
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- setup_nmi();
- legacy_pic->unmask(0);
- }
goto out;
}
/*
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
-
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
return reg_01.bits.entries + 1;
}
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
{
int nr;
return res;
}
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
+
+ probe_nr_irqs_gsi();
}
void __init ioapic_insert_resources(void)
printk(KERN_INFO "Early APIC setup for system timer0\n");
#ifndef CONFIG_SMP
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ physid_set_mask_of_physid(boot_cpu_physical_apicid,
+ &phys_cpu_present_map);
#endif
/* Make sure the irq descriptor is set up */
cfg = alloc_irq_and_cfg_at(0, 0);
+++ /dev/null
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
- return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
- return atomic_read(&mce_entry) > 0;
-#endif
- return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
- return per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /*
- * Intentionally don't use cpu_relax here. This is
- * to make sure that the performance counter really ticks,
- * even if there is a simulator or similar that catches the
- * pause instruction. On a real HT machine this is fine because
- * all other CPUs are busy with "useless" delay loops and don't
- * care if they get somewhat less cycles.
- */
- while (endflag == 0)
- mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING
- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
- cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
- printk(KERN_WARNING
- "Please report this to bugzilla.kernel.org,\n");
- printk(KERN_WARNING
- "and attach the output of the 'dmesg' command.\n");
-
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
- unsigned int *prev_nmi_count;
- int cpu;
-
- if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- goto error;
-
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
- for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = get_nmi_count(cpu);
- local_irq_enable();
- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
- report_broken_nmi(cpu, prev_nmi_count);
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- goto error;
- }
- printk("OK.\n");
-
- /*
- * now that we know it works we can reduce NMI frequency to
- * something more reasonable; makes a difference in some configs
- */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-error:
- if (nmi_watchdog == NMI_IO_APIC) {
- if (!timer_through_8259)
- legacy_pic->mask(0);
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
- }
-
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
- return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- unsigned int nmi;
-
- if (!strncmp(str, "panic", 5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- if (!strncmp(str, "lapic", 5))
- nmi_watchdog = NMI_LOCAL_APIC;
- else if (!strncmp(str, "ioapic", 6))
- nmi_watchdog = NMI_IO_APIC;
- else {
- get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
- return 0;
- nmi_watchdog = nmi;
- }
-
- return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /*
- * should really be a BUG_ON but b/c this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active neither should the other cpus */
- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if (!nmi_watchdog_active())
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- else
- __acpi_nmi_disable(NULL);
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog_active()) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- /*
- * Since current_thread_info()-> is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
- */
- unsigned int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- sum = get_timer_irqs(cpu);
-
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- /* We can be called before check_nmi_watchdog, hence NULL check. */
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
- raw_spin_lock(&lock);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- dump_stack();
- raw_spin_unlock(&lock);
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
- rc = 1;
- }
-
- /* Could check oops_in_progress here too, but it's safer not to */
- if (mce_in_progress())
- touched = 1;
-
- /* if the none of the timers isn't firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- __this_cpu_inc(alert_counter);
- if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(alert_counter, 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
- touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
- printk(KERN_WARNING
- "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else if (nmi_watchdog == NMI_IO_APIC) {
- if (nmi_watchdog_enabled)
- enable_ioapic_nmi_watchdog();
- else
- disable_ioapic_nmi_watchdog();
- } else {
- printk(KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
- int i;
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- }
-}
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static DEFINE_SPINLOCK(uv_nmi_lock);
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ return val;
+}
+
static inline bool is_GRU_range(u64 start, u64 end)
{
return start >= gru_start_paddr && end <= gru_end_paddr;
return is_ISA_range(start, end) || is_GRU_range(start, end);
}
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
{
union uvh_node_id_u node_id;
- unsigned long *mmr;
-
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
- node_id.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ int pnode;
/* Currently, all blades have same revision number */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
- return node_id.s.node_id;
+ pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+ return pnode;
}
static void __init early_get_apic_pnode_shift(void)
{
- unsigned long *mmr;
-
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
- uvh_apicid.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
if (!uvh_apicid.v)
/*
* Old bios, use default value
static void __init uv_set_apicid_hibit(void)
{
union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
- unsigned long *mmr;
- mmr = early_ioremap(UV_LOCAL_MMR_BASE |
- UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr));
- apicid_mask.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
}
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int nodeid;
+ int pnodeid;
if (!strcmp(oem_id, "SGI")) {
- nodeid = early_get_nodeid();
+ pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
__get_cpu_var(x2apic_extra_bits) =
- nodeid << (uvh_apicid.s.pnode_shift - 1);
+ pnodeid << uvh_apicid.s.pnode_shift;
uv_system_type = UV_NON_UNIQUE_APIC;
uv_set_apicid_hibit();
return 1;
void __init uv_system_init(void)
{
union uvh_rh_gam_config_mmr_u m_n_config;
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
int gnode_extra, max_pnode = 0;
unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask;
+ unsigned short pnode_mask, pnode_io_mask;
map_low_mmrs();
m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+ n_io = mmioh.s.n_io;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
pnode_mask = (1 << n_val) - 1;
+ pnode_io_mask = (1 << n_io) - 1;
+
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
- n_val, m_val, gnode_upper, gnode_extra);
+ printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+ n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
for (j = 0; j < 64; j++) {
if (!test_bit(j, &present))
continue;
- pnode = (i * 64 + j);
+ pnode = (i * 64 + j) & pnode_mask;
uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
/*
* apic_pnode_shift must be set before calling uv_apicid_to_pnode();
*/
+ uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
uv_cpu_hub_info(cpu)->pnode = pnode;
- uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
- map_mmioh_high(max_pnode);
+ map_mmioh_high(max_pnode & pnode_io_mask);
uv_cpu_init();
uv_scir_register_cpu_notifier();
#else
vgetcpu_set_mode();
#endif
- init_hw_perf_events();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
};
struct amd_l3_cache {
- struct pci_dev *dev;
- bool can_disable;
+ struct amd_northbridge *nb;
unsigned indices;
u8 subcaches[4];
};
/*
* L3 cache descriptors
*/
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
{
unsigned int sc0, sc1, sc2, sc3;
u32 val = 0;
- pci_read_config_dword(l3->dev, 0x1C4, &val);
+ pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
/* calculate subcache sizes */
l3->subcaches[0] = sc0 = !(val & BIT(0));
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
- struct amd_l3_cache *l3;
- struct pci_dev *dev = node_to_k8_nb_misc(node);
-
- l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
- if (!l3) {
- printk(KERN_WARNING "Error allocating L3 struct\n");
- return NULL;
- }
-
- l3->dev = dev;
-
- amd_calc_l3_indices(l3);
-
- return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
- int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+ int index)
{
+ static struct amd_l3_cache *__cpuinitdata l3_caches;
int node;
- if (boot_cpu_data.x86 != 0x10)
- return;
-
- if (index < 3)
- return;
-
- /* see errata #382 and #388 */
- if (boot_cpu_data.x86_model < 0x8)
- return;
-
- if ((boot_cpu_data.x86_model == 0x8 ||
- boot_cpu_data.x86_model == 0x9)
- &&
- boot_cpu_data.x86_mask < 0x1)
- return;
-
- /* not in virtualized environments */
- if (k8_northbridges.num == 0)
+ /* only for L3, and not in virtualized environments */
+ if (index < 3 || amd_nb_num() == 0)
return;
/*
* never freed but this is done only on shutdown so it doesn't matter.
*/
if (!l3_caches) {
- int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+ int size = amd_nb_num() * sizeof(struct amd_l3_cache);
l3_caches = kzalloc(size, GFP_ATOMIC);
if (!l3_caches)
node = amd_get_nb_id(smp_processor_id());
- if (!l3_caches[node]) {
- l3_caches[node] = amd_init_l3_cache(node);
- l3_caches[node]->can_disable = true;
+ if (!l3_caches[node].nb) {
+ l3_caches[node].nb = node_to_amd_nb(node);
+ amd_calc_l3_indices(&l3_caches[node]);
}
- WARN_ON(!l3_caches[node]);
-
- this_leaf->l3 = l3_caches[node];
+ this_leaf->l3 = &l3_caches[node];
}
/*
{
unsigned int reg = 0;
- pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®);
+ pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®);
/* check whether this slot is activated already */
if (reg & (3UL << 30))
{
int index;
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ if (!this_leaf->l3 ||
+ !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
return -EINVAL;
index = amd_get_l3_disable_slot(this_leaf->l3, slot);
if (!l3->subcaches[i])
continue;
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
/*
* We need to WBINVD on a core on the node containing the L3
wbinvd_on_cpu(cpu);
reg |= BIT(31);
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
}
}
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ if (!this_leaf->l3 ||
+ !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
return -EINVAL;
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
+ const char *buf, size_t count) \
{ \
return store_cache_disable(this_leaf, buf, count, slot); \
}
show_cache_disable_1, store_cache_disable_1);
#else /* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB */
static int
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_check_l3_disable(this_leaf, index);
+ amd_init_l3_cache(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-#define DEFAULT_SYSFS_CACHE_ATTRS \
- &type.attr, \
- &level.attr, \
- &coherency_line_size.attr, \
- &physical_line_partition.attr, \
- &ways_of_associativity.attr, \
- &number_of_sets.attr, \
- &size.attr, \
- &shared_cpu_map.attr, \
- &shared_cpu_list.attr
-
static struct attribute *default_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
+ &type.attr,
+ &level.attr,
+ &coherency_line_size.attr,
+ &physical_line_partition.attr,
+ &ways_of_associativity.attr,
+ &number_of_sets.attr,
+ &size.attr,
+ &shared_cpu_map.attr,
+ &shared_cpu_list.attr,
NULL
};
-static struct attribute *default_l3_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
#ifdef CONFIG_AMD_NB
- &cache_disable_0.attr,
- &cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+ static struct attribute **attrs;
+ int n;
+
+ if (attrs)
+ return attrs;
+
+ n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+
+ attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+ if (attrs == NULL)
+ return attrs = default_attrs;
+
+ for (n = 0; default_attrs[n]; n++)
+ attrs[n] = default_attrs[n];
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ attrs[n++] = &cache_disable_0.attr;
+ attrs[n++] = &cache_disable_1.attr;
+ }
+
+ return attrs;
+}
#endif
- NULL
-};
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
this_leaf = CPUID4_INFO_IDX(cpu, i);
- if (this_leaf->l3 && this_leaf->l3->can_disable)
- ktype_cache.default_attrs = default_l3_attrs;
- else
- ktype_cache.default_attrs = default_attrs;
-
+ ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+ if (this_leaf->l3)
+ ktype_cache.default_attrs = amd_l3_attrs();
+#endif
retval = kobject_init_and_add(&(this_object->kobj),
&ktype_cache,
per_cpu(ici_cache_kobject, cpu),
#include <asm/mce.h>
#include <asm/msr.h>
-#define PFX "mce_threshold: "
-#define VERSION "version 1.1.1"
#define NR_BANKS 6
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
struct list_head miscj;
};
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
-};
-
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
struct thresh_restart {
struct threshold_block *b;
int reset;
+ int set_lvt_off;
+ int lvt_off;
u16 old_limit;
};
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ if (apic != msr) {
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ return 1;
+};
+
/* must be called with correct cpu affinity */
/* Called via smp_call_function_single() */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
- u32 mci_misc_hi, mci_misc_lo;
+ u32 hi, lo;
- rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ rdmsr(tr->b->address, lo, hi);
- if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
tr->reset = 1; /* limit cannot be lower than err count */
if (tr->reset) { /* reset err count and overflow bit */
- mci_misc_hi =
- (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ hi =
+ (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
(THRESHOLD_MAX - tr->b->threshold_limit);
} else if (tr->old_limit) { /* change limit w/o reset */
- int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+ int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
- mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
tr->b->interrupt_enable ?
- (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+ (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+ (hi &= ~MASK_INT_TYPE_HI);
- mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = THRESHOLD_MAX;
+ threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
+ struct threshold_block b;
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
- struct thresh_restart tr;
- int lvt_off = -1;
- u8 offset;
+ int offset = -1;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
- offset = (high & MASK_LVTOFF_HI) >> 20;
- if (lvt_off < 0) {
- if (setup_APIC_eilvt(offset,
- THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0)) {
- pr_err(FW_BUG "cpu %d, failed to "
- "setup threshold interrupt "
- "for bank %d, block %d "
- "(MSR%08X=0x%x%08x)",
- smp_processor_id(), bank, block,
- address, high, low);
- continue;
- }
- lvt_off = offset;
- } else if (lvt_off != offset) {
- pr_err(FW_BUG "cpu %d, invalid threshold "
- "interrupt offset %d for bank %d,"
- "block %d (MSR%08X=0x%x%08x)",
- smp_processor_id(), lvt_off, bank,
- block, address, high, low);
- continue;
- }
-
- high &= ~MASK_LVTOFF_HI;
- high |= lvt_off << 20;
- wrmsr(address, low, high);
+ offset = setup_APIC_mce(offset,
+ (high & MASK_LVTOFF_HI) >> 20);
- threshold_defaults.address = address;
- tr.b = &threshold_defaults;
- tr.reset = 0;
- tr.old_limit = 0;
- threshold_restart_bank(&tr);
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = address;
+ mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
}
}
b->interrupt_enable = !!new;
+ memset(&tr, 0, sizeof(tr));
tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
if (new < 1)
new = 1;
+ memset(&tr, 0, sizeof(tr));
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
- tr.reset = 0;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
continue;
err = threshold_create_bank(cpu, bank);
if (err)
- goto out;
+ return err;
}
-out:
+
return err;
}
{
int i;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- disable_lapic_nmi_watchdog();
-
for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
for (i--; i >= 0; i--)
release_perfctr_nmi(x86_pmu.perfctr + i);
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-
return false;
}
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
}
#else
static bool check_hw_exists(void)
{
u64 val, val_new = 0;
- int ret = 0;
+ int i, reg, ret = 0;
+
+ /*
+ * Check to see if the BIOS enabled any of the counters, if so
+ * complain and bail.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu.eventsel + i;
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ goto bios_fail;
+ }
+ if (x86_pmu.num_counters_fixed) {
+ reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (val & (0x03 << i*4))
+ goto bios_fail;
+ }
+ }
+
+ /*
+ * Now write a value and read it back to see if it matches,
+ * this is needed to detect certain hardware emulators (qemu/kvm)
+ * that don't trap on the MSR access and always return 0s.
+ */
val = 0xabcdUL;
- ret |= checking_wrmsrl(x86_pmu.perfctr, val);
+ ret = checking_wrmsrl(x86_pmu.perfctr, val);
ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
if (ret || val != val_new)
- return false;
+ goto msr_fail;
return true;
+
+bios_fail:
+ printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
+ printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
+ return false;
+
+msr_fail:
+ printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+ return false;
}
static void reserve_ds_buffers(void);
struct hw_perf_event *hwc = &event->hw;
u64 config;
- if (!hwc->sample_period) {
+ if (!is_sampling_event(event)) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
pr_info("no hardware sampling interrupt available.\n");
}
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
struct event_constraint *c;
int err;
err = amd_pmu_init();
break;
default:
- return;
+ return 0;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
- return;
+ return 0;
}
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
- if (!check_hw_exists()) {
- pr_cont("Broken PMU hardware detected, software events only.\n");
- return;
- }
+ if (!check_hw_exists())
+ return 0;
pr_cont("%s PMU driver.\n", x86_pmu.name);
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
perf_cpu_notifier(x86_pmu_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event)
{
perf_callchain_store(entry, regs->ip);
- dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+ dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
}
#ifdef CONFIG_COMPAT
#ifdef CONFIG_CPU_SUP_AMD
-static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
return &emptyconstraint;
}
-static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+static struct amd_nb *amd_alloc_nb(int cpu)
{
struct amd_nb *nb;
int i;
if (!nb)
return NULL;
- nb->nb_id = nb_id;
+ nb->nb_id = -1;
/*
* initialize all possible NB constraints
if (boot_cpu_data.x86_max_cores < 2)
return NOTIFY_OK;
- cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+ cpuc->amd_nb = amd_alloc_nb(cpu);
if (!cpuc->amd_nb)
return NOTIFY_BAD;
nb_id = amd_get_nb_id(cpu);
WARN_ON_ONCE(nb_id == BAD_APICID);
- raw_spin_lock(&amd_nb_lock);
-
for_each_online_cpu(i) {
nb = per_cpu(cpu_hw_events, i).amd_nb;
if (WARN_ON_ONCE(!nb))
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
-
- raw_spin_unlock(&amd_nb_lock);
}
static void amd_pmu_cpu_dead(int cpu)
cpuhw = &per_cpu(cpu_hw_events, cpu);
- raw_spin_lock(&amd_nb_lock);
-
if (cpuhw->amd_nb) {
struct amd_nb *nb = cpuhw->amd_nb;
cpuhw->amd_nb = NULL;
}
-
- raw_spin_unlock(&amd_nb_lock);
}
static __initconst const struct x86_pmu amd_pmu = {
if (ret)
return ret;
+ if (event->attr.precise_ip &&
+ (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.ANY_P
+ * (0x00c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * INST_RETIRED.ANY_P counts the number of cycles that retires
+ * CNTMASK instructions. By setting CNTMASK to a value (16)
+ * larger than the maximum number of instructions that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less instructions, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+
if (event->attr.type != PERF_TYPE_RAW)
return 0;
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
-struct nmi_watchdog_ctlblk {
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
- int (*reserve)(void);
- void (*unreserve)(void);
- int (*setup)(unsigned nmi_hz);
- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
- void (*stop)(void);
- unsigned perfctr;
- unsigned evntsel;
- u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
- if (wd_ops)
- wd_ops->unreserve();
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (!wd_ops)
- return;
- if (!wd_ops->reserve()) {
- printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
- return;
- }
-
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
- touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
- if (!reserve_perfctr_nmi(wd_ops->perfctr))
- return 0;
-
- if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
- release_perfctr_nmi(wd_ops->perfctr);
- return 0;
- }
- return 1;
-}
-
-static void single_msr_unreserve(void)
-{
- release_evntsel_nmi(wd_ops->evntsel);
- release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- /* KVM doesn't implement this MSR */
- if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
- return 0;
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /*
- * P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant.
- * ArchPerfom/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
-#define P4_ESCR_OS (1 << 3)
-#define P4_ESCR_USR (1 << 2)
-#define P4_CCCR_OVF_PMI0 (1 << 26)
-#define P4_CCCR_OVF_PMI1 (1 << 27)
-#define P4_CCCR_THRESHOLD(N) ((N) << 20)
-#define P4_CCCR_COMPLEMENT (1 << 19)
-#define P4_CCCR_COMPARE (1 << 18)
-#define P4_CCCR_REQUIRED (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE (1 << 12)
-#define P4_CCCR_OVF (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /*
- * performance counters are shared resources
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
-
- /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
- if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
- cccr_val = P4_CCCR_OVF_PMI0;
- else
- cccr_val = P4_CCCR_OVF_PMI1;
- cccr_val |= P4_CCCR_ESCR_SELECT(4);
- }
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
- if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
- return 0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
- goto fail1;
-#endif
- if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
- goto fail2;
- /* RED-PEN why is ESCR1 not reserved here? */
- return 1;
- fail2:
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
- return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
- release_evntsel_nmi(MSR_P4_CRU_ESCR0);
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- unsigned dummy;
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
- /* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length <
- (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
- intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 == 6 ||
- (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
- wd_ops = &k7_wd_ops;
- return;
- case X86_VENDOR_INTEL:
- /* Work around where perfctr1 doesn't have a working enable
- * bit as described in the following errata:
- * AE49 Core Duo and Intel Core Solo 65 nm
- * AN49 Intel Pentium Dual-Core
- * AF49 Dual-Core Intel Xeon Processor LV
- */
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
- ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
- boot_cpu_data.x86_mask == 4))) {
- intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
- intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
- }
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- wd_ops = &intel_arch_wd_ops;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 13)
- return;
-
- wd_ops = &p6_wd_ops;
- break;
- case 15:
- wd_ops = &p4_wd_ops;
- break;
- default:
- return;
- }
- break;
- }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
- if (!wd_ops) {
- probe_nmi_watchdog();
- if (!wd_ops) {
- printk(KERN_INFO "NMI watchdog: CPU not supported\n");
- return -1;
- }
-
- if (!wd_ops->reserve()) {
- printk(KERN_ERR
- "NMI watchdog: cannot reserve perfctrs\n");
- return -1;
- }
- }
-
- if (!(wd_ops->setup(nmi_hz))) {
- printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
- raw_smp_processor_id());
- return -1;
- }
-
- return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
- if (wd_ops)
- wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
- hz = adjust_for_32bit_ctr(hz);
- return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
-
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
-
- wd_ops->rearm(wd, nmi_hz);
- return 1;
-}
void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, char *log_lvl)
{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
+ unsigned long *stack)
{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ show_trace_log_lvl(task, regs, stack, "");
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ show_stack_log_lvl(task, NULL, sp, "");
}
/*
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ show_trace(NULL, NULL, &stack);
}
EXPORT_SYMBOL(dump_stack);
#include <asm/stacktrace.h>
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
int graph = 0;
+ unsigned long bp;
if (!task)
task = current;
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
for (;;) {
struct thread_info *context;
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
touch_nmi_watchdog();
}
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
u8 *ip;
printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, ®s->sp,
- 0, KERN_EMERG);
+ show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG);
printk(KERN_EMERG "Code: ");
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
+ unsigned long bp;
if (!task)
task = current;
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
preempt_enable();
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Stack:\n");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, KERN_EMERG);
+ KERN_EMERG);
printk(KERN_EMERG "Code: ");
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_X86_MRST_EARLY_PRINTK
+#ifdef CONFIG_EARLY_PRINTK_MRST
if (!strncmp(buf, "mrst", 4)) {
mrst_early_console_init();
early_console_register(&early_mrst_console, keep);
hsu_early_console_init();
early_console_register(&early_hsu_console, keep);
}
-
#endif
buf++;
}
+++ /dev/null
-/*
- * early_printk_mrst.c - early consoles for Intel MID platforms
- *
- * Copyright (c) 2008-2010, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-/*
- * This file implements two early consoles named mrst and hsu.
- * mrst is based on Maxim3110 spi-uart device, it exists in both
- * Moorestown and Medfield platforms, while hsu is based on a High
- * Speed UART device which only exists in the Medfield platform
- */
-
-#include <linux/serial_reg.h>
-#include <linux/serial_mfd.h>
-#include <linux/kmsg_dump.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
-
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/mrst.h>
-
-#define MRST_SPI_TIMEOUT 0x200000
-#define MRST_REGBASE_SPI0 0xff128000
-#define MRST_REGBASE_SPI1 0xff128400
-#define MRST_CLK_SPI0_REG 0xff11d86c
-
-/* Bit fields in CTRLR0 */
-#define SPI_DFS_OFFSET 0
-
-#define SPI_FRF_OFFSET 4
-#define SPI_FRF_SPI 0x0
-#define SPI_FRF_SSP 0x1
-#define SPI_FRF_MICROWIRE 0x2
-#define SPI_FRF_RESV 0x3
-
-#define SPI_MODE_OFFSET 6
-#define SPI_SCPH_OFFSET 6
-#define SPI_SCOL_OFFSET 7
-#define SPI_TMOD_OFFSET 8
-#define SPI_TMOD_TR 0x0 /* xmit & recv */
-#define SPI_TMOD_TO 0x1 /* xmit only */
-#define SPI_TMOD_RO 0x2 /* recv only */
-#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
-
-#define SPI_SLVOE_OFFSET 10
-#define SPI_SRL_OFFSET 11
-#define SPI_CFS_OFFSET 12
-
-/* Bit fields in SR, 7 bits */
-#define SR_MASK 0x7f /* cover 7 bits */
-#define SR_BUSY (1 << 0)
-#define SR_TF_NOT_FULL (1 << 1)
-#define SR_TF_EMPT (1 << 2)
-#define SR_RF_NOT_EMPT (1 << 3)
-#define SR_RF_FULL (1 << 4)
-#define SR_TX_ERR (1 << 5)
-#define SR_DCOL (1 << 6)
-
-struct dw_spi_reg {
- u32 ctrl0;
- u32 ctrl1;
- u32 ssienr;
- u32 mwcr;
- u32 ser;
- u32 baudr;
- u32 txfltr;
- u32 rxfltr;
- u32 txflr;
- u32 rxflr;
- u32 sr;
- u32 imr;
- u32 isr;
- u32 risr;
- u32 txoicr;
- u32 rxoicr;
- u32 rxuicr;
- u32 msticr;
- u32 icr;
- u32 dmacr;
- u32 dmatdlr;
- u32 dmardlr;
- u32 idr;
- u32 version;
-
- /* Currently operates as 32 bits, though only the low 16 bits matter */
- u32 dr;
-} __packed;
-
-#define dw_readl(dw, name) __raw_readl(&(dw)->name)
-#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
-
-/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
-static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
-
-static u32 *pclk_spi0;
-/* Always contains an accessable address, start with 0 */
-static struct dw_spi_reg *pspi;
-
-static struct kmsg_dumper dw_dumper;
-static int dumper_registered;
-
-static void dw_kmsg_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason,
- const char *s1, unsigned long l1,
- const char *s2, unsigned long l2)
-{
- int i;
-
- /* When run to this, we'd better re-init the HW */
- mrst_early_console_init();
-
- for (i = 0; i < l1; i++)
- early_mrst_console.write(&early_mrst_console, s1 + i, 1);
- for (i = 0; i < l2; i++)
- early_mrst_console.write(&early_mrst_console, s2 + i, 1);
-}
-
-/* Set the ratio rate to 115200, 8n1, IRQ disabled */
-static void max3110_write_config(void)
-{
- u16 config;
-
- config = 0xc001;
- dw_writel(pspi, dr, config);
-}
-
-/* Translate char to a eligible word and send to max3110 */
-static void max3110_write_data(char c)
-{
- u16 data;
-
- data = 0x8000 | c;
- dw_writel(pspi, dr, data);
-}
-
-void mrst_early_console_init(void)
-{
- u32 ctrlr0 = 0;
- u32 spi0_cdiv;
- u32 freq; /* Freqency info only need be searched once */
-
- /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
- pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- MRST_CLK_SPI0_REG);
- spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
- freq = 100000000 / (spi0_cdiv + 1);
-
- if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
- mrst_spi_paddr = MRST_REGBASE_SPI1;
-
- pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- mrst_spi_paddr);
-
- /* Disable SPI controller */
- dw_writel(pspi, ssienr, 0);
-
- /* Set control param, 8 bits, transmit only mode */
- ctrlr0 = dw_readl(pspi, ctrl0);
-
- ctrlr0 &= 0xfcc0;
- ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
- | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
- dw_writel(pspi, ctrl0, ctrlr0);
-
- /*
- * Change the spi0 clk to comply with 115200 bps, use 100000 to
- * calculate the clk dividor to make the clock a little slower
- * than real baud rate.
- */
- dw_writel(pspi, baudr, freq/100000);
-
- /* Disable all INT for early phase */
- dw_writel(pspi, imr, 0x0);
-
- /* Set the cs to spi-uart */
- dw_writel(pspi, ser, 0x2);
-
- /* Enable the HW, the last step for HW init */
- dw_writel(pspi, ssienr, 0x1);
-
- /* Set the default configuration */
- max3110_write_config();
-
- /* Register the kmsg dumper */
- if (!dumper_registered) {
- dw_dumper.dump = dw_kmsg_dump;
- kmsg_dump_register(&dw_dumper);
- dumper_registered = 1;
- }
-}
-
-/* Slave select should be called in the read/write function */
-static void early_mrst_spi_putc(char c)
-{
- unsigned int timeout;
- u32 sr;
-
- timeout = MRST_SPI_TIMEOUT;
- /* Early putc needs to make sure the TX FIFO is not full */
- while (--timeout) {
- sr = dw_readl(pspi, sr);
- if (!(sr & SR_TF_NOT_FULL))
- cpu_relax();
- else
- break;
- }
-
- if (!timeout)
- pr_warning("MRST earlycon: timed out\n");
- else
- max3110_write_data(c);
-}
-
-/* Early SPI only uses polling mode */
-static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
-{
- int i;
-
- for (i = 0; i < n && *str; i++) {
- if (*str == '\n')
- early_mrst_spi_putc('\r');
- early_mrst_spi_putc(*str);
- str++;
- }
-}
-
-struct console early_mrst_console = {
- .name = "earlymrst",
- .write = early_mrst_spi_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-
-/*
- * Following is the early console based on Medfield HSU (High
- * Speed UART) device.
- */
-#define HSU_PORT2_PADDR 0xffa28180
-
-static void __iomem *phsu;
-
-void hsu_early_console_init(void)
-{
- u8 lcr;
-
- phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- HSU_PORT2_PADDR);
-
- /* Disable FIFO */
- writeb(0x0, phsu + UART_FCR);
-
- /* Set to default 115200 bps, 8n1 */
- lcr = readb(phsu + UART_LCR);
- writeb((0x80 | lcr), phsu + UART_LCR);
- writeb(0x18, phsu + UART_DLL);
- writeb(lcr, phsu + UART_LCR);
- writel(0x3600, phsu + UART_MUL*4);
-
- writeb(0x8, phsu + UART_MCR);
- writeb(0x7, phsu + UART_FCR);
- writeb(0x3, phsu + UART_LCR);
-
- /* Clear IRQ status */
- readb(phsu + UART_LSR);
- readb(phsu + UART_RX);
- readb(phsu + UART_IIR);
- readb(phsu + UART_MSR);
-
- /* Enable FIFO */
- writeb(0x7, phsu + UART_FCR);
-}
-
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
-static void early_hsu_putc(char ch)
-{
- unsigned int timeout = 10000; /* 10ms */
- u8 status;
-
- while (--timeout) {
- status = readb(phsu + UART_LSR);
- if (status & BOTH_EMPTY)
- break;
- udelay(1);
- }
-
- /* Only write the char when there was no timeout */
- if (timeout)
- writeb(ch, phsu + UART_TX);
-}
-
-static void early_hsu_write(struct console *con, const char *str, unsigned n)
-{
- int i;
-
- for (i = 0; i < n && *str; i++) {
- if (*str == '\n')
- early_hsu_putc('\r');
- early_hsu_putc(*str);
- str++;
- }
-}
-
-struct console early_hsu_console = {
- .name = "earlyhsu",
- .write = early_hsu_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/module.h>
#include <trace/syscall.h>
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
+ set_all_modules_text_rw();
modifying_code = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
{
modifying_code = 0;
+ set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
}
case X86_SUBARCH_MRST:
x86_mrst_early_setup();
break;
+ case X86_SUBARCH_CE4100:
+ x86_ce4100_early_setup();
+ break;
default:
i386_default_early_setup();
break;
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
ja 6f
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
#endif
iret
+#include "verify_cpu.S"
+
__REFDATA
.align 4
ENTRY(initial_code)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
preempt_disable();
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
return 0;
}
-/* Replace a breakpoint (int3) with a relative jump. */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+ u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
{
- unsigned char jmp_code[RELATIVEJUMP_SIZE];
s32 rel = (s32)((long)op->optinsn.insn -
((long)op->kp.addr + RELATIVEJUMP_SIZE));
memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
RELATIVE_ADDR_SIZE);
- jmp_code[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&jmp_code[1]) = rel;
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ WARN_ON(kprobe_disabled(&op->kp));
+ /* Setup param */
+ setup_optimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_del_init(&op->list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
/*
* text_poke_smp doesn't support NMI/MCE code modifying.
* However, since kprobes itself also doesn't support NMI/MCE
* code probing, it's not a problem.
*/
- text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
- return 0;
+ text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
+{
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ /* Setup param */
+ setup_unoptimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_move(&op->list, done_list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp_batch(jump_poke_params, c);
}
/* Replace a relative jump with a breakpoint (int3). */
}
return 0;
}
+
+static int __kprobes init_poke_params(void)
+{
+ /* Allocate code buffer and parameter array */
+ jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_bufs)
+ return -ENOMEM;
+
+ jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_params) {
+ kfree(jump_poke_bufs);
+ jump_poke_bufs = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+#else /* !CONFIG_OPTPROBES */
+static int __kprobes init_poke_params(void)
+{
+ return 0;
+}
#endif
int __init arch_init_kprobes(void)
{
- return 0;
+ return init_poke_params();
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
return 0;
}
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
static void *
get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
{
u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
void *mc;
- if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
- return NULL;
+ get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
if (section_hdr[0] != UCODE_UCODE_TYPE) {
pr_err("error: invalid type field in container file section header\n");
return NULL;
}
- mc = vmalloc(UCODE_MAX_SIZE);
- if (mc) {
- memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
- total_size)) {
- vfree(mc);
- mc = NULL;
- } else
- *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
- }
+ mc = vzalloc(UCODE_MAX_SIZE);
+ if (!mc)
+ return NULL;
+
+ get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
+ *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+
return mc;
}
unsigned int *buf_pos = (unsigned int *)container_hdr;
unsigned long size;
- if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
- return 0;
+ get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
size = buf_pos[2];
}
buf += UCODE_CONTAINER_HEADER_SIZE;
- if (get_ucode_data(equiv_cpu_table, buf, size)) {
- vfree(equiv_cpu_table);
- return 0;
- }
+ get_ucode_data(equiv_cpu_table, buf, size);
return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
}
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
- k8_flush_garts();
+ amd_flush_garts();
need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
{
int i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
enable_gart_translation(dev, __pa(agp_gatt_table));
}
/* Flush the GART-TLB to remove stale entries */
- k8_flush_garts();
+ amd_flush_garts();
}
/*
if (!fix_up_north_bridges)
return;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
/*
* Don't enable translations just yet. That is the next
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
{
unsigned aper_size, gatt_size, new_aper_size;
unsigned aper_base, new_aper_base;
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < k8_northbridges.num; i++) {
- dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
if (!no_agp)
return;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < k8_northbridges.num; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
u32 ctl;
- dev = k8_northbridges.nb_misc[i];
+ dev = node_to_amd_nb(i)->misc;
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
unsigned long scratch;
long i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
- /* Add other K8 AGP bridge drivers here */
+ /* Add other AMD AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
- (no_agp && init_k8_gatt(&info) < 0)) {
+ (no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
pr_warning("falling back to iommu=soft.\n");
void show_regs(struct pt_regs *regs)
{
show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
- regs->bp);
+ show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
}
void show_regs_common(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
+ trace_cpu_idle((ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)¤t_thread_info()->flags);
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)¤t_thread_info()->flags);
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
/*
stop_critical_timings();
pm_idle();
start_critical_timings();
-
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
start_critical_timings();
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT,
+ smp_processor_id());
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
outb(1, 0x92);
}
+static void ce4100_reset(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < 10; i++) {
+ outb(0x2, 0xcf9);
+ udelay(50);
+ }
+}
+
struct device_fixup {
unsigned int vendor;
unsigned int device;
void (*reboot_fixup)(struct pci_dev *);
};
+/*
+ * PCI ids solely used for fixups_table go here
+ */
+#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
+
static const struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
+{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
};
/*
void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
- int k8 = 0;
+ int amd = 0;
unsigned long flags;
#ifdef CONFIG_X86_32
acpi = acpi_numa_init();
#endif
-#ifdef CONFIG_K8_NUMA
+#ifdef CONFIG_AMD_NUMA
if (!acpi)
- k8 = !k8_numa_init(0, max_pfn);
+ amd = !amd_numa_init(0, max_pfn);
#endif
- initmem_init(0, max_pfn, acpi, k8);
+ initmem_init(0, max_pfn, acpi, amd);
memblock_find_dma_reserve();
dma32_reserve_bootmem();
#endif
init_apic_mappings();
- ioapic_init_mappings();
-
- /* need to wait for io_apic is mapped */
- probe_nr_irqs_gsi();
+ ioapic_and_gsi_init();
kvm_guest_init();
*/
smp_store_cpu_info(cpuid);
+ /*
+ * This must be done before setting cpu_online_mask
+ * or calling notify_cpu_starting.
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+ wmb();
+
notify_cpu_starting(cpuid);
/*
*/
check_tsc_sync_target();
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- enable_NMI_through_LVT0();
- legacy_pic->unmask(0);
- }
-
- /* This must be done before setting cpu_online_mask */
- set_cpu_sibling_map(raw_smp_processor_id());
- wmb();
-
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
printk(KERN_INFO "SMP mode deactivated.\n");
smpboot_clear_io_apic();
- localise_nmi_watchdog();
-
connect_bsp_APIC();
setup_local_APIC();
end_local_APIC_setup();
preempt_enable();
}
+void arch_disable_nonboot_cpus_begin(void)
+{
+ /*
+ * Avoid the smp alternatives switch during the disable_nonboot_cpus().
+ * In the suspend path, we will be back in the SMP mode shortly anyways.
+ */
+ skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+ skip_smp_alternatives = false;
+}
+
void arch_enable_nonboot_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- check_nmi_watchdog();
mtrr_aps_init();
}
if (cpu == 0)
return -EBUSY;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
cpu_disable_common();
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+ dump_trace(current, NULL, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
{
- dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+ dump_trace(current, regs, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+ dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
#include <asm/hpet.h>
#include <asm/time.h>
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
#ifdef CONFIG_X86_64
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
#endif
/* Keep nmi watchdog up to date */
inc_irq_stat(irq0_irqs);
- /* Optimized out for !IO_APIC and x86_64 */
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- raw_spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- raw_spin_unlock(&i8259A_lock);
- }
-
global_clock_event->event_handler(global_clock_event);
/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
no_longmode:
hlt
jmp no_longmode
-#include "verify_cpu_64.S"
+#include "verify_cpu.S"
# Careful these need to be in the same 64K segment as the above;
tidt:
static int ignore_nmis;
+int unknown_nmi_panic;
+
static inline void conditional_sti(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
die("general protection fault", regs, error_code);
}
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
static notrace __kprobes void
mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
reason = (reason & 0xf) | 8;
outb(reason, 0x61);
- i = 2000;
- while (--i)
- udelay(1000);
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
reason &= ~8;
outb(reason, 0x61);
reason, smp_processor_id());
printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
-
-#ifndef CONFIG_LOCKUP_DETECTOR
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
- unknown_nmi_error(reason, regs);
-#else
- unknown_nmi_error(reason, regs);
#endif
+ unknown_nmi_error(reason, regs);
return;
}
void stop_nmi(void)
{
- acpi_nmi_disable();
ignore_nmis++;
}
void restart_nmi(void)
{
ignore_nmis--;
- acpi_nmi_enable();
}
/* May run on IST stack. */
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
+
+ if (tsc_clocksource_reliable)
+ return 0;
/*
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
if (num_possible_cpus() > 1)
- tsc_unstable = 1;
+ return 1;
}
- return tsc_unstable;
+ return 0;
+}
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work - ignored.
+ *
+ * This functions uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomolies (too many SMIs, etc),
+ * or the refined calibration is off by 1% of the fast early
+ * calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (check_tsc_unstable())
+ goto out;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (!hpet && !ref_start && !ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ printk(KERN_INFO "Refined TSC clocksource calibration: "
+ "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+out:
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
-static void __init init_tsc_clocksource(void)
+
+static int __init init_tsc_clocksource(void)
{
+ if (!cpu_has_tsc || tsc_disabled > 0)
+ return 0;
+
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
mark_tsc_unstable("TSCs unsynchronized");
check_system_tsc_reliable();
- init_tsc_clocksource();
}
--- /dev/null
+/*
+ *
+ * verify_cpu.S - Code for cpu long mode and SSE verification. This
+ * code has been borrowed from boot/setup.S and was introduced by
+ * Andi Kleen.
+ *
+ * Copyright (c) 2007 Andi Kleen (ak@suse.de)
+ * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ *
+ * This is a common code for verification whether CPU supports
+ * long mode and SSE or not. It is not called directly instead this
+ * file is included at various places and compiled in that context.
+ * This file is expected to run in 32bit code. Currently:
+ *
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verfication
+ * arch/x86/kernel/head_32.S: processor startup
+ *
+ * verify_cpu, returns the status of longmode and SSE in register %eax.
+ * 0: Success 1: Failure
+ *
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
+ * The caller needs to check for the error code and take the action
+ * appropriately. Either display a message or halt.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+
+verify_cpu:
+ pushfl # Save caller passed flags
+ pushl $0 # Kill any dangerous flags
+ popfl
+
+ pushfl # standard way to check for cpuid
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz verify_cpu_no_longmode # cpu has no cpuid
+
+ movl $0x0,%eax # See if cpuid 1 is implemented
+ cpuid
+ cmpl $0x1,%eax
+ jb verify_cpu_no_longmode # no cpuid 1
+
+ xor %di,%di
+ cmpl $0x68747541,%ebx # AuthenticAMD
+ jnz verify_cpu_noamd
+ cmpl $0x69746e65,%edx
+ jnz verify_cpu_noamd
+ cmpl $0x444d4163,%ecx
+ jnz verify_cpu_noamd
+ mov $1,%di # cpu is from AMD
+ jmp verify_cpu_check
+
+verify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz verify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz verify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz verify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja verify_cpu_clear_xd # family > 6, ok
+ jb verify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb verify_cpu_check # family == 6, model < 0xd, skip
+
+verify_cpu_clear_xd:
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc verify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+verify_cpu_check:
+ movl $0x1,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK0,%edx
+ xorl $REQUIRED_MASK0,%edx
+ jnz verify_cpu_no_longmode
+
+ movl $0x80000000,%eax # See if extended cpuid is implemented
+ cpuid
+ cmpl $0x80000001,%eax
+ jb verify_cpu_no_longmode # no extended cpuid
+
+ movl $0x80000001,%eax # Does the cpu have what it takes
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz verify_cpu_no_longmode
+
+verify_cpu_sse_test:
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je verify_cpu_sse_ok
+ test %di,%di
+ jz verify_cpu_no_longmode # only try to force SSE on AMD
+ movl $MSR_K7_HWCR,%ecx
+ rdmsr
+ btr $15,%eax # enable SSE
+ wrmsr
+ xor %di,%di # don't loop
+ jmp verify_cpu_sse_test # try again
+
+verify_cpu_no_longmode:
+ popfl # Restore caller passed flags
+ movl $1,%eax
+ ret
+verify_cpu_sse_ok:
+ popfl # Restore caller passed flags
+ xorl %eax, %eax
+ ret
+++ /dev/null
-/*
- *
- * verify_cpu.S - Code for cpu long mode and SSE verification. This
- * code has been borrowed from boot/setup.S and was introduced by
- * Andi Kleen.
- *
- * Copyright (c) 2007 Andi Kleen (ak@suse.de)
- * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
- * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
- *
- * This is a common code for verification whether CPU supports
- * long mode and SSE or not. It is not called directly instead this
- * file is included at various places and compiled in that context.
- * Following are the current usage.
- *
- * This file is included by both 16bit and 32bit code.
- *
- * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- * verify_cpu, returns the status of cpu check in register %eax.
- * 0: Success 1: Failure
- *
- * The caller needs to check for the error code and take the action
- * appropriately. Either display a message or halt.
- */
-
-#include <asm/cpufeature.h>
-#include <asm/msr-index.h>
-
-verify_cpu:
- pushfl # Save caller passed flags
- pushl $0 # Kill any dangerous flags
- popfl
-
- pushfl # standard way to check for cpuid
- popl %eax
- movl %eax,%ebx
- xorl $0x200000,%eax
- pushl %eax
- popfl
- pushfl
- popl %eax
- cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
-
- movl $0x0,%eax # See if cpuid 1 is implemented
- cpuid
- cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
-
- xor %di,%di
- cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
- cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
- cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
- mov $1,%di # cpu is from AMD
-
-verify_cpu_noamd:
- movl $0x1,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK0,%edx
- xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
-
- movl $0x80000000,%eax # See if extended cpuid is implemented
- cpuid
- cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
-
- movl $0x80000001,%eax # Does the cpu have what it takes
- cpuid
- andl $REQUIRED_MASK1,%edx
- xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
-
-verify_cpu_sse_test:
- movl $1,%eax
- cpuid
- andl $SSE_MASK,%edx
- cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
- test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $MSR_K7_HWCR,%ecx
- rdmsr
- btr $15,%eax # enable SSE
- wrmsr
- xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
-
-verify_cpu_no_longmode:
- popfl # Restore caller passed flags
- movl $1,%eax
- ret
-verify_cpu_sse_ok:
- popfl # Restore caller passed flags
- xorl %eax, %eax
- ret
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
+ data PT_LOAD FLAGS(6); /* RW_ */
#ifdef CONFIG_X86_64
user PT_LOAD FLAGS(5); /* R_E */
#ifdef CONFIG_SMP
EXCEPTION_TABLE(16) :text = 0x9090
+#if defined(CONFIG_DEBUG_RODATA)
+ /* .text should occupy whole number of pages */
+ . = ALIGN(PAGE_SIZE);
+#endif
X64_ALIGN_DEBUG_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X64_ALIGN_DEBUG_RODATA_END
__bss_start = .;
*(.bss..page_aligned)
*(.bss)
- . = ALIGN(4);
+ . = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
s->pics[1].elcr_mask = 0xde;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
/*
* Initialize PIO device
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+ sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+ i << 30,
PT32_ROOT_LEVEL, 1, ACC_ALL,
NULL);
root = __pa(sp->spt);
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
--- /dev/null
+/*
+ * AMD NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * This version reads it directly from the AMD northbridge.
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/memblock.h>
+
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <linux/acpi.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/amd_nb.h>
+
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
+static __init int find_northbridge(void)
+{
+ int num;
+
+ for (num = 0; num < 32; num++) {
+ u32 header;
+
+ header = read_pci_config(0, num, 0, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+ continue;
+
+ header = read_pci_config(0, num, 1, 0x00);
+ if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+ header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+ continue;
+ return num;
+ }
+
+ return -1;
+}
+
+static __init void early_get_boot_cpu_id(void)
+{
+ /*
+ * need to get the APIC ID of the BSP so can use that to
+ * create apicid_to_node in amd_scan_nodes()
+ */
+#ifdef CONFIG_X86_MPPARSE
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+#endif
+ early_init_lapic_mapping();
+}
+
+int __init amd_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+ int ret = 0;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[ret].start = nodes[i].start;
+ physnodes[ret].end = nodes[i].end;
+ ret++;
+ }
+ return ret;
+}
+
+int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+ unsigned long start = PFN_PHYS(start_pfn);
+ unsigned long end = PFN_PHYS(end_pfn);
+ unsigned numnodes;
+ unsigned long prevbase;
+ int i, nb, found = 0;
+ u32 nodeid, reg;
+
+ if (!early_pci_allowed())
+ return -1;
+
+ nb = find_northbridge();
+ if (nb < 0)
+ return nb;
+
+ pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
+
+ reg = read_pci_config(0, nb, 0, 0x60);
+ numnodes = ((reg >> 4) & 0xF) + 1;
+ if (numnodes <= 1)
+ return -1;
+
+ pr_info("Number of physical nodes %d\n", numnodes);
+
+ prevbase = 0;
+ for (i = 0; i < 8; i++) {
+ unsigned long base, limit;
+
+ base = read_pci_config(0, nb, 1, 0x40 + i*8);
+ limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+ nodeid = limit & 7;
+ if ((base & 3) == 0) {
+ if (i < numnodes)
+ pr_info("Skipping disabled node %d\n", i);
+ continue;
+ }
+ if (nodeid >= numnodes) {
+ pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+ base, limit);
+ continue;
+ }
+
+ if (!limit) {
+ pr_info("Skipping node entry %d (base %lx)\n",
+ i, base);
+ continue;
+ }
+ if ((base >> 8) & 3 || (limit >> 8) & 3) {
+ pr_err("Node %d using interleaving mode %lx/%lx\n",
+ nodeid, (base >> 8) & 3, (limit >> 8) & 3);
+ return -1;
+ }
+ if (node_isset(nodeid, nodes_parsed)) {
+ pr_info("Node %d already present, skipping\n",
+ nodeid);
+ continue;
+ }
+
+ limit >>= 16;
+ limit <<= 24;
+ limit |= (1<<24)-1;
+ limit++;
+
+ if (limit > end)
+ limit = end;
+ if (limit <= base)
+ continue;
+
+ base >>= 16;
+ base <<= 24;
+
+ if (base < start)
+ base = start;
+ if (limit > end)
+ limit = end;
+ if (limit == base) {
+ pr_err("Empty node %d\n", nodeid);
+ continue;
+ }
+ if (limit < base) {
+ pr_err("Node %d bogus settings %lx-%lx.\n",
+ nodeid, base, limit);
+ continue;
+ }
+
+ /* Could sort here, but pun for now. Should not happen anyroads. */
+ if (prevbase > base) {
+ pr_err("Node map not sorted %lx,%lx\n",
+ prevbase, base);
+ return -1;
+ }
+
+ pr_info("Node %d MemBase %016lx Limit %016lx\n",
+ nodeid, base, limit);
+
+ found++;
+
+ nodes[nodeid].start = base;
+ nodes[nodeid].end = limit;
+
+ prevbase = base;
+
+ node_set(nodeid, nodes_parsed);
+ }
+
+ if (!found)
+ return -1;
+ return 0;
+}
+
+int __init amd_scan_nodes(void)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base;
+ int i;
+
+ BUG_ON(nodes_empty(nodes_parsed));
+ node_possible_map = nodes_parsed;
+ memnode_shift = compute_hash_shift(nodes, 8, NULL);
+ if (memnode_shift < 0) {
+ pr_err("No NUMA node hash function found. Contact maintainer\n");
+ return -1;
+ }
+ pr_info("Using node hash shift of %d\n", memnode_shift);
+
+ /* use the coreid bits from early_identify_cpu */
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = (1<<bits);
+ apicid_base = 0;
+ /* get the APIC ID of the BSP early for systems with apicid lifting */
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0) {
+ pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+ apicid_base = boot_cpu_physical_apicid;
+ }
+
+ for_each_node_mask(i, node_possible_map) {
+ int j;
+
+ memblock_x86_register_active_regions(i,
+ nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ apicid_to_node[(i << bits) + j] = i;
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
+
+ numa_init_array();
+ return 0;
+}
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
- * writeable first.
+ * writeable and non-executable first.
*/
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext sould be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
+ mark_nxdata_nx();
}
#endif
+++ /dev/null
-/*
- * AMD K8 NUMA support.
- * Discover the memory map and associated nodes.
- *
- * This version reads it directly from the K8 northbridge.
- *
- * Copyright 2002,2003 Andi Kleen, SuSE Labs.
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/nodemask.h>
-#include <linux/memblock.h>
-
-#include <asm/io.h>
-#include <linux/pci_ids.h>
-#include <linux/acpi.h>
-#include <asm/types.h>
-#include <asm/mmzone.h>
-#include <asm/proto.h>
-#include <asm/e820.h>
-#include <asm/pci-direct.h>
-#include <asm/numa.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/amd_nb.h>
-
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
-
-static __init int find_northbridge(void)
-{
- int num;
-
- for (num = 0; num < 32; num++) {
- u32 header;
-
- header = read_pci_config(0, num, 0, 0x00);
- if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
- header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
- header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
- continue;
-
- header = read_pci_config(0, num, 1, 0x00);
- if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
- header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
- header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
- continue;
- return num;
- }
-
- return -1;
-}
-
-static __init void early_get_boot_cpu_id(void)
-{
- /*
- * need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in k8_scan_nodes()
- */
-#ifdef CONFIG_X86_MPPARSE
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-#endif
- early_init_lapic_mapping();
-}
-
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long start = PFN_PHYS(start_pfn);
- unsigned long end = PFN_PHYS(end_pfn);
- unsigned numnodes;
- unsigned long prevbase;
- int i, nb, found = 0;
- u32 nodeid, reg;
-
- if (!early_pci_allowed())
- return -1;
-
- nb = find_northbridge();
- if (nb < 0)
- return nb;
-
- pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
-
- reg = read_pci_config(0, nb, 0, 0x60);
- numnodes = ((reg >> 4) & 0xF) + 1;
- if (numnodes <= 1)
- return -1;
-
- pr_info("Number of physical nodes %d\n", numnodes);
-
- prevbase = 0;
- for (i = 0; i < 8; i++) {
- unsigned long base, limit;
-
- base = read_pci_config(0, nb, 1, 0x40 + i*8);
- limit = read_pci_config(0, nb, 1, 0x44 + i*8);
-
- nodeid = limit & 7;
- if ((base & 3) == 0) {
- if (i < numnodes)
- pr_info("Skipping disabled node %d\n", i);
- continue;
- }
- if (nodeid >= numnodes) {
- pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
- base, limit);
- continue;
- }
-
- if (!limit) {
- pr_info("Skipping node entry %d (base %lx)\n",
- i, base);
- continue;
- }
- if ((base >> 8) & 3 || (limit >> 8) & 3) {
- pr_err("Node %d using interleaving mode %lx/%lx\n",
- nodeid, (base >> 8) & 3, (limit >> 8) & 3);
- return -1;
- }
- if (node_isset(nodeid, nodes_parsed)) {
- pr_info("Node %d already present, skipping\n",
- nodeid);
- continue;
- }
-
- limit >>= 16;
- limit <<= 24;
- limit |= (1<<24)-1;
- limit++;
-
- if (limit > end)
- limit = end;
- if (limit <= base)
- continue;
-
- base >>= 16;
- base <<= 24;
-
- if (base < start)
- base = start;
- if (limit > end)
- limit = end;
- if (limit == base) {
- pr_err("Empty node %d\n", nodeid);
- continue;
- }
- if (limit < base) {
- pr_err("Node %d bogus settings %lx-%lx.\n",
- nodeid, base, limit);
- continue;
- }
-
- /* Could sort here, but pun for now. Should not happen anyroads. */
- if (prevbase > base) {
- pr_err("Node map not sorted %lx,%lx\n",
- prevbase, base);
- return -1;
- }
-
- pr_info("Node %d MemBase %016lx Limit %016lx\n",
- nodeid, base, limit);
-
- found++;
-
- nodes[nodeid].start = base;
- nodes[nodeid].end = limit;
-
- prevbase = base;
-
- node_set(nodeid, nodes_parsed);
- }
-
- if (!found)
- return -1;
- return 0;
-}
-
-int __init k8_scan_nodes(void)
-{
- unsigned int bits;
- unsigned int cores;
- unsigned int apicid_base;
- int i;
-
- BUG_ON(nodes_empty(nodes_parsed));
- node_possible_map = nodes_parsed;
- memnode_shift = compute_hash_shift(nodes, 8, NULL);
- if (memnode_shift < 0) {
- pr_err("No NUMA node hash function found. Contact maintainer\n");
- return -1;
- }
- pr_info("Using node hash shift of %d\n", memnode_shift);
-
- /* use the coreid bits from early_identify_cpu */
- bits = boot_cpu_data.x86_coreid_bits;
- cores = (1<<bits);
- apicid_base = 0;
- /* get the APIC ID of the BSP early for systems with apicid lifting */
- early_get_boot_cpu_id();
- if (boot_cpu_physical_apicid > 0) {
- pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
- apicid_base = boot_cpu_physical_apicid;
- }
-
- for_each_node_mask(i, node_possible_map) {
- int j;
-
- memblock_x86_register_active_regions(i,
- nodes[i].start >> PAGE_SHIFT,
- nodes[i].end >> PAGE_SHIFT);
- for (j = apicid_base; j < cores + apicid_base; j++)
- apicid_to_node[(i << bits) + j] = i;
- setup_node_bootmem(i, nodes[i].start, nodes[i].end);
- }
-
- numa_init_array();
- return 0;
-}
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_bp(&e->trace, regs->bp);
+ save_stack_trace_regs(&e->trace, regs);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
static char *cmdline __initdata;
static int __init setup_physnodes(unsigned long start, unsigned long end,
- int acpi, int k8)
+ int acpi, int amd)
{
int nr_nodes = 0;
int ret = 0;
if (acpi)
nr_nodes = acpi_get_nodes(physnodes);
#endif
-#ifdef CONFIG_K8_NUMA
- if (k8)
- nr_nodes = k8_get_nodes(physnodes);
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ nr_nodes = amd_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
- * if the SRAT or K8 incorrectly reported the topology or the mem=
+ * if the SRAT or AMD code incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
for (i = 0; i < nr_nodes; i++) {
* numa=fake command-line option.
*/
static int __init numa_emulation(unsigned long start_pfn,
- unsigned long last_pfn, int acpi, int k8)
+ unsigned long last_pfn, int acpi, int amd)
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
int num_nodes;
int i;
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
+ num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
- int acpi, int k8)
+ int acpi, int amd)
{
int i;
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
+ if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
nodes_clear(node_online_map);
#endif
-#ifdef CONFIG_K8_NUMA
- if (!numa_off && k8 && !k8_scan_nodes())
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_off && amd && !amd_scan_nodes())
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
+#include <linux/pci.h>
#include <asm/e820.h>
#include <asm/processor.h>
unsigned long pfn)
{
pgprot_t forbidden = __pgprot(0);
+ pgprot_t required = __pgprot(0);
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
- if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
+#endif
/*
* The kernel text needs to be executable for obvious reasons
if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
__pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
+ /*
+ * .data and .bss should always be writable.
+ */
+ if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+ within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+ pgprot_val(required) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
/*
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+ prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
return prot;
}
{
unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot;
+ pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
unsigned int level;
* We are safe now. Check whether the new pgprot is the same:
*/
old_pte = *kpte;
- old_prot = new_prot = pte_pgprot(old_pte);
+ old_prot = new_prot = req_prot = pte_pgprot(old_pte);
- pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
- pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
/*
* old_pte points to the large page base address. So we need
pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(new_prot, address, pfn);
+ new_prot = static_protections(req_prot, address, pfn);
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
*/
- addr = address + PAGE_SIZE;
- pfn++;
- for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+ addr = address & pmask;
+ pfn = pte_pfn(old_pte);
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/
- if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
/*
* The address is aligned and the number of pages
* covers the full page.
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU or disabled in BIOS!\n");
+ "missing in CPU!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+ /* don't need to check apic_id here, because it is always 8 bits */
apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
}
apic_id = pa->apic_id;
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
if (!user_mode_vm(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
if (depth)
- dump_trace(NULL, regs, (unsigned long *)stack, 0,
+ dump_trace(NULL, regs, (unsigned long *)stack,
&backtrace_ops, &depth);
return;
}
case 0x14:
cpu_type = "x86-64/family14h";
break;
+ case 0x15:
+ cpu_type = "x86-64/family15h";
+ break;
default:
return -ENODEV;
}
int __init op_nmi_timer_init(struct oprofile_operations *ops)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
- return -ENODEV;
-
ops->start = timer_start;
ops->stop = timer_stop;
ops->cpu_type = "timer";
#include "op_x86_model.h"
#include "op_counter.h"
-#define NUM_COUNTERS 4
+#define NUM_COUNTERS 4
+#define NUM_COUNTERS_F15H 6
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
+#define NUM_VIRT_COUNTERS 32
#else
-#define NUM_VIRT_COUNTERS NUM_COUNTERS
+#define NUM_VIRT_COUNTERS 0
#endif
#define OP_EVENT_MASK 0x0FFF
#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
-static unsigned long reset_value[NUM_VIRT_COUNTERS];
+static int num_counters;
+static unsigned long reset_value[OP_MAX_COUNTER];
#define IBS_FETCH_SIZE 6
#define IBS_OP_SIZE 12
int i;
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
{
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!msrs->counters[i].addr)
continue;
release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
{
int i;
- for (i = 0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
goto fail;
if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
goto fail;
}
/* both registers must be reserved */
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ if (num_counters == NUM_COUNTERS_F15H) {
+ msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
+ msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
+ } else {
+ msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+ }
continue;
fail:
if (!counter_config[i].enabled)
int i;
/* setup reset_value */
- for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+ for (i = 0; i < OP_MAX_COUNTER; ++i) {
if (counter_config[i].enabled
&& msrs->counters[op_x86_virt_to_phys(i)].addr)
reset_value[i] = counter_config[i].count;
}
/* clear all counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!msrs->controls[i].addr)
continue;
rdmsrl(msrs->controls[i].addr, val);
}
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
u64 val;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
u64 val;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
* Subtle: stop on all counters to avoid race with setting our
* pm callback
*/
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
ret = setup_ibs_ctl(i);
if (ret)
return ret;
+ pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
return 0;
}
return 0;
}
-/* initialize the APIC for the IBS interrupts if available */
+/*
+ * check and reserve APIC extended interrupt LVT offset for IBS if
+ * available
+ *
+ * init_ibs() preforms implicitly cpu-local operations, so pin this
+ * thread to its current CPU
+ */
+
static void init_ibs(void)
{
- ibs_caps = get_ibs_caps();
+ preempt_disable();
+ ibs_caps = get_ibs_caps();
if (!ibs_caps)
- return;
+ goto out;
- if (__init_ibs_nmi()) {
+ if (__init_ibs_nmi() < 0)
ibs_caps = 0;
- return;
- }
+ else
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
- (unsigned)ibs_caps);
+out:
+ preempt_enable();
}
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
return 0;
}
+struct op_x86_model_spec op_amd_spec;
+
static int op_amd_init(struct oprofile_operations *ops)
{
init_ibs();
create_arch_files = ops->create_files;
ops->create_files = setup_ibs_files;
+
+ if (boot_cpu_data.x86 == 0x15) {
+ num_counters = NUM_COUNTERS_F15H;
+ } else {
+ num_counters = NUM_COUNTERS;
+ }
+
+ op_amd_spec.num_counters = num_counters;
+ op_amd_spec.num_controls = num_counters;
+ op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
+
return 0;
}
struct op_x86_model_spec op_amd_spec = {
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_COUNTERS,
- .num_virt_counters = NUM_VIRT_COUNTERS,
+ /* num_counters/num_controls filled in at runtime */
.reserved = MSR_AMD_EVENTSEL_RESERVED,
.event_mask = OP_EVENT_MASK,
.init = op_amd_init,
#include <linux/oprofile.h>
#include <linux/smp.h>
#include <linux/ptrace.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
obj-$(CONFIG_PCI_XEN) += xen.o
obj-y += fixup.o
+obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
obj-$(CONFIG_ACPI) += acpi.o
obj-y += legacy.o irq.o
--- /dev/null
+/*
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2010 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * The full GNU General Public License is included in this distribution
+ * in the file called LICENSE.GPL.
+ *
+ * Contact Information:
+ * Intel Corporation
+ * 2200 Mission College Blvd.
+ * Santa Clara, CA 97052
+ *
+ * This provides access methods for PCI registers that mis-behave on
+ * the CE4100. Each register can be assigned a private init, read and
+ * write routine. The exception to this is the bridge device. The
+ * bridge device is the only device on bus zero (0) that requires any
+ * fixup so it is a special case ATM
+ */
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+
+#include <asm/pci_x86.h>
+
+struct sim_reg {
+ u32 value;
+ u32 mask;
+};
+
+struct sim_dev_reg {
+ int dev_func;
+ int reg;
+ void (*init)(struct sim_dev_reg *reg);
+ void (*read)(struct sim_dev_reg *reg, u32 *value);
+ void (*write)(struct sim_dev_reg *reg, u32 value);
+ struct sim_reg sim_reg;
+};
+
+struct sim_reg_op {
+ void (*init)(struct sim_dev_reg *reg);
+ void (*read)(struct sim_dev_reg *reg, u32 value);
+ void (*write)(struct sim_dev_reg *reg, u32 value);
+};
+
+#define MB (1024 * 1024)
+#define KB (1024)
+#define SIZE_TO_MASK(size) (~(size - 1))
+
+#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
+{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
+ {0, SIZE_TO_MASK(size)} },
+
+static void reg_init(struct sim_dev_reg *reg)
+{
+ pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
+ ®->sim_reg.value);
+}
+
+static void reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ *value = reg->sim_reg.value;
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void reg_write(struct sim_dev_reg *reg, u32 value)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pci_config_lock, flags);
+ reg->sim_reg.value = (value & reg->sim_reg.mask) |
+ (reg->sim_reg.value & ~reg->sim_reg.mask);
+ raw_spin_unlock_irqrestore(&pci_config_lock, flags);
+}
+
+static void sata_reg_init(struct sim_dev_reg *reg)
+{
+ pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
+ ®->sim_reg.value);
+ reg->sim_reg.value += 0x400;
+}
+
+static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
+{
+ reg_read(reg, value);
+ if (*value != reg->sim_reg.mask)
+ *value |= 0x100;
+}
+
+void sata_revid_init(struct sim_dev_reg *reg)
+{
+ reg->sim_reg.value = 0x01060100;
+ reg->sim_reg.mask = 0;
+}
+
+static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
+{
+ reg_read(reg, value);
+}
+
+static struct sim_dev_reg bus1_fixups[] = {
+ DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
+ DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
+ DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+ DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0)
+ DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
+ DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
+ DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
+ DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
+ DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
+};
+
+static void __init init_sim_regs(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].init)
+ bus1_fixups[i].init(&bus1_fixups[i]);
+ }
+}
+
+static inline void extract_bytes(u32 *value, int reg, int len)
+{
+ uint32_t mask;
+
+ *value >>= ((reg & 3) * 8);
+ mask = 0xFFFFFFFF >> ((4 - len) * 8);
+ *value &= mask;
+}
+
+int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
+{
+ u32 av_bridge_base, av_bridge_limit;
+ int retval = 0;
+
+ switch (reg) {
+ /* Make BARs appear to not request any memory. */
+ case PCI_BASE_ADDRESS_0:
+ case PCI_BASE_ADDRESS_0 + 1:
+ case PCI_BASE_ADDRESS_0 + 2:
+ case PCI_BASE_ADDRESS_0 + 3:
+ *value = 0;
+ break;
+
+ /* Since subordinate bus number register is hardwired
+ * to zero and read only, so do the simulation.
+ */
+ case PCI_PRIMARY_BUS:
+ if (len == 4)
+ *value = 0x00010100;
+ break;
+
+ case PCI_SUBORDINATE_BUS:
+ *value = 1;
+ break;
+
+ case PCI_MEMORY_BASE:
+ case PCI_MEMORY_LIMIT:
+ /* Get the A/V bridge base address. */
+ pci_direct_conf1.read(0, 0, devfn,
+ PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
+
+ av_bridge_limit = av_bridge_base + (512*MB - 1);
+ av_bridge_limit >>= 16;
+ av_bridge_limit &= 0xFFF0;
+
+ av_bridge_base >>= 16;
+ av_bridge_base &= 0xFFF0;
+
+ if (reg == PCI_MEMORY_LIMIT)
+ *value = av_bridge_limit;
+ else if (len == 2)
+ *value = av_bridge_base;
+ else
+ *value = (av_bridge_limit << 16) | av_bridge_base;
+ break;
+ /* Make prefetchable memory limit smaller than prefetchable
+ * memory base, so not claim prefetchable memory space.
+ */
+ case PCI_PREF_MEMORY_BASE:
+ *value = 0xFFF0;
+ break;
+ case PCI_PREF_MEMORY_LIMIT:
+ *value = 0x0;
+ break;
+ /* Make IO limit smaller than IO base, so not claim IO space. */
+ case PCI_IO_BASE:
+ *value = 0xF0;
+ break;
+ case PCI_IO_LIMIT:
+ *value = 0;
+ break;
+ default:
+ retval = 1;
+ }
+ return retval;
+}
+
+static int ce4100_conf_read(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 *value)
+{
+ int i, retval = 1;
+
+ if (bus == 1) {
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].read) {
+ bus1_fixups[i].read(&(bus1_fixups[i]),
+ value);
+ extract_bytes(value, reg, len);
+ return 0;
+ }
+ }
+ }
+
+ if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
+ !bridge_read(devfn, reg, len, value))
+ return 0;
+
+ return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
+}
+
+static int ce4100_conf_write(unsigned int seg, unsigned int bus,
+ unsigned int devfn, int reg, int len, u32 value)
+{
+ int i;
+
+ if (bus == 1) {
+ for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
+ if (bus1_fixups[i].dev_func == devfn &&
+ bus1_fixups[i].reg == (reg & ~3) &&
+ bus1_fixups[i].write) {
+ bus1_fixups[i].write(&(bus1_fixups[i]),
+ value);
+ return 0;
+ }
+ }
+ }
+
+ /* Discard writes to A/V bridge BAR. */
+ if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
+ ((reg & ~3) == PCI_BASE_ADDRESS_0))
+ return 0;
+
+ return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
+}
+
+struct pci_raw_ops ce4100_pci_conf = {
+ .read = ce4100_conf_read,
+ .write = ce4100_conf_write,
+};
+
+static int __init ce4100_pci_init(void)
+{
+ init_sim_regs();
+ raw_pci_ops = &ce4100_pci_conf;
+ return 0;
+}
+subsys_initcall(ce4100_pci_init);
#include <linux/uaccess.h>
#include <asm/pci_x86.h>
#include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
#define PCIBIOS_HW_TYPE1_SPEC 0x10
#define PCIBIOS_HW_TYPE2_SPEC 0x20
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we let's an rw and x hole when pcibios is used. This shouldn't
+ * happen for modern system with mmconfig, and if you don't want it
+ * you could disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+ pcibios_enabled = 1;
+ set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
/*
* This is the standard structure used to identify the entry point
* to the BIOS32 Service Directory, as documented in
DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
bios32_entry);
bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+ set_bios_x();
if (check_pcibios())
return &pci_bios_access;
}
# Platform specific code goes here
+obj-y += ce4100/
obj-y += efi/
+obj-y += iris/
obj-y += mrst/
obj-y += olpc/
obj-y += scx200/
--- /dev/null
+obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
--- /dev/null
+/*
+ * Intel CE4100 platform specific setup code
+ *
+ * (C) Copyright 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/serial_reg.h>
+#include <linux/serial_8250.h>
+
+#include <asm/setup.h>
+#include <asm/io.h>
+
+static int ce4100_i8042_detect(void)
+{
+ return 0;
+}
+
+static void __init sdv_find_smp_config(void)
+{
+}
+
+#ifdef CONFIG_SERIAL_8250
+
+
+static unsigned int mem_serial_in(struct uart_port *p, int offset)
+{
+ offset = offset << p->regshift;
+ return readl(p->membase + offset);
+}
+
+/*
+ * The UART Tx interrupts are not set under some conditions and therefore serial
+ * transmission hangs. This is a silicon issue and has not been root caused. The
+ * workaround for this silicon issue checks UART_LSR_THRE bit and UART_LSR_TEMT
+ * bit of LSR register in interrupt handler to see whether at least one of these
+ * two bits is set, if so then process the transmit request. If this workaround
+ * is not applied, then the serial transmission may hang. This workaround is for
+ * errata number 9 in Errata - B step.
+*/
+
+static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
+{
+ unsigned int ret, ier, lsr;
+
+ if (offset == UART_IIR) {
+ offset = offset << p->regshift;
+ ret = readl(p->membase + offset);
+ if (ret & UART_IIR_NO_INT) {
+ /* see if the TX interrupt should have really set */
+ ier = mem_serial_in(p, UART_IER);
+ /* see if the UART's XMIT interrupt is enabled */
+ if (ier & UART_IER_THRI) {
+ lsr = mem_serial_in(p, UART_LSR);
+ /* now check to see if the UART should be
+ generating an interrupt (but isn't) */
+ if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
+ ret &= ~UART_IIR_NO_INT;
+ }
+ }
+ } else
+ ret = mem_serial_in(p, offset);
+ return ret;
+}
+
+static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
+{
+ offset = offset << p->regshift;
+ writel(value, p->membase + offset);
+}
+
+static void ce4100_serial_fixup(int port, struct uart_port *up,
+ unsigned short *capabilites)
+{
+#ifdef CONFIG_EARLY_PRINTK
+ /*
+ * Over ride the legacy port configuration that comes from
+ * asm/serial.h. Using the ioport driver then switching to the
+ * PCI memmaped driver hangs the IOAPIC
+ */
+ if (up->iotype != UPIO_MEM32) {
+ up->uartclk = 14745600;
+ up->mapbase = 0xdffe0200;
+ set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
+ up->mapbase & PAGE_MASK);
+ up->membase =
+ (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
+ up->membase += up->mapbase & ~PAGE_MASK;
+ up->iotype = UPIO_MEM32;
+ up->regshift = 2;
+ }
+#endif
+ up->iobase = 0;
+ up->serial_in = ce4100_mem_serial_in;
+ up->serial_out = ce4100_mem_serial_out;
+
+ *capabilites |= (1 << 12);
+}
+
+static __init void sdv_serial_fixup(void)
+{
+ serial8250_set_isa_configurator(ce4100_serial_fixup);
+}
+
+#else
+static inline void sdv_serial_fixup(void);
+#endif
+
+static void __init sdv_arch_setup(void)
+{
+ sdv_serial_fixup();
+}
+
+/*
+ * CE4100 specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_ce4100_early_setup(void)
+{
+ x86_init.oem.arch_setup = sdv_arch_setup;
+ x86_platform.i8042_detect = ce4100_i8042_detect;
+ x86_init.resources.probe_roms = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+ x86_init.mpparse.find_smp_config = sdv_find_smp_config;
+}
--- /dev/null
+obj-$(CONFIG_X86_32_IRIS) += iris.o
--- /dev/null
+/*
+ * Eurobraille/Iris power off support.
+ *
+ * Eurobraille's Iris machine is a PC with no APM or ACPI support.
+ * It is shutdown by a special I/O sequence which this module provides.
+ *
+ * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
+ *
+ * This program is free software ; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation ; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY ; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with the program ; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/moduleparam.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/pm.h>
+#include <asm/io.h>
+
+#define IRIS_GIO_BASE 0x340
+#define IRIS_GIO_INPUT IRIS_GIO_BASE
+#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1)
+#define IRIS_GIO_PULSE 0x80 /* First byte to send */
+#define IRIS_GIO_REST 0x00 /* Second byte to send */
+#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
+MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
+MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
+
+static int force;
+
+module_param(force, bool, 0);
+MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
+
+static void (*old_pm_power_off)(void);
+
+static void iris_power_off(void)
+{
+ outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
+ msleep(850);
+ outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
+}
+
+/*
+ * Before installing the power_off handler, try to make sure the OS is
+ * running on an Iris. Since Iris does not support DMI, this is done
+ * by reading its input port and seeing whether the read value is
+ * meaningful.
+ */
+static int iris_init(void)
+{
+ unsigned char status;
+ if (force != 1) {
+ printk(KERN_ERR "The force parameter has not been set to 1 so the Iris poweroff handler will not be installed.\n");
+ return -ENODEV;
+ }
+ status = inb(IRIS_GIO_INPUT);
+ if (status == IRIS_GIO_NODEV) {
+ printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
+ return -ENODEV;
+ }
+ old_pm_power_off = pm_power_off;
+ pm_power_off = &iris_power_off;
+ printk(KERN_INFO "Iris power_off handler installed.\n");
+
+ return 0;
+}
+
+static void iris_exit(void)
+{
+ pm_power_off = old_pm_power_off;
+ printk(KERN_INFO "Iris power_off handler uninstalled.\n");
+}
+
+module_init(iris_init);
+module_exit(iris_exit);
obj-$(CONFIG_X86_MRST) += mrst.o
+obj-$(CONFIG_X86_MRST) += vrtc.o
+obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
--- /dev/null
+/*
+ * early_printk_mrst.c - early consoles for Intel MID platforms
+ *
+ * Copyright (c) 2008-2010, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+/*
+ * This file implements two early consoles named mrst and hsu.
+ * mrst is based on Maxim3110 spi-uart device, it exists in both
+ * Moorestown and Medfield platforms, while hsu is based on a High
+ * Speed UART device which only exists in the Medfield platform
+ */
+
+#include <linux/serial_reg.h>
+#include <linux/serial_mfd.h>
+#include <linux/kmsg_dump.h>
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/mrst.h>
+
+#define MRST_SPI_TIMEOUT 0x200000
+#define MRST_REGBASE_SPI0 0xff128000
+#define MRST_REGBASE_SPI1 0xff128400
+#define MRST_CLK_SPI0_REG 0xff11d86c
+
+/* Bit fields in CTRLR0 */
+#define SPI_DFS_OFFSET 0
+
+#define SPI_FRF_OFFSET 4
+#define SPI_FRF_SPI 0x0
+#define SPI_FRF_SSP 0x1
+#define SPI_FRF_MICROWIRE 0x2
+#define SPI_FRF_RESV 0x3
+
+#define SPI_MODE_OFFSET 6
+#define SPI_SCPH_OFFSET 6
+#define SPI_SCOL_OFFSET 7
+#define SPI_TMOD_OFFSET 8
+#define SPI_TMOD_TR 0x0 /* xmit & recv */
+#define SPI_TMOD_TO 0x1 /* xmit only */
+#define SPI_TMOD_RO 0x2 /* recv only */
+#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
+
+#define SPI_SLVOE_OFFSET 10
+#define SPI_SRL_OFFSET 11
+#define SPI_CFS_OFFSET 12
+
+/* Bit fields in SR, 7 bits */
+#define SR_MASK 0x7f /* cover 7 bits */
+#define SR_BUSY (1 << 0)
+#define SR_TF_NOT_FULL (1 << 1)
+#define SR_TF_EMPT (1 << 2)
+#define SR_RF_NOT_EMPT (1 << 3)
+#define SR_RF_FULL (1 << 4)
+#define SR_TX_ERR (1 << 5)
+#define SR_DCOL (1 << 6)
+
+struct dw_spi_reg {
+ u32 ctrl0;
+ u32 ctrl1;
+ u32 ssienr;
+ u32 mwcr;
+ u32 ser;
+ u32 baudr;
+ u32 txfltr;
+ u32 rxfltr;
+ u32 txflr;
+ u32 rxflr;
+ u32 sr;
+ u32 imr;
+ u32 isr;
+ u32 risr;
+ u32 txoicr;
+ u32 rxoicr;
+ u32 rxuicr;
+ u32 msticr;
+ u32 icr;
+ u32 dmacr;
+ u32 dmatdlr;
+ u32 dmardlr;
+ u32 idr;
+ u32 version;
+
+ /* Currently operates as 32 bits, though only the low 16 bits matter */
+ u32 dr;
+} __packed;
+
+#define dw_readl(dw, name) __raw_readl(&(dw)->name)
+#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
+
+/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
+static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
+
+static u32 *pclk_spi0;
+/* Always contains an accessable address, start with 0 */
+static struct dw_spi_reg *pspi;
+
+static struct kmsg_dumper dw_dumper;
+static int dumper_registered;
+
+static void dw_kmsg_dump(struct kmsg_dumper *dumper,
+ enum kmsg_dump_reason reason,
+ const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ int i;
+
+ /* When run to this, we'd better re-init the HW */
+ mrst_early_console_init();
+
+ for (i = 0; i < l1; i++)
+ early_mrst_console.write(&early_mrst_console, s1 + i, 1);
+ for (i = 0; i < l2; i++)
+ early_mrst_console.write(&early_mrst_console, s2 + i, 1);
+}
+
+/* Set the ratio rate to 115200, 8n1, IRQ disabled */
+static void max3110_write_config(void)
+{
+ u16 config;
+
+ config = 0xc001;
+ dw_writel(pspi, dr, config);
+}
+
+/* Translate char to a eligible word and send to max3110 */
+static void max3110_write_data(char c)
+{
+ u16 data;
+
+ data = 0x8000 | c;
+ dw_writel(pspi, dr, data);
+}
+
+void mrst_early_console_init(void)
+{
+ u32 ctrlr0 = 0;
+ u32 spi0_cdiv;
+ u32 freq; /* Freqency info only need be searched once */
+
+ /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
+ pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ MRST_CLK_SPI0_REG);
+ spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
+ freq = 100000000 / (spi0_cdiv + 1);
+
+ if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
+ mrst_spi_paddr = MRST_REGBASE_SPI1;
+
+ pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ mrst_spi_paddr);
+
+ /* Disable SPI controller */
+ dw_writel(pspi, ssienr, 0);
+
+ /* Set control param, 8 bits, transmit only mode */
+ ctrlr0 = dw_readl(pspi, ctrl0);
+
+ ctrlr0 &= 0xfcc0;
+ ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
+ | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
+ dw_writel(pspi, ctrl0, ctrlr0);
+
+ /*
+ * Change the spi0 clk to comply with 115200 bps, use 100000 to
+ * calculate the clk dividor to make the clock a little slower
+ * than real baud rate.
+ */
+ dw_writel(pspi, baudr, freq/100000);
+
+ /* Disable all INT for early phase */
+ dw_writel(pspi, imr, 0x0);
+
+ /* Set the cs to spi-uart */
+ dw_writel(pspi, ser, 0x2);
+
+ /* Enable the HW, the last step for HW init */
+ dw_writel(pspi, ssienr, 0x1);
+
+ /* Set the default configuration */
+ max3110_write_config();
+
+ /* Register the kmsg dumper */
+ if (!dumper_registered) {
+ dw_dumper.dump = dw_kmsg_dump;
+ kmsg_dump_register(&dw_dumper);
+ dumper_registered = 1;
+ }
+}
+
+/* Slave select should be called in the read/write function */
+static void early_mrst_spi_putc(char c)
+{
+ unsigned int timeout;
+ u32 sr;
+
+ timeout = MRST_SPI_TIMEOUT;
+ /* Early putc needs to make sure the TX FIFO is not full */
+ while (--timeout) {
+ sr = dw_readl(pspi, sr);
+ if (!(sr & SR_TF_NOT_FULL))
+ cpu_relax();
+ else
+ break;
+ }
+
+ if (!timeout)
+ pr_warning("MRST earlycon: timed out\n");
+ else
+ max3110_write_data(c);
+}
+
+/* Early SPI only uses polling mode */
+static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_mrst_spi_putc('\r');
+ early_mrst_spi_putc(*str);
+ str++;
+ }
+}
+
+struct console early_mrst_console = {
+ .name = "earlymrst",
+ .write = early_mrst_spi_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/*
+ * Following is the early console based on Medfield HSU (High
+ * Speed UART) device.
+ */
+#define HSU_PORT2_PADDR 0xffa28180
+
+static void __iomem *phsu;
+
+void hsu_early_console_init(void)
+{
+ u8 lcr;
+
+ phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
+ HSU_PORT2_PADDR);
+
+ /* Disable FIFO */
+ writeb(0x0, phsu + UART_FCR);
+
+ /* Set to default 115200 bps, 8n1 */
+ lcr = readb(phsu + UART_LCR);
+ writeb((0x80 | lcr), phsu + UART_LCR);
+ writeb(0x18, phsu + UART_DLL);
+ writeb(lcr, phsu + UART_LCR);
+ writel(0x3600, phsu + UART_MUL*4);
+
+ writeb(0x8, phsu + UART_MCR);
+ writeb(0x7, phsu + UART_FCR);
+ writeb(0x3, phsu + UART_LCR);
+
+ /* Clear IRQ status */
+ readb(phsu + UART_LSR);
+ readb(phsu + UART_RX);
+ readb(phsu + UART_IIR);
+ readb(phsu + UART_MSR);
+
+ /* Enable FIFO */
+ writeb(0x7, phsu + UART_FCR);
+}
+
+#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
+
+static void early_hsu_putc(char ch)
+{
+ unsigned int timeout = 10000; /* 10ms */
+ u8 status;
+
+ while (--timeout) {
+ status = readb(phsu + UART_LSR);
+ if (status & BOTH_EMPTY)
+ break;
+ udelay(1);
+ }
+
+ /* Only write the char when there was no timeout */
+ if (timeout)
+ writeb(ch, phsu + UART_TX);
+}
+
+static void early_hsu_write(struct console *con, const char *str, unsigned n)
+{
+ int i;
+
+ for (i = 0; i < n && *str; i++) {
+ if (*str == '\n')
+ early_hsu_putc('\r');
+ early_hsu_putc(*str);
+ str++;
+ }
+}
+
+struct console early_hsu_console = {
+ .name = "earlyhsu",
+ .write = early_hsu_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
* as published by the Free Software Foundation; version 2
* of the License.
*/
+
+#define pr_fmt(fmt) "mrst: " fmt
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sfi.h>
+#include <linux/intel_pmic_gpio.h>
+#include <linux/spi/spi.h>
+#include <linux/i2c.h>
+#include <linux/i2c/pca953x.h>
+#include <linux/gpio_keys.h>
+#include <linux/input.h>
+#include <linux/platform_device.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <asm/mrst.h>
#include <asm/io.h>
#include <asm/i8259.h>
+#include <asm/intel_scu_ipc.h>
#include <asm/apb_timer.h>
+#include <asm/reboot.h>
/*
* the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
memcpy(sfi_mtimer_array, pentry, totallen);
}
- printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
+ pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
pentry = sfi_mtimer_array;
for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
- printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
+ pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
" irq = %d\n", totallen, (u32)pentry->phys_addr,
pentry->freq_hz, pentry->irq);
if (!pentry->irq)
memcpy(sfi_mrtc_array, pentry, totallen);
}
- printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
+ pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
pentry = sfi_mrtc_array;
for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
- printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
+ pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
totallen, (u32)pentry->phys_addr, pentry->irq);
mp_irq.type = MP_IOAPIC;
mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = 0;
+ mp_irq.irqflag = 0xf; /* level trigger and active low */
mp_irq.srcbus = 0;
mp_irq.srcbusirq = pentry->irq; /* IRQ */
mp_irq.dstapic = MP_APIC_ALL;
void __init mrst_time_init(void)
{
+ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
switch (mrst_timer_options) {
case MRST_TIMER_APBT_ONLY:
break;
return;
}
/* we need at least one APB timer */
- sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
pre_init_apic_IRQ0();
apbt_time_init();
}
-void __init mrst_rtc_init(void)
-{
- sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
-}
-
void __cpuinit mrst_arch_setup(void)
{
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
return 0;
}
+/* Reboot and power off are handled by the SCU on a MID device */
+static void mrst_power_off(void)
+{
+ intel_scu_ipc_simple_command(0xf1, 1);
+}
+
+static void mrst_reboot(void)
+{
+ intel_scu_ipc_simple_command(0xf1, 0);
+}
+
/*
* Moorestown specific x86_init function overrides and early setup
* calls.
legacy_pic = &null_legacy_pic;
+ /* Moorestown specific power_off/restart method */
+ pm_power_off = mrst_power_off;
+ machine_ops.emergency_restart = mrst_reboot;
+
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_smp_config = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
return 0;
}
__setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * Parsing GPIO table first, since the DEVS table will need this table
+ * to map the pin name to the actual pin.
+ */
+static struct sfi_gpio_table_entry *gpio_table;
+static int gpio_num_entry;
+
+static int __init sfi_parse_gpio(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_gpio_table_entry *pentry;
+ int num, i;
+
+ if (gpio_table)
+ return 0;
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
+ pentry = (struct sfi_gpio_table_entry *)sb->pentry;
+
+ gpio_table = (struct sfi_gpio_table_entry *)
+ kmalloc(num * sizeof(*pentry), GFP_KERNEL);
+ if (!gpio_table)
+ return -1;
+ memcpy(gpio_table, pentry, num * sizeof(*pentry));
+ gpio_num_entry = num;
+
+ pr_debug("GPIO pin info:\n");
+ for (i = 0; i < num; i++, pentry++)
+ pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
+ " pin = %d\n", i,
+ pentry->controller_name,
+ pentry->pin_name,
+ pentry->pin_no);
+ return 0;
+}
+
+static int get_gpio_by_name(const char *name)
+{
+ struct sfi_gpio_table_entry *pentry = gpio_table;
+ int i;
+
+ if (!pentry)
+ return -1;
+ for (i = 0; i < gpio_num_entry; i++, pentry++) {
+ if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
+ return pentry->pin_no;
+ }
+ return -1;
+}
+
+/*
+ * Here defines the array of devices platform data that IAFW would export
+ * through SFI "DEVS" table, we use name and type to match the device and
+ * its platform data.
+ */
+struct devs_id {
+ char name[SFI_NAME_LEN + 1];
+ u8 type;
+ u8 delay;
+ void *(*get_platform_data)(void *info);
+};
+
+/* the offset for the mapping of global gpio pin to irq */
+#define MRST_IRQ_OFFSET 0x100
+
+static void __init *pmic_gpio_platform_data(void *info)
+{
+ static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
+ int gpio_base = get_gpio_by_name("pmic_gpio_base");
+
+ if (gpio_base == -1)
+ gpio_base = 64;
+ pmic_gpio_pdata.gpio_base = gpio_base;
+ pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
+ pmic_gpio_pdata.gpiointr = 0xffffeff8;
+
+ return &pmic_gpio_pdata;
+}
+
+static void __init *max3111_platform_data(void *info)
+{
+ struct spi_board_info *spi_info = info;
+ int intr = get_gpio_by_name("max3111_int");
+
+ if (intr == -1)
+ return NULL;
+ spi_info->irq = intr + MRST_IRQ_OFFSET;
+ return NULL;
+}
+
+/* we have multiple max7315 on the board ... */
+#define MAX7315_NUM 2
+static void __init *max7315_platform_data(void *info)
+{
+ static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
+ static int nr;
+ struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
+ struct i2c_board_info *i2c_info = info;
+ int gpio_base, intr;
+ char base_pin_name[SFI_NAME_LEN + 1];
+ char intr_pin_name[SFI_NAME_LEN + 1];
+
+ if (nr == MAX7315_NUM) {
+ pr_err("too many max7315s, we only support %d\n",
+ MAX7315_NUM);
+ return NULL;
+ }
+ /* we have several max7315 on the board, we only need load several
+ * instances of the same pca953x driver to cover them
+ */
+ strcpy(i2c_info->type, "max7315");
+ if (nr++) {
+ sprintf(base_pin_name, "max7315_%d_base", nr);
+ sprintf(intr_pin_name, "max7315_%d_int", nr);
+ } else {
+ strcpy(base_pin_name, "max7315_base");
+ strcpy(intr_pin_name, "max7315_int");
+ }
+
+ gpio_base = get_gpio_by_name(base_pin_name);
+ intr = get_gpio_by_name(intr_pin_name);
+
+ if (gpio_base == -1)
+ return NULL;
+ max7315->gpio_base = gpio_base;
+ if (intr != -1) {
+ i2c_info->irq = intr + MRST_IRQ_OFFSET;
+ max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
+ } else {
+ i2c_info->irq = -1;
+ max7315->irq_base = -1;
+ }
+ return max7315;
+}
+
+static void __init *emc1403_platform_data(void *info)
+{
+ static short intr2nd_pdata;
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("thermal_int");
+ int intr2nd = get_gpio_by_name("thermal_alert");
+
+ if (intr == -1 || intr2nd == -1)
+ return NULL;
+
+ i2c_info->irq = intr + MRST_IRQ_OFFSET;
+ intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+
+ return &intr2nd_pdata;
+}
+
+static void __init *lis331dl_platform_data(void *info)
+{
+ static short intr2nd_pdata;
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("accel_int");
+ int intr2nd = get_gpio_by_name("accel_2");
+
+ if (intr == -1 || intr2nd == -1)
+ return NULL;
+
+ i2c_info->irq = intr + MRST_IRQ_OFFSET;
+ intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
+
+ return &intr2nd_pdata;
+}
+
+static void __init *no_platform_data(void *info)
+{
+ return NULL;
+}
+
+static const struct devs_id __initconst device_ids[] = {
+ {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
+ {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
+ {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+ {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+ {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
+ {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
+ {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+ {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+ {},
+};
+
+#define MAX_IPCDEVS 24
+static struct platform_device *ipc_devs[MAX_IPCDEVS];
+static int ipc_next_dev;
+
+#define MAX_SCU_SPI 24
+static struct spi_board_info *spi_devs[MAX_SCU_SPI];
+static int spi_next_dev;
+
+#define MAX_SCU_I2C 24
+static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
+static int i2c_bus[MAX_SCU_I2C];
+static int i2c_next_dev;
+
+static void __init intel_scu_device_register(struct platform_device *pdev)
+{
+ if(ipc_next_dev == MAX_IPCDEVS)
+ pr_err("too many SCU IPC devices");
+ else
+ ipc_devs[ipc_next_dev++] = pdev;
+}
+
+static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
+{
+ struct spi_board_info *new_dev;
+
+ if (spi_next_dev == MAX_SCU_SPI) {
+ pr_err("too many SCU SPI devices");
+ return;
+ }
+
+ new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
+ if (!new_dev) {
+ pr_err("failed to alloc mem for delayed spi dev %s\n",
+ sdev->modalias);
+ return;
+ }
+ memcpy(new_dev, sdev, sizeof(*sdev));
+
+ spi_devs[spi_next_dev++] = new_dev;
+}
+
+static void __init intel_scu_i2c_device_register(int bus,
+ struct i2c_board_info *idev)
+{
+ struct i2c_board_info *new_dev;
+
+ if (i2c_next_dev == MAX_SCU_I2C) {
+ pr_err("too many SCU I2C devices");
+ return;
+ }
+
+ new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
+ if (!new_dev) {
+ pr_err("failed to alloc mem for delayed i2c dev %s\n",
+ idev->type);
+ return;
+ }
+ memcpy(new_dev, idev, sizeof(*idev));
+
+ i2c_bus[i2c_next_dev] = bus;
+ i2c_devs[i2c_next_dev++] = new_dev;
+}
+
+/* Called by IPC driver */
+void intel_scu_devices_create(void)
+{
+ int i;
+
+ for (i = 0; i < ipc_next_dev; i++)
+ platform_device_add(ipc_devs[i]);
+
+ for (i = 0; i < spi_next_dev; i++)
+ spi_register_board_info(spi_devs[i], 1);
+
+ for (i = 0; i < i2c_next_dev; i++) {
+ struct i2c_adapter *adapter;
+ struct i2c_client *client;
+
+ adapter = i2c_get_adapter(i2c_bus[i]);
+ if (adapter) {
+ client = i2c_new_device(adapter, i2c_devs[i]);
+ if (!client)
+ pr_err("can't create i2c device %s\n",
+ i2c_devs[i]->type);
+ } else
+ i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
+ }
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_create);
+
+/* Called by IPC driver */
+void intel_scu_devices_destroy(void)
+{
+ int i;
+
+ for (i = 0; i < ipc_next_dev; i++)
+ platform_device_del(ipc_devs[i]);
+}
+EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
+
+static void __init install_irq_resource(struct platform_device *pdev, int irq)
+{
+ /* Single threaded */
+ static struct resource __initdata res = {
+ .name = "IRQ",
+ .flags = IORESOURCE_IRQ,
+ };
+ res.start = irq;
+ platform_device_add_resources(pdev, &res, 1);
+}
+
+static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
+{
+ const struct devs_id *dev = device_ids;
+ void *pdata = NULL;
+
+ while (dev->name[0]) {
+ if (dev->type == SFI_DEV_TYPE_IPC &&
+ !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) {
+ pdata = dev->get_platform_data(pdev);
+ break;
+ }
+ dev++;
+ }
+ pdev->dev.platform_data = pdata;
+ intel_scu_device_register(pdev);
+}
+
+static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
+{
+ const struct devs_id *dev = device_ids;
+ void *pdata = NULL;
+
+ while (dev->name[0]) {
+ if (dev->type == SFI_DEV_TYPE_SPI &&
+ !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
+ pdata = dev->get_platform_data(spi_info);
+ break;
+ }
+ dev++;
+ }
+ spi_info->platform_data = pdata;
+ if (dev->delay)
+ intel_scu_spi_device_register(spi_info);
+ else
+ spi_register_board_info(spi_info, 1);
+}
+
+static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
+{
+ const struct devs_id *dev = device_ids;
+ void *pdata = NULL;
+
+ while (dev->name[0]) {
+ if (dev->type == SFI_DEV_TYPE_I2C &&
+ !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
+ pdata = dev->get_platform_data(i2c_info);
+ break;
+ }
+ dev++;
+ }
+ i2c_info->platform_data = pdata;
+
+ if (dev->delay)
+ intel_scu_i2c_device_register(bus, i2c_info);
+ else
+ i2c_register_board_info(bus, i2c_info, 1);
+ }
+
+
+static int __init sfi_parse_devs(struct sfi_table_header *table)
+{
+ struct sfi_table_simple *sb;
+ struct sfi_device_table_entry *pentry;
+ struct spi_board_info spi_info;
+ struct i2c_board_info i2c_info;
+ struct platform_device *pdev;
+ int num, i, bus;
+ int ioapic;
+ struct io_apic_irq_attr irq_attr;
+
+ sb = (struct sfi_table_simple *)table;
+ num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
+ pentry = (struct sfi_device_table_entry *)sb->pentry;
+
+ for (i = 0; i < num; i++, pentry++) {
+ if (pentry->irq != (u8)0xff) { /* native RTE case */
+ /* these SPI2 devices are not exposed to system as PCI
+ * devices, but they have separate RTE entry in IOAPIC
+ * so we have to enable them one by one here
+ */
+ ioapic = mp_find_ioapic(pentry->irq);
+ irq_attr.ioapic = ioapic;
+ irq_attr.ioapic_pin = pentry->irq;
+ irq_attr.trigger = 1;
+ irq_attr.polarity = 1;
+ io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
+ }
+ switch (pentry->type) {
+ case SFI_DEV_TYPE_IPC:
+ /* ID as IRQ is a hack that will go away */
+ pdev = platform_device_alloc(pentry->name, pentry->irq);
+ if (pdev == NULL) {
+ pr_err("out of memory for SFI platform device '%s'.\n",
+ pentry->name);
+ continue;
+ }
+ install_irq_resource(pdev, pentry->irq);
+ pr_debug("info[%2d]: IPC bus, name = %16.16s, "
+ "irq = 0x%2x\n", i, pentry->name, pentry->irq);
+ sfi_handle_ipc_dev(pdev);
+ break;
+ case SFI_DEV_TYPE_SPI:
+ memset(&spi_info, 0, sizeof(spi_info));
+ strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
+ spi_info.irq = pentry->irq;
+ spi_info.bus_num = pentry->host_num;
+ spi_info.chip_select = pentry->addr;
+ spi_info.max_speed_hz = pentry->max_freq;
+ pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
+ "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
+ spi_info.bus_num,
+ spi_info.modalias,
+ spi_info.irq,
+ spi_info.max_speed_hz,
+ spi_info.chip_select);
+ sfi_handle_spi_dev(&spi_info);
+ break;
+ case SFI_DEV_TYPE_I2C:
+ memset(&i2c_info, 0, sizeof(i2c_info));
+ bus = pentry->host_num;
+ strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
+ i2c_info.irq = pentry->irq;
+ i2c_info.addr = pentry->addr;
+ pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
+ "irq = 0x%2x, addr = 0x%x\n", i, bus,
+ i2c_info.type,
+ i2c_info.irq,
+ i2c_info.addr);
+ sfi_handle_i2c_dev(bus, &i2c_info);
+ break;
+ case SFI_DEV_TYPE_UART:
+ case SFI_DEV_TYPE_HSI:
+ default:
+ ;
+ }
+ }
+ return 0;
+}
+
+static int __init mrst_platform_init(void)
+{
+ sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
+ sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
+ return 0;
+}
+arch_initcall(mrst_platform_init);
+
+/*
+ * we will search these buttons in SFI GPIO table (by name)
+ * and register them dynamically. Please add all possible
+ * buttons here, we will shrink them if no GPIO found.
+ */
+static struct gpio_keys_button gpio_button[] = {
+ {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
+ {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
+ {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
+ {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
+ {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
+ {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
+ {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
+ {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
+ {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
+ {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
+};
+
+static struct gpio_keys_platform_data mrst_gpio_keys = {
+ .buttons = gpio_button,
+ .rep = 1,
+ .nbuttons = -1, /* will fill it after search */
+};
+
+static struct platform_device pb_device = {
+ .name = "gpio-keys",
+ .id = -1,
+ .dev = {
+ .platform_data = &mrst_gpio_keys,
+ },
+};
+
+/*
+ * Shrink the non-existent buttons, register the gpio button
+ * device if there is some
+ */
+static int __init pb_keys_init(void)
+{
+ struct gpio_keys_button *gb = gpio_button;
+ int i, num, good = 0;
+
+ num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
+ for (i = 0; i < num; i++) {
+ gb[i].gpio = get_gpio_by_name(gb[i].desc);
+ if (gb[i].gpio == -1)
+ continue;
+
+ if (i != good)
+ gb[good] = gb[i];
+ good++;
+ }
+
+ if (good) {
+ mrst_gpio_keys.nbuttons = good;
+ return platform_device_register(&pb_device);
+ }
+ return 0;
+}
+late_initcall(pb_keys_init);
--- /dev/null
+/*
+ * vrtc.c: Driver for virtual RTC device on Intel MID platform
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * VRTC is emulated by system controller firmware, the real HW
+ * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
+ * in a memory mapped IO space that is visible to the host IA
+ * processor.
+ *
+ * This driver is based on RTC CMOS driver.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+#include <linux/platform_device.h>
+
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+#include <asm/time.h>
+#include <asm/fixmap.h>
+
+static unsigned char __iomem *vrtc_virt_base;
+
+unsigned char vrtc_cmos_read(unsigned char reg)
+{
+ unsigned char retval;
+
+ /* vRTC's registers range from 0x0 to 0xD */
+ if (reg > 0xd || !vrtc_virt_base)
+ return 0xff;
+
+ lock_cmos_prefix(reg);
+ retval = __raw_readb(vrtc_virt_base + (reg << 2));
+ lock_cmos_suffix(reg);
+ return retval;
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_read);
+
+void vrtc_cmos_write(unsigned char val, unsigned char reg)
+{
+ if (reg > 0xd || !vrtc_virt_base)
+ return;
+
+ lock_cmos_prefix(reg);
+ __raw_writeb(val, vrtc_virt_base + (reg << 2));
+ lock_cmos_suffix(reg);
+}
+EXPORT_SYMBOL_GPL(vrtc_cmos_write);
+
+unsigned long vrtc_get_time(void)
+{
+ u8 sec, min, hour, mday, mon;
+ u32 year;
+
+ while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
+ cpu_relax();
+
+ sec = vrtc_cmos_read(RTC_SECONDS);
+ min = vrtc_cmos_read(RTC_MINUTES);
+ hour = vrtc_cmos_read(RTC_HOURS);
+ mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+ mon = vrtc_cmos_read(RTC_MONTH);
+ year = vrtc_cmos_read(RTC_YEAR);
+
+ /* vRTC YEAR reg contains the offset to 1960 */
+ year += 1960;
+
+ printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
+ "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
+
+ return mktime(year, mon, mday, hour, min, sec);
+}
+
+/* Only care about the minutes and seconds */
+int vrtc_set_mmss(unsigned long nowtime)
+{
+ int real_sec, real_min;
+ int vrtc_min;
+
+ vrtc_min = vrtc_cmos_read(RTC_MINUTES);
+
+ real_sec = nowtime % 60;
+ real_min = nowtime / 60;
+ if (((abs(real_min - vrtc_min) + 15)/30) & 1)
+ real_min += 30;
+ real_min %= 60;
+
+ vrtc_cmos_write(real_sec, RTC_SECONDS);
+ vrtc_cmos_write(real_min, RTC_MINUTES);
+ return 0;
+}
+
+void __init mrst_rtc_init(void)
+{
+ unsigned long rtc_paddr;
+ void __iomem *virt_base;
+
+ sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+ if (!sfi_mrtc_num)
+ return;
+
+ rtc_paddr = sfi_mrtc_array[0].phys_addr;
+
+ /* vRTC's register address may not be page aligned */
+ set_fixmap_nocache(FIX_LNW_VRTC, rtc_paddr);
+
+ virt_base = (void __iomem *)__fix_to_virt(FIX_LNW_VRTC);
+ virt_base += rtc_paddr & ~PAGE_MASK;
+ vrtc_virt_base = virt_base;
+
+ x86_platform.get_wallclock = vrtc_get_time;
+ x86_platform.set_wallclock = vrtc_set_mmss;
+}
+
+/*
+ * The Moorestown platform has a memory mapped virtual RTC device that emulates
+ * the programming interface of the RTC.
+ */
+
+static struct resource vrtc_resources[] = {
+ [0] = {
+ .flags = IORESOURCE_MEM,
+ },
+ [1] = {
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct platform_device vrtc_device = {
+ .name = "rtc_mrst",
+ .id = -1,
+ .resource = vrtc_resources,
+ .num_resources = ARRAY_SIZE(vrtc_resources),
+};
+
+/* Register the RTC device if appropriate */
+static int __init mrst_device_create(void)
+{
+ /* No Moorestown, no device */
+ if (!mrst_identify_cpu())
+ return -ENODEV;
+ /* No timer, no device */
+ if (!sfi_mrtc_num)
+ return -ENODEV;
+
+ /* iomem resource */
+ vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
+ vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
+ MRST_VRTC_MAP_SZ;
+ /* irq resource */
+ vrtc_resources[1].start = sfi_mrtc_array[0].irq;
+ vrtc_resources[1].end = sfi_mrtc_array[0].irq;
+
+ return platform_device_register(&vrtc_device);
+}
+
+module_init(mrst_device_create);
/* All CPUs enumerated by SFI must be present and enabled */
static void __cpuinit mp_sfi_register_lapic(u8 id)
{
- if (MAX_APICS - id <= 0) {
+ if (MAX_LOCAL_APIC - id <= 0) {
pr_warning("Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
+ id, MAX_LOCAL_APIC);
return;
}
/*
* each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
+ * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
*/
bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
* UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
/*
* initialize the bau_control structure for each cpu
*/
-static void __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs)
{
int i;
int cpu;
struct bau_control *smaster = NULL;
struct socket_desc {
short num_cpus;
- short cpu_number[16];
+ short cpu_number[MAX_CPUS_PER_SOCKET];
};
struct uvhub_desc {
unsigned short socket_mask;
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
+ if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
+ printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+ return 1;
+ }
}
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
bcp->uvhub_master = hmaster;
bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
blade_processor_id;
+ if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+ printk(KERN_EMERG
+ "%d cpus per uvhub invalid\n",
+ bcp->uvhub_cpu);
+ return 1;
+ }
}
nextsocket:
socket++;
bcp->congested_reps = congested_reps;
bcp->congested_period = congested_period;
}
+ return 0;
}
/*
spin_lock_init(&disable_lock);
congested_cycles = microsec_2_cycles(congested_response_us);
- uv_init_per_cpu(nuvhubs);
+ if (uv_init_per_cpu(nuvhubs)) {
+ nobau = 1;
+ return 0;
+ }
uv_partition_base_pnode = 0x7fffffff;
for (uvhub = 0; uvhub < nuvhubs; uvhub++)
ver = m->apicver;
if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->apicid, MAX_APICS);
+ m->apicid, MAX_LOCAL_APIC);
return;
}
* as possible (without an NMI being received in the middle of
* this) - so disable NMIs and initialize the device:
*/
- acpi_nmi_disable();
status = acpi_ns_evaluate(info);
- acpi_nmi_enable();
if (ACPI_SUCCESS(status)) {
walk_info->num_INI++;
int __init acpi_numa_init(void)
{
int ret = 0;
+ int nr_cpu_entries = nr_cpu_ids;
+
+#ifdef CONFIG_X86
+ /*
+ * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
+ * SRAT cpu entries could have different order with that in MADT.
+ * So go over all cpu entries in SRAT to get apicid to node mapping.
+ */
+ nr_cpu_entries = MAX_LOCAL_APIC;
+#endif
/* SRAT: Static Resource Affinity Table */
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
- acpi_parse_x2apic_affinity, nr_cpu_ids);
+ acpi_parse_x2apic_affinity, nr_cpu_entries);
acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
- acpi_parse_processor_affinity, nr_cpu_ids);
+ acpi_parse_processor_affinity, nr_cpu_entries);
ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity,
NR_NODE_MEMBLKS);
atm_dev_put(dev);
return -EMEDIUMTYPE;
}
- if (PRIV(dev)->vcc) return -EBUSY;
+ if (PRIV(dev)->vcc) {
+ atm_dev_put(dev);
+ return -EBUSY;
+ }
}
else {
int error;
static void amd64_tlbflush(struct agp_memory *temp)
{
- k8_flush_garts();
+ amd_flush_garts();
}
static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type)
u32 temp;
struct aper_size_info_32 *values;
- dev = k8_northbridges.nb_misc[0];
+ dev = node_to_amd_nb(0)->misc;
if (dev==NULL)
return 0;
unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
int i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
/* Configure AGP regs in each x86-64 host bridge. */
- for (i = 0; i < k8_northbridges.num; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
agp_bridge->gart_bus_addr =
- amd64_configure(k8_northbridges.nb_misc[i],
- gatt_bus);
+ amd64_configure(node_to_amd_nb(i)->misc, gatt_bus);
}
- k8_flush_garts();
+ amd_flush_garts();
return 0;
}
u32 tmp;
int i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
/* disable gart translation */
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
tmp &= ~GARTEN;
{
int i;
- if (cache_k8_northbridges() < 0)
+ if (amd_cache_northbridges() < 0)
return -ENODEV;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return -ENODEV;
i = 0;
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
dev_err(&dev->dev, "no usable aperture found\n");
#ifdef __x86_64__
}
/* shadow x86-64 registers into ULi registers */
- pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+ pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
&httfea);
/* if x86-64 aperture base is beyond 4G, exit here */
pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
/* shadow x86-64 registers into NVIDIA registers */
- pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
+ pci_read_config_dword (node_to_amd_nb(0)->misc, AMD64_GARTAPERTUREBASE,
&apbase);
/* if x86-64 aperture base is beyond 4G, exit here */
}
/* First check that we have at least one AMD64 NB */
- if (!pci_dev_present(k8_nb_ids))
+ if (!pci_dev_present(amd_nb_misc_ids))
return -ENODEV;
/* Look for any AGP bridge */
dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
(unsigned long)freqs->cpu);
trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
+ trace_cpu_frequency(freqs->new, freqs->cpu);
srcu_notifier_call_chain(&cpufreq_transition_notifier_list,
CPUFREQ_POSTCHANGE, freqs);
if (likely(policy) && likely(policy->cpu == freqs->cpu))
if (cpuidle_curr_governor->reflect)
cpuidle_curr_governor->reflect(dev);
trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
/**
static void mv_xor_tasklet(unsigned long data)
{
struct mv_xor_chan *chan = (struct mv_xor_chan *) data;
- __mv_xor_slot_cleanup(chan);
+ mv_xor_slot_cleanup(chan);
}
static struct mv_xor_desc_slot *
opstate_init();
- if (cache_k8_northbridges() < 0)
+ if (amd_cache_northbridges() < 0)
goto err_ret;
msrs = msrs_alloc();
* to finish initialization of the MC instances.
*/
err = -ENODEV;
- for (nb = 0; nb < k8_northbridges.num; nb++) {
+ for (nb = 0; nb < amd_nb_num(); nb++) {
if (!pvt_lookup[nb])
continue;
static enum drm_connector_status ch7017_detect(struct intel_dvo_device *dvo)
{
- return connector_status_unknown;
+ return connector_status_connected;
}
static enum drm_mode_status ch7017_mode_valid(struct intel_dvo_device *dvo,
#include "i915_drm.h"
#include "i915_drv.h"
#include "i915_trace.h"
+#include "../../../platform/x86/intel_ips.h"
#include <linux/pci.h>
#include <linux/vgaarb.h>
#include <linux/acpi.h>
}
EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable);
+/**
+ * Tells the intel_ips driver that the i915 driver is now loaded, if
+ * IPS got loaded first.
+ *
+ * This awkward dance is so that neither module has to depend on the
+ * other in order for IPS to do the appropriate communication of
+ * GPU turbo limits to i915.
+ */
+static void
+ips_ping_for_i915_load(void)
+{
+ void (*link)(void);
+
+ link = symbol_get(ips_link_to_i915_driver);
+ if (link) {
+ link();
+ symbol_put(ips_link_to_i915_driver);
+ }
+}
+
/**
* i915_driver_load - setup chip and create an initial config
* @dev: DRM device
dev_priv->mchdev_lock = &mchdev_lock;
spin_unlock(&mchdev_lock);
+ ips_ping_for_i915_load();
+
return 0;
out_workqueue_free:
# define MARIUNIT_CLOCK_GATE_DISABLE (1 << 18)
# define SVSMUNIT_CLOCK_GATE_DISABLE (1 << 1)
+#define PCH_3DCGDIS1 0x46024
+# define VFMUNIT_CLOCK_GATE_DISABLE (1 << 11)
+
#define FDI_PLL_FREQ_CTL 0x46030
#define FDI_PLL_FREQ_CHANGE_REQUEST (1<<24)
#define FDI_PLL_FREQ_LOCK_LIMIT_MASK 0xfff00
#define ILK_DISPLAY_CHICKEN2 0x42004
#define ILK_DPARB_GATE (1<<22)
#define ILK_VSDPFD_FULL (1<<21)
+#define ILK_DISPLAY_CHICKEN_FUSES 0x42014
+#define ILK_INTERNAL_GRAPHICS_DISABLE (1<<31)
+#define ILK_INTERNAL_DISPLAY_DISABLE (1<<30)
+#define ILK_DISPLAY_DEBUG_DISABLE (1<<29)
+#define ILK_HDCP_DISABLE (1<<25)
+#define ILK_eDP_A_DISABLE (1<<24)
+#define ILK_DESKTOP (1<<23)
#define ILK_DSPCLK_GATE 0x42020
#define ILK_DPARB_CLK_GATE (1<<5)
/* According to spec this bit 7/8/9 of 0x42020 should be set to enable FBC */
general->ssc_freq ? 66 : 48;
else if (IS_GEN5(dev) || IS_GEN6(dev))
dev_priv->lvds_ssc_freq =
- general->ssc_freq ? 120 : 100;
+ general->ssc_freq ? 100 : 120;
else
dev_priv->lvds_ssc_freq =
general->ssc_freq ? 100 : 96;
return index_mask;
}
+static bool has_edp_a(struct drm_device *dev)
+{
+ struct drm_i915_private *dev_priv = dev->dev_private;
+
+ if (!IS_MOBILE(dev))
+ return false;
+
+ if ((I915_READ(DP_A) & DP_DETECTED) == 0)
+ return false;
+
+ if (IS_GEN5(dev) &&
+ (I915_READ(ILK_DISPLAY_CHICKEN_FUSES) & ILK_eDP_A_DISABLE))
+ return false;
+
+ return true;
+}
+
static void intel_setup_outputs(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
if (HAS_PCH_SPLIT(dev)) {
dpd_is_edp = intel_dpd_is_edp(dev);
- if (IS_MOBILE(dev) && (I915_READ(DP_A) & DP_DETECTED))
+ if (has_edp_a(dev))
intel_dp_init(dev, DP_A);
if (dpd_is_edp && (I915_READ(PCH_DP_D) & DP_DETECTED))
I915_WRITE(PCH_3DCGDIS0,
MARIUNIT_CLOCK_GATE_DISABLE |
SVSMUNIT_CLOCK_GATE_DISABLE);
+ I915_WRITE(PCH_3DCGDIS1,
+ VFMUNIT_CLOCK_GATE_DISABLE);
}
I915_WRITE(PCH_DSPCLK_GATE_D, dspclk_gate);
SDVO_COLORIMETRY_RGB256);
connector->connector_type = DRM_MODE_CONNECTOR_HDMIA;
- intel_sdvo_add_hdmi_properties(intel_sdvo_connector);
intel_sdvo->is_hdmi = true;
}
intel_sdvo->base.clone_mask = ((1 << INTEL_SDVO_NON_TV_CLONE_BIT) |
(1 << INTEL_ANALOG_CLONE_BIT));
intel_sdvo_connector_init(intel_sdvo_connector, intel_sdvo);
+ if (intel_sdvo->is_hdmi)
+ intel_sdvo_add_hdmi_properties(intel_sdvo_connector);
return true;
}
attr->index = channel;
attr->dev_attr.attr.name = attrs->in_name;
attr->dev_attr.attr.mode = S_IRUGO;
- attr->dev_attr.attr.owner = THIS_MODULE;
attr->dev_attr.show = s3c_hwmon_ch_show;
ret = device_create_file(dev, &attr->dev_attr);
attr->index = channel;
attr->dev_attr.attr.name = attrs->label_name;
attr->dev_attr.attr.mode = S_IRUGO;
- attr->dev_attr.attr.owner = THIS_MODULE;
attr->dev_attr.show = s3c_hwmon_label_show;
ret = device_create_file(dev, &attr->dev_attr);
kt_before = ktime_get_real();
stop_critical_timings();
-#ifndef MODULE
trace_power_start(POWER_CSTATE, (eax >> 4) + 1, cpu);
-#endif
+ trace_cpu_idle((eax >> 4) + 1, cpu);
if (!need_resched()) {
__monitor((void *)¤t_thread_info()->flags, 0, 0);
if (b3skb == NULL) {
dev_err(cs->dev, "%s: out of memory\n", __func__);
send_conf(iif, ap, skb, CAPI_MSGOSRESOURCEERR);
+ kfree(b3cmsg);
return;
}
capi_cmsg2message(b3cmsg,
v4l2_ctrl_new_std(&state->hdl, &cx25840_ctrl_ops,
V4L2_CID_HUE, -128, 127, 1, 0);
if (!is_cx2583x(state)) {
- default_volume = 228 - cx25840_read(client, 0x8d4);
- default_volume = ((default_volume / 2) + 23) << 9;
+ default_volume = cx25840_read(client, 0x8d4);
+ /*
+ * Enforce the legacy PVR-350/MSP3400 to PVR-150/CX25843 volume
+ * scale mapping limits to avoid -ERANGE errors when
+ * initializing the volume control
+ */
+ if (default_volume > 228) {
+ /* Bottom out at -96 dB, v4l2 vol range 0x2e00-0x2fff */
+ default_volume = 228;
+ cx25840_write(client, 0x8d4, 228);
+ }
+ else if (default_volume < 20) {
+ /* Top out at + 8 dB, v4l2 vol range 0xfe00-0xffff */
+ default_volume = 20;
+ cx25840_write(client, 0x8d4, 20);
+ }
+ default_volume = (((228 - default_volume) >> 1) + 23) << 9;
state->volume = v4l2_ctrl_new_std(&state->hdl,
&cx25840_audio_ctrl_ops, V4L2_CID_AUDIO_VOLUME,
#include <sound/control.h>
#include <sound/initval.h>
#include <sound/tlv.h>
-#include <media/wm8775.h>
#include "cx88.h"
#include "cx88-reg.h"
int left, right, v, b;
int changed = 0;
u32 old;
- struct v4l2_control client_ctl;
-
- /* Pass volume & balance onto any WM8775 */
- if (value->value.integer.value[0] >= value->value.integer.value[1]) {
- v = value->value.integer.value[0] << 10;
- b = value->value.integer.value[0] ?
- (0x8000 * value->value.integer.value[1]) / value->value.integer.value[0] :
- 0x8000;
- } else {
- v = value->value.integer.value[1] << 10;
- b = value->value.integer.value[1] ?
- 0xffff - (0x8000 * value->value.integer.value[0]) / value->value.integer.value[1] :
- 0x8000;
- }
- client_ctl.value = v;
- client_ctl.id = V4L2_CID_AUDIO_VOLUME;
- call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
- client_ctl.value = b;
- client_ctl.id = V4L2_CID_AUDIO_BALANCE;
- call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
left = value->value.integer.value[0] & 0x3f;
right = value->value.integer.value[1] & 0x3f;
b = right - left;
if (b < 0) {
- v = 0x3f - left;
- b = (-b) | 0x40;
+ v = 0x3f - left;
+ b = (-b) | 0x40;
} else {
- v = 0x3f - right;
+ v = 0x3f - right;
}
/* Do we really know this will always be called with IRQs on? */
spin_lock_irq(&chip->reg_lock);
old = cx_read(AUD_VOL_CTL);
if (v != (old & 0x3f)) {
- cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, (old & ~0x3f) | v);
- changed = 1;
+ cx_write(AUD_VOL_CTL, (old & ~0x3f) | v);
+ changed = 1;
}
- if ((cx_read(AUD_BAL_CTL) & 0x7f) != b) {
- cx_write(AUD_BAL_CTL, b);
- changed = 1;
+ if (cx_read(AUD_BAL_CTL) != b) {
+ cx_write(AUD_BAL_CTL, b);
+ changed = 1;
}
spin_unlock_irq(&chip->reg_lock);
.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
.access = SNDRV_CTL_ELEM_ACCESS_READWRITE |
SNDRV_CTL_ELEM_ACCESS_TLV_READ,
- .name = "Analog-TV Volume",
+ .name = "Playback Volume",
.info = snd_cx88_volume_info,
.get = snd_cx88_volume_get,
.put = snd_cx88_volume_put,
vol = cx_read(AUD_VOL_CTL);
if (value->value.integer.value[0] != !(vol & bit)) {
vol ^= bit;
- cx_swrite(SHADOW_AUD_VOL_CTL, AUD_VOL_CTL, vol);
- /* Pass mute onto any WM8775 */
- if ((1<<6) == bit) {
- struct v4l2_control client_ctl;
- client_ctl.value = 0 != (vol & bit);
- client_ctl.id = V4L2_CID_AUDIO_MUTE;
- call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
- }
+ cx_write(AUD_VOL_CTL, vol);
ret = 1;
}
spin_unlock_irq(&chip->reg_lock);
static const struct snd_kcontrol_new snd_cx88_dac_switch = {
.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
- .name = "Audio-Out Switch",
+ .name = "Playback Switch",
.info = snd_ctl_boolean_mono_info,
.get = snd_cx88_switch_get,
.put = snd_cx88_switch_put,
static const struct snd_kcontrol_new snd_cx88_source_switch = {
.iface = SNDRV_CTL_ELEM_IFACE_MIXER,
- .name = "Analog-TV Switch",
+ .name = "Capture Switch",
.info = snd_ctl_boolean_mono_info,
.get = snd_cx88_switch_get,
.put = snd_cx88_switch_put,
.private_value = (1<<6),
};
-static int snd_cx88_alc_get(struct snd_kcontrol *kcontrol,
- struct snd_ctl_elem_value *value)
-{
- snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
- struct cx88_core *core = chip->core;
- struct v4l2_control client_ctl;
-
- client_ctl.id = V4L2_CID_AUDIO_LOUDNESS;
- call_hw(core, WM8775_GID, core, g_ctrl, &client_ctl);
- value->value.integer.value[0] = client_ctl.value ? 1 : 0;
-
- return 0;
-}
-
-static int snd_cx88_alc_put(struct snd_kcontrol *kcontrol,
- struct snd_ctl_elem_value *value)
-{
- snd_cx88_card_t *chip = snd_kcontrol_chip(kcontrol);
- struct cx88_core *core = chip->core;
- struct v4l2_control client_ctl;
-
- client_ctl.value = 0 != value->value.integer.value[0];
- client_ctl.id = V4L2_CID_AUDIO_LOUDNESS;
- call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
- return 0;
-}
-
-static struct snd_kcontrol_new snd_cx88_alc_switch = {
- .iface = SNDRV_CTL_ELEM_IFACE_MIXER,
- .name = "Line-In ALC Switch",
- .info = snd_ctl_boolean_mono_info,
- .get = snd_cx88_alc_get,
- .put = snd_cx88_alc_put,
-};
-
/****************************************************************************
Basic Flow for Sound Devices
****************************************************************************/
{
struct snd_card *card;
snd_cx88_card_t *chip;
- struct v4l2_subdev *sd;
int err;
if (devno >= SNDRV_CARDS)
if (err < 0)
goto error;
- /* If there's a wm8775 then add a Line-In ALC switch */
- list_for_each_entry(sd, &chip->core->v4l2_dev.subdevs, list) {
- if (WM8775_GID == sd->grp_id) {
- snd_ctl_add(card, snd_ctl_new1(&snd_cx88_alc_switch,
- chip));
- break;
- }
- }
-
strcpy (card->driver, "CX88x");
sprintf(card->shortname, "Conexant CX%x", pci->device);
sprintf(card->longname, "%s at %#llx",
.radio_type = UNSET,
.tuner_addr = ADDR_UNSET,
.radio_addr = ADDR_UNSET,
+ .audio_chip = V4L2_IDENT_WM8775,
.input = {{
.type = CX88_VMUX_DVB,
.vmux = 0,
+ /* 2: Line-In */
+ .audioroute = 2,
},{
.type = CX88_VMUX_COMPOSITE1,
.vmux = 1,
+ /* 2: Line-In */
+ .audioroute = 2,
},{
.type = CX88_VMUX_SVIDEO,
.vmux = 2,
+ /* 2: Line-In */
+ .audioroute = 2,
}},
.mpeg = CX88_MPEG_DVB,
},
#include "cx88.h"
#include <media/v4l2-common.h>
#include <media/v4l2-ioctl.h>
-#include <media/wm8775.h>
MODULE_DESCRIPTION("v4l2 driver module for cx2388x based TV cards");
MODULE_AUTHOR("Gerd Knorr <kraxel@bytesex.org> [SuSE Labs]");
const struct cx88_ctrl *c = NULL;
u32 value,mask;
int i;
- struct v4l2_control client_ctl;
for (i = 0; i < CX8800_CTLS; i++) {
if (cx8800_ctls[i].v.id == ctl->id) {
ctl->value = c->v.minimum;
if (ctl->value > c->v.maximum)
ctl->value = c->v.maximum;
-
- /* Pass changes onto any WM8775 */
- client_ctl.id = ctl->id;
- switch (ctl->id) {
- case V4L2_CID_AUDIO_MUTE:
- client_ctl.value = ctl->value;
- break;
- case V4L2_CID_AUDIO_VOLUME:
- client_ctl.value = (ctl->value) ?
- (0x90 + ctl->value) << 8 : 0;
- break;
- case V4L2_CID_AUDIO_BALANCE:
- client_ctl.value = ctl->value << 9;
- break;
- default:
- client_ctl.id = 0;
- break;
- }
- if (client_ctl.id)
- call_hw(core, WM8775_GID, core, s_ctrl, &client_ctl);
-
mask=c->mask;
switch (ctl->id) {
case V4L2_CID_AUDIO_BALANCE:
if (c->id < V4L2_CID_BASE ||
c->id >= V4L2_CID_LASTP1)
return -EINVAL;
- if (c->id == V4L2_CID_AUDIO_MUTE ||
- c->id == V4L2_CID_AUDIO_VOLUME ||
- c->id == V4L2_CID_AUDIO_BALANCE) {
+ if (c->id == V4L2_CID_AUDIO_MUTE) {
for (i = 0; i < CX8800_CTLS; i++) {
if (cx8800_ctls[i].v.id == c->id)
break;
return container_of(v4l2_dev, struct cx88_core, v4l2_dev);
}
-#define call_hw(core, grpid, o, f, args...) \
+#define call_all(core, o, f, args...) \
do { \
if (!core->i2c_rc) { \
if (core->gate_ctrl) \
core->gate_ctrl(core, 1); \
- v4l2_device_call_all(&core->v4l2_dev, grpid, o, f, ##args); \
+ v4l2_device_call_all(&core->v4l2_dev, 0, o, f, ##args); \
if (core->gate_ctrl) \
core->gate_ctrl(core, 0); \
} \
} while (0)
-#define call_all(core, o, f, args...) call_hw(core, 0, o, f, ##args)
-
struct cx8800_dev;
struct cx8802_dev;
.owner = THIS_MODULE,
.open = em28xx_v4l2_open,
.release = em28xx_v4l2_close,
- .ioctl = video_ioctl2,
+ .unlocked_ioctl = video_ioctl2,
};
static const struct v4l2_ioctl_ops radio_ioctl_ops = {
#include <media/v4l2-device.h>
#include <media/v4l2-chip-ident.h>
#include <media/v4l2-ctrls.h>
-#include <media/wm8775.h>
MODULE_DESCRIPTION("wm8775 driver");
MODULE_AUTHOR("Ulf Eklund, Hans Verkuil");
TOT_REGS
};
-#define ALC_HOLD 0x85 /* R17: use zero cross detection, ALC hold time 42.6 ms */
-#define ALC_EN 0x100 /* R17: ALC enable */
-
struct wm8775_state {
struct v4l2_subdev sd;
struct v4l2_ctrl_handler hdl;
struct v4l2_ctrl *mute;
- struct v4l2_ctrl *vol;
- struct v4l2_ctrl *bal;
- struct v4l2_ctrl *loud;
u8 input; /* Last selected input (0-0xf) */
};
return -1;
}
-static void wm8775_set_audio(struct v4l2_subdev *sd, int quietly)
-{
- struct wm8775_state *state = to_state(sd);
- u8 vol_l, vol_r;
- int muted = 0 != state->mute->val;
- u16 volume = (u16)state->vol->val;
- u16 balance = (u16)state->bal->val;
-
- /* normalize ( 65535 to 0 -> 255 to 0 (+24dB to -103dB) ) */
- vol_l = (min(65536 - balance, 32768) * volume) >> 23;
- vol_r = (min(balance, (u16)32768) * volume) >> 23;
-
- /* Mute */
- if (muted || quietly)
- wm8775_write(sd, R21, 0x0c0 | state->input);
-
- wm8775_write(sd, R14, vol_l | 0x100); /* 0x100= Left channel ADC zero cross enable */
- wm8775_write(sd, R15, vol_r | 0x100); /* 0x100= Right channel ADC zero cross enable */
-
- /* Un-mute */
- if (!muted)
- wm8775_write(sd, R21, state->input);
-}
-
static int wm8775_s_routing(struct v4l2_subdev *sd,
u32 input, u32 output, u32 config)
{
state->input = input;
if (!v4l2_ctrl_g_ctrl(state->mute))
return 0;
- if (!v4l2_ctrl_g_ctrl(state->vol))
- return 0;
- if (!v4l2_ctrl_g_ctrl(state->bal))
- return 0;
- wm8775_set_audio(sd, 1);
+ wm8775_write(sd, R21, 0x0c0);
+ wm8775_write(sd, R14, 0x1d4);
+ wm8775_write(sd, R15, 0x1d4);
+ wm8775_write(sd, R21, 0x100 + state->input);
return 0;
}
static int wm8775_s_ctrl(struct v4l2_ctrl *ctrl)
{
struct v4l2_subdev *sd = to_sd(ctrl);
+ struct wm8775_state *state = to_state(sd);
switch (ctrl->id) {
case V4L2_CID_AUDIO_MUTE:
- case V4L2_CID_AUDIO_VOLUME:
- case V4L2_CID_AUDIO_BALANCE:
- wm8775_set_audio(sd, 0);
- return 0;
- case V4L2_CID_AUDIO_LOUDNESS:
- wm8775_write(sd, R17, (ctrl->val ? ALC_EN : 0) | ALC_HOLD);
+ wm8775_write(sd, R21, 0x0c0);
+ wm8775_write(sd, R14, 0x1d4);
+ wm8775_write(sd, R15, 0x1d4);
+ if (!ctrl->val)
+ wm8775_write(sd, R21, 0x100 + state->input);
return 0;
}
return -EINVAL;
static int wm8775_s_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *freq)
{
- wm8775_set_audio(sd, 0);
+ struct wm8775_state *state = to_state(sd);
+
+ /* If I remove this, then it can happen that I have no
+ sound the first time I tune from static to a valid channel.
+ It's difficult to reproduce and is almost certainly related
+ to the zero cross detect circuit. */
+ wm8775_write(sd, R21, 0x0c0);
+ wm8775_write(sd, R14, 0x1d4);
+ wm8775_write(sd, R15, 0x1d4);
+ wm8775_write(sd, R21, 0x100 + state->input);
return 0;
}
{
struct wm8775_state *state;
struct v4l2_subdev *sd;
- int err;
/* Check if the adapter supports the needed features */
if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA))
return -ENOMEM;
sd = &state->sd;
v4l2_i2c_subdev_init(sd, client, &wm8775_ops);
- sd->grp_id = WM8775_GID; /* subdev group id */
state->input = 2;
- v4l2_ctrl_handler_init(&state->hdl, 4);
+ v4l2_ctrl_handler_init(&state->hdl, 1);
state->mute = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
V4L2_CID_AUDIO_MUTE, 0, 1, 1, 0);
- state->vol = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
- V4L2_CID_AUDIO_VOLUME, 0, 65535, (65535+99)/100, 0xCF00); /* 0dB*/
- state->bal = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
- V4L2_CID_AUDIO_BALANCE, 0, 65535, (65535+99)/100, 32768);
- state->loud = v4l2_ctrl_new_std(&state->hdl, &wm8775_ctrl_ops,
- V4L2_CID_AUDIO_LOUDNESS, 0, 1, 1, 1);
sd->ctrl_handler = &state->hdl;
- err = state->hdl.error;
- if (err) {
+ if (state->hdl.error) {
+ int err = state->hdl.error;
+
v4l2_ctrl_handler_free(&state->hdl);
kfree(state);
return err;
wm8775_write(sd, R23, 0x000);
/* Disable zero cross detect timeout */
wm8775_write(sd, R7, 0x000);
- /* HPF enable, I2S mode, 24-bit */
- wm8775_write(sd, R11, 0x022);
+ /* Left justified, 24-bit mode */
+ wm8775_write(sd, R11, 0x021);
/* Master mode, clock ratio 256fs */
wm8775_write(sd, R12, 0x102);
/* Powered up */
wm8775_write(sd, R13, 0x000);
- /* ALC stereo, ALC target level -5dB FS, ALC max gain +8dB */
- wm8775_write(sd, R16, 0x1bb);
- /* Set ALC mode and hold time */
- wm8775_write(sd, R17, (state->loud->val ? ALC_EN : 0) | ALC_HOLD);
+ /* ADC gain +2.5dB, enable zero cross */
+ wm8775_write(sd, R14, 0x1d4);
+ /* ADC gain +2.5dB, enable zero cross */
+ wm8775_write(sd, R15, 0x1d4);
+ /* ALC Stereo, ALC target level -1dB FS max gain +8dB */
+ wm8775_write(sd, R16, 0x1bf);
+ /* Enable gain control, use zero cross detection,
+ ALC hold time 42.6 ms */
+ wm8775_write(sd, R17, 0x185);
/* ALC gain ramp up delay 34 s, ALC gain ramp down delay 33 ms */
wm8775_write(sd, R18, 0x0a2);
/* Enable noise gate, threshold -72dBfs */
wm8775_write(sd, R19, 0x005);
- /* Transient window 4ms, ALC min gain -5dB */
- wm8775_write(sd, R20, 0x0fb);
-
- wm8775_set_audio(sd, 1); /* set volume/mute/mux */
-
+ /* Transient window 4ms, lower PGA gain limit -1dB */
+ wm8775_write(sd, R20, 0x07a);
+ /* LRBOTH = 1, use input 2. */
+ wm8775_write(sd, R21, 0x102);
return 0;
}
struct atl1_rfd_ring rfd_old, rfd_new;
struct atl1_rrd_ring rrd_old, rrd_new;
struct atl1_ring_header rhdr_old, rhdr_new;
+ struct atl1_smb smb;
+ struct atl1_cmb cmb;
int err;
tpd_old = adapter->tpd_ring;
adapter->rrd_ring = rrd_old;
adapter->tpd_ring = tpd_old;
adapter->ring_header = rhdr_old;
+ /*
+ * Save SMB and CMB, since atl1_free_ring_resources
+ * will clear them.
+ */
+ smb = adapter->smb;
+ cmb = adapter->cmb;
atl1_free_ring_resources(adapter);
adapter->rfd_ring = rfd_new;
adapter->rrd_ring = rrd_new;
adapter->tpd_ring = tpd_new;
adapter->ring_header = rhdr_new;
+ adapter->smb = smb;
+ adapter->cmb = cmb;
err = atl1_up(adapter);
if (err)
&udev->l2_ring_map,
GFP_KERNEL | __GFP_COMP);
if (!udev->l2_ring)
- return -ENOMEM;
+ goto err_udev;
udev->l2_buf_size = (cp->l2_rx_ring_size + 1) * cp->l2_single_buf_size;
udev->l2_buf_size = PAGE_ALIGN(udev->l2_buf_size);
&udev->l2_buf_map,
GFP_KERNEL | __GFP_COMP);
if (!udev->l2_buf)
- return -ENOMEM;
+ goto err_dma;
write_lock(&cnic_dev_lock);
list_add(&udev->list, &cnic_udev_list);
cp->udev = udev;
return 0;
+ err_dma:
+ dma_free_coherent(&udev->pdev->dev, udev->l2_ring_size,
+ udev->l2_ring, udev->l2_ring_map);
+ err_udev:
+ kfree(udev);
+ return -ENOMEM;
}
static int cnic_init_uio(struct cnic_dev *dev)
static int ehea_set_flags(struct net_device *dev, u32 data)
{
+ /* Avoid changing the VLAN flags */
+ if ((data & (ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN)) !=
+ (ethtool_op_get_flags(dev) & (ETH_FLAG_RXVLAN |
+ ETH_FLAG_TXVLAN))){
+ return -EINVAL;
+ }
+
return ethtool_op_set_flags(dev, data, ETH_FLAG_LRO
| ETH_FLAG_TXVLAN
| ETH_FLAG_RXVLAN);
}
#ifdef CONFIG_PPP_MULTILINK
+static bool mp_protocol_compress __read_mostly = true;
+module_param(mp_protocol_compress, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(mp_protocol_compress,
+ "compress protocol id in multilink fragments");
+
/*
* Divide a packet to be transmitted into fragments and
* send them out the individual links.
if (nfree == 0 || nfree < navail / 2)
return 0; /* can't take now, leave it in xmit_pending */
- /* Do protocol field compression (XXX this should be optional) */
+ /* Do protocol field compression */
p = skb->data;
len = skb->len;
- if (*p == 0) {
+ if (*p == 0 && mp_protocol_compress) {
++p;
--len;
}
bp->SharedMemAddr = pci_alloc_consistent(&bp->pdev,
bp->SharedMemSize,
&bp->SharedMemDMA);
- if (!bp->SharedMemSize) {
+ if (!bp->SharedMemAddr) {
printk("could not allocate mem for ");
printk("hardware module: %ld byte\n",
bp->SharedMemSize);
* This SUCKS.
* We need a much better method to determine if dma_addr_t is 64-bit.
*/
-#if (defined(__i386__) && defined(CONFIG_HIGHMEM64G)) || defined(__x86_64__) || defined (__ia64__) || defined(__alpha__) || defined(__mips64__) || (defined(__mips__) && defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) || (defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT))
+#if (defined(__i386__) && defined(CONFIG_HIGHMEM64G)) || defined(__x86_64__) || defined (__ia64__) || defined(__alpha__) || (defined(CONFIG_MIPS) && ((defined(CONFIG_HIGHMEM) && defined(CONFIG_64BIT_PHYS_ADDR)) || defined(CONFIG_64BIT))) || (defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT))
/* 64-bit dma_addr_t */
#define ADDR_64BITS /* This chip uses 64 bit addresses. */
#define netdrv_addr_t __le64
cnt = pci_read_vpd(tp->pdev, pos,
TG3_NVM_VPD_LEN - pos,
&vpd_data[pos]);
- if (cnt == -ETIMEDOUT || -EINTR)
+ if (cnt == -ETIMEDOUT || cnt == -EINTR)
cnt = 0;
else if (cnt < 0)
goto out_not_found;
#include <drm/i915_drm.h>
#include <asm/msr.h>
#include <asm/processor.h>
+#include "intel_ips.h"
#define PCI_DEVICE_ID_INTEL_THERMAL_SENSOR 0x3b32
#define thm_writel(off, val) writel((val), ips->regmap + (off))
static const int IPS_ADJUST_PERIOD = 5000; /* ms */
+static bool late_i915_load = false;
/* For initial average collection */
static const int IPS_SAMPLE_PERIOD = 200; /* ms */
u64 orig_turbo_ratios;
};
+static bool
+ips_gpu_turbo_enabled(struct ips_driver *ips);
+
/**
* ips_cpu_busy - is CPU busy?
* @ips: IPS driver struct
*/
static bool ips_gpu_busy(struct ips_driver *ips)
{
- if (!ips->gpu_turbo_enabled)
+ if (!ips_gpu_turbo_enabled(ips))
return false;
return ips->gpu_busy();
*/
static void ips_gpu_raise(struct ips_driver *ips)
{
- if (!ips->gpu_turbo_enabled)
+ if (!ips_gpu_turbo_enabled(ips))
return;
if (!ips->gpu_raise())
*/
static void ips_gpu_lower(struct ips_driver *ips)
{
- if (!ips->gpu_turbo_enabled)
+ if (!ips_gpu_turbo_enabled(ips))
return;
if (!ips->gpu_lower())
return false;
}
+static bool
+ips_gpu_turbo_enabled(struct ips_driver *ips)
+{
+ if (!ips->gpu_busy && late_i915_load) {
+ if (ips_get_i915_syms(ips)) {
+ dev_info(&ips->dev->dev,
+ "i915 driver attached, reenabling gpu turbo\n");
+ ips->gpu_turbo_enabled = !(thm_readl(THM_HTS) & HTS_GTD_DIS);
+ }
+ }
+
+ return ips->gpu_turbo_enabled;
+}
+
+void
+ips_link_to_i915_driver()
+{
+ /* We can't cleanly get at the various ips_driver structs from
+ * this caller (the i915 driver), so just set a flag saying
+ * that it's time to try getting the symbols again.
+ */
+ late_i915_load = true;
+}
+EXPORT_SYMBOL_GPL(ips_link_to_i915_driver);
+
static DEFINE_PCI_DEVICE_TABLE(ips_id_table) = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL,
PCI_DEVICE_ID_INTEL_THERMAL_SENSOR), },
--- /dev/null
+/*
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * The full GNU General Public License is included in this distribution in
+ * the file called "COPYING".
+ */
+
+void ips_link_to_i915_driver(void);
#include <linux/sfi.h>
#include <asm/mrst.h>
#include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
/* IPC defines the following message types */
#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
iounmap(ipcdev.ipc_base);
return -ENOMEM;
}
+
+ intel_scu_devices_create();
+
return 0;
}
iounmap(ipcdev.ipc_base);
iounmap(ipcdev.i2c_base);
ipcdev.pdev = NULL;
+ intel_scu_devices_destroy();
}
static const struct pci_device_id pci_ids[] = {
This driver can also be built as a module. If so, the module
will be called rtc-cmos.
+config RTC_DRV_VRTC
+ tristate "Virtual RTC for Moorestown platforms"
+ depends on X86_MRST
+ default y if X86_MRST
+
+ help
+ Say "yes" here to get direct support for the real time clock
+ found on Moorestown platforms. The VRTC is a emulated RTC that
+ derives its clock source from a real RTC in the PMIC. The MC146818
+ style programming interface is mostly conserved, but any
+ updates are done via IPC calls to the system controller FW.
+
config RTC_DRV_DS1216
tristate "Dallas DS1216"
depends on SNI_RM
obj-$(CONFIG_RTC_DRV_COH901331) += rtc-coh901331.o
obj-$(CONFIG_RTC_DRV_DAVINCI) += rtc-davinci.o
obj-$(CONFIG_RTC_DRV_DM355EVM) += rtc-dm355evm.o
+obj-$(CONFIG_RTC_DRV_VRTC) += rtc-mrst.o
obj-$(CONFIG_RTC_DRV_DS1216) += rtc-ds1216.o
obj-$(CONFIG_RTC_DRV_DS1286) += rtc-ds1286.o
obj-$(CONFIG_RTC_DRV_DS1302) += rtc-ds1302.o
--- /dev/null
+/*
+ * rtc-mrst.c: Driver for Moorestown virtual RTC
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ * Feng Tang (feng.tang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * VRTC is emulated by system controller firmware, the real HW
+ * RTC is located in the PMIC device. SCU FW shadows PMIC RTC
+ * in a memory mapped IO space that is visible to the host IA
+ * processor.
+ *
+ * This driver is based upon drivers/rtc/rtc-cmos.c
+ */
+
+/*
+ * Note:
+ * * vRTC only supports binary mode and 24H mode
+ * * vRTC only support PIE and AIE, no UIE, and its PIE only happens
+ * at 23:59:59pm everyday, no support for adjustable frequency
+ * * Alarm function is also limited to hr/min/sec.
+ */
+
+#include <linux/mod_devicetable.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sfi.h>
+
+#include <asm-generic/rtc.h>
+#include <asm/intel_scu_ipc.h>
+#include <asm/mrst.h>
+#include <asm/mrst-vrtc.h>
+
+struct mrst_rtc {
+ struct rtc_device *rtc;
+ struct device *dev;
+ int irq;
+ struct resource *iomem;
+
+ u8 enabled_wake;
+ u8 suspend_ctrl;
+};
+
+static const char driver_name[] = "rtc_mrst";
+
+#define RTC_IRQMASK (RTC_PF | RTC_AF)
+
+static inline int is_intr(u8 rtc_intr)
+{
+ if (!(rtc_intr & RTC_IRQF))
+ return 0;
+ return rtc_intr & RTC_IRQMASK;
+}
+
+/*
+ * rtc_time's year contains the increment over 1900, but vRTC's YEAR
+ * register can't be programmed to value larger than 0x64, so vRTC
+ * driver chose to use 1960 (1970 is UNIX time start point) as the base,
+ * and does the translation at read/write time.
+ *
+ * Why not just use 1970 as the offset? it's because using 1960 will
+ * make it consistent in leap year setting for both vrtc and low-level
+ * physical rtc devices.
+ */
+static int mrst_read_time(struct device *dev, struct rtc_time *time)
+{
+ unsigned long flags;
+
+ if (rtc_is_updating())
+ mdelay(20);
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ time->tm_sec = vrtc_cmos_read(RTC_SECONDS);
+ time->tm_min = vrtc_cmos_read(RTC_MINUTES);
+ time->tm_hour = vrtc_cmos_read(RTC_HOURS);
+ time->tm_mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
+ time->tm_mon = vrtc_cmos_read(RTC_MONTH);
+ time->tm_year = vrtc_cmos_read(RTC_YEAR);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ /* Adjust for the 1960/1900 */
+ time->tm_year += 60;
+ time->tm_mon--;
+ return RTC_24H;
+}
+
+static int mrst_set_time(struct device *dev, struct rtc_time *time)
+{
+ int ret;
+ unsigned long flags;
+ unsigned char mon, day, hrs, min, sec;
+ unsigned int yrs;
+
+ yrs = time->tm_year;
+ mon = time->tm_mon + 1; /* tm_mon starts at zero */
+ day = time->tm_mday;
+ hrs = time->tm_hour;
+ min = time->tm_min;
+ sec = time->tm_sec;
+
+ if (yrs < 70 || yrs > 138)
+ return -EINVAL;
+ yrs -= 60;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ vrtc_cmos_write(yrs, RTC_YEAR);
+ vrtc_cmos_write(mon, RTC_MONTH);
+ vrtc_cmos_write(day, RTC_DAY_OF_MONTH);
+ vrtc_cmos_write(hrs, RTC_HOURS);
+ vrtc_cmos_write(min, RTC_MINUTES);
+ vrtc_cmos_write(sec, RTC_SECONDS);
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME);
+ return ret;
+}
+
+static int mrst_read_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned char rtc_control;
+
+ if (mrst->irq <= 0)
+ return -EIO;
+
+ /* Basic alarms only support hour, minute, and seconds fields.
+ * Some also support day and month, for alarms up to a year in
+ * the future.
+ */
+ t->time.tm_mday = -1;
+ t->time.tm_mon = -1;
+ t->time.tm_year = -1;
+
+ /* vRTC only supports binary mode */
+ spin_lock_irq(&rtc_lock);
+ t->time.tm_sec = vrtc_cmos_read(RTC_SECONDS_ALARM);
+ t->time.tm_min = vrtc_cmos_read(RTC_MINUTES_ALARM);
+ t->time.tm_hour = vrtc_cmos_read(RTC_HOURS_ALARM);
+
+ rtc_control = vrtc_cmos_read(RTC_CONTROL);
+ spin_unlock_irq(&rtc_lock);
+
+ t->enabled = !!(rtc_control & RTC_AIE);
+ t->pending = 0;
+
+ return 0;
+}
+
+static void mrst_checkintr(struct mrst_rtc *mrst, unsigned char rtc_control)
+{
+ unsigned char rtc_intr;
+
+ /*
+ * NOTE after changing RTC_xIE bits we always read INTR_FLAGS;
+ * allegedly some older rtcs need that to handle irqs properly
+ */
+ rtc_intr = vrtc_cmos_read(RTC_INTR_FLAGS);
+ rtc_intr &= (rtc_control & RTC_IRQMASK) | RTC_IRQF;
+ if (is_intr(rtc_intr))
+ rtc_update_irq(mrst->rtc, 1, rtc_intr);
+}
+
+static void mrst_irq_enable(struct mrst_rtc *mrst, unsigned char mask)
+{
+ unsigned char rtc_control;
+
+ /*
+ * Flush any pending IRQ status, notably for update irqs,
+ * before we enable new IRQs
+ */
+ rtc_control = vrtc_cmos_read(RTC_CONTROL);
+ mrst_checkintr(mrst, rtc_control);
+
+ rtc_control |= mask;
+ vrtc_cmos_write(rtc_control, RTC_CONTROL);
+
+ mrst_checkintr(mrst, rtc_control);
+}
+
+static void mrst_irq_disable(struct mrst_rtc *mrst, unsigned char mask)
+{
+ unsigned char rtc_control;
+
+ rtc_control = vrtc_cmos_read(RTC_CONTROL);
+ rtc_control &= ~mask;
+ vrtc_cmos_write(rtc_control, RTC_CONTROL);
+ mrst_checkintr(mrst, rtc_control);
+}
+
+static int mrst_set_alarm(struct device *dev, struct rtc_wkalrm *t)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned char hrs, min, sec;
+ int ret = 0;
+
+ if (!mrst->irq)
+ return -EIO;
+
+ hrs = t->time.tm_hour;
+ min = t->time.tm_min;
+ sec = t->time.tm_sec;
+
+ spin_lock_irq(&rtc_lock);
+ /* Next rtc irq must not be from previous alarm setting */
+ mrst_irq_disable(mrst, RTC_AIE);
+
+ /* Update alarm */
+ vrtc_cmos_write(hrs, RTC_HOURS_ALARM);
+ vrtc_cmos_write(min, RTC_MINUTES_ALARM);
+ vrtc_cmos_write(sec, RTC_SECONDS_ALARM);
+
+ spin_unlock_irq(&rtc_lock);
+
+ ret = intel_scu_ipc_simple_command(IPCMSG_VRTC, IPC_CMD_VRTC_SETALARM);
+ if (ret)
+ return ret;
+
+ spin_lock_irq(&rtc_lock);
+ if (t->enabled)
+ mrst_irq_enable(mrst, RTC_AIE);
+
+ spin_unlock_irq(&rtc_lock);
+
+ return 0;
+}
+
+static int mrst_irq_set_state(struct device *dev, int enabled)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned long flags;
+
+ if (!mrst->irq)
+ return -ENXIO;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+
+ if (enabled)
+ mrst_irq_enable(mrst, RTC_PIE);
+ else
+ mrst_irq_disable(mrst, RTC_PIE);
+
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return 0;
+}
+
+#if defined(CONFIG_RTC_INTF_DEV) || defined(CONFIG_RTC_INTF_DEV_MODULE)
+
+/* Currently, the vRTC doesn't support UIE ON/OFF */
+static int
+mrst_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned long flags;
+
+ switch (cmd) {
+ case RTC_AIE_OFF:
+ case RTC_AIE_ON:
+ if (!mrst->irq)
+ return -EINVAL;
+ break;
+ default:
+ /* PIE ON/OFF is handled by mrst_irq_set_state() */
+ return -ENOIOCTLCMD;
+ }
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ switch (cmd) {
+ case RTC_AIE_OFF: /* alarm off */
+ mrst_irq_disable(mrst, RTC_AIE);
+ break;
+ case RTC_AIE_ON: /* alarm on */
+ mrst_irq_enable(mrst, RTC_AIE);
+ break;
+ }
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ return 0;
+}
+
+#else
+#define mrst_rtc_ioctl NULL
+#endif
+
+#if defined(CONFIG_RTC_INTF_PROC) || defined(CONFIG_RTC_INTF_PROC_MODULE)
+
+static int mrst_procfs(struct device *dev, struct seq_file *seq)
+{
+ unsigned char rtc_control, valid;
+
+ spin_lock_irq(&rtc_lock);
+ rtc_control = vrtc_cmos_read(RTC_CONTROL);
+ valid = vrtc_cmos_read(RTC_VALID);
+ spin_unlock_irq(&rtc_lock);
+
+ return seq_printf(seq,
+ "periodic_IRQ\t: %s\n"
+ "alarm\t\t: %s\n"
+ "BCD\t\t: no\n"
+ "periodic_freq\t: daily (not adjustable)\n",
+ (rtc_control & RTC_PIE) ? "on" : "off",
+ (rtc_control & RTC_AIE) ? "on" : "off");
+}
+
+#else
+#define mrst_procfs NULL
+#endif
+
+static const struct rtc_class_ops mrst_rtc_ops = {
+ .ioctl = mrst_rtc_ioctl,
+ .read_time = mrst_read_time,
+ .set_time = mrst_set_time,
+ .read_alarm = mrst_read_alarm,
+ .set_alarm = mrst_set_alarm,
+ .proc = mrst_procfs,
+ .irq_set_state = mrst_irq_set_state,
+};
+
+static struct mrst_rtc mrst_rtc;
+
+/*
+ * When vRTC IRQ is captured by SCU FW, FW will clear the AIE bit in
+ * Reg B, so no need for this driver to clear it
+ */
+static irqreturn_t mrst_rtc_irq(int irq, void *p)
+{
+ u8 irqstat;
+
+ spin_lock(&rtc_lock);
+ /* This read will clear all IRQ flags inside Reg C */
+ irqstat = vrtc_cmos_read(RTC_INTR_FLAGS);
+ spin_unlock(&rtc_lock);
+
+ irqstat &= RTC_IRQMASK | RTC_IRQF;
+ if (is_intr(irqstat)) {
+ rtc_update_irq(p, 1, irqstat);
+ return IRQ_HANDLED;
+ }
+ return IRQ_NONE;
+}
+
+static int __init
+vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, int rtc_irq)
+{
+ int retval = 0;
+ unsigned char rtc_control;
+
+ /* There can be only one ... */
+ if (mrst_rtc.dev)
+ return -EBUSY;
+
+ if (!iomem)
+ return -ENODEV;
+
+ iomem = request_mem_region(iomem->start,
+ iomem->end + 1 - iomem->start,
+ driver_name);
+ if (!iomem) {
+ dev_dbg(dev, "i/o mem already in use.\n");
+ return -EBUSY;
+ }
+
+ mrst_rtc.irq = rtc_irq;
+ mrst_rtc.iomem = iomem;
+
+ mrst_rtc.rtc = rtc_device_register(driver_name, dev,
+ &mrst_rtc_ops, THIS_MODULE);
+ if (IS_ERR(mrst_rtc.rtc)) {
+ retval = PTR_ERR(mrst_rtc.rtc);
+ goto cleanup0;
+ }
+
+ mrst_rtc.dev = dev;
+ dev_set_drvdata(dev, &mrst_rtc);
+ rename_region(iomem, dev_name(&mrst_rtc.rtc->dev));
+
+ spin_lock_irq(&rtc_lock);
+ mrst_irq_disable(&mrst_rtc, RTC_PIE | RTC_AIE);
+ rtc_control = vrtc_cmos_read(RTC_CONTROL);
+ spin_unlock_irq(&rtc_lock);
+
+ if (!(rtc_control & RTC_24H) || (rtc_control & (RTC_DM_BINARY)))
+ dev_dbg(dev, "TODO: support more than 24-hr BCD mode\n");
+
+ if (rtc_irq) {
+ retval = request_irq(rtc_irq, mrst_rtc_irq,
+ IRQF_DISABLED, dev_name(&mrst_rtc.rtc->dev),
+ mrst_rtc.rtc);
+ if (retval < 0) {
+ dev_dbg(dev, "IRQ %d is already in use, err %d\n",
+ rtc_irq, retval);
+ goto cleanup1;
+ }
+ }
+ dev_dbg(dev, "initialised\n");
+ return 0;
+
+cleanup1:
+ mrst_rtc.dev = NULL;
+ rtc_device_unregister(mrst_rtc.rtc);
+cleanup0:
+ release_region(iomem->start, iomem->end + 1 - iomem->start);
+ dev_err(dev, "rtc-mrst: unable to initialise\n");
+ return retval;
+}
+
+static void rtc_mrst_do_shutdown(void)
+{
+ spin_lock_irq(&rtc_lock);
+ mrst_irq_disable(&mrst_rtc, RTC_IRQMASK);
+ spin_unlock_irq(&rtc_lock);
+}
+
+static void __exit rtc_mrst_do_remove(struct device *dev)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ struct resource *iomem;
+
+ rtc_mrst_do_shutdown();
+
+ if (mrst->irq)
+ free_irq(mrst->irq, mrst->rtc);
+
+ rtc_device_unregister(mrst->rtc);
+ mrst->rtc = NULL;
+
+ iomem = mrst->iomem;
+ release_region(iomem->start, iomem->end + 1 - iomem->start);
+ mrst->iomem = NULL;
+
+ mrst->dev = NULL;
+ dev_set_drvdata(dev, NULL);
+}
+
+#ifdef CONFIG_PM
+static int mrst_suspend(struct device *dev, pm_message_t mesg)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned char tmp;
+
+ /* Only the alarm might be a wakeup event source */
+ spin_lock_irq(&rtc_lock);
+ mrst->suspend_ctrl = tmp = vrtc_cmos_read(RTC_CONTROL);
+ if (tmp & (RTC_PIE | RTC_AIE)) {
+ unsigned char mask;
+
+ if (device_may_wakeup(dev))
+ mask = RTC_IRQMASK & ~RTC_AIE;
+ else
+ mask = RTC_IRQMASK;
+ tmp &= ~mask;
+ vrtc_cmos_write(tmp, RTC_CONTROL);
+
+ mrst_checkintr(mrst, tmp);
+ }
+ spin_unlock_irq(&rtc_lock);
+
+ if (tmp & RTC_AIE) {
+ mrst->enabled_wake = 1;
+ enable_irq_wake(mrst->irq);
+ }
+
+ dev_dbg(&mrst_rtc.rtc->dev, "suspend%s, ctrl %02x\n",
+ (tmp & RTC_AIE) ? ", alarm may wake" : "",
+ tmp);
+
+ return 0;
+}
+
+/*
+ * We want RTC alarms to wake us from the deep power saving state
+ */
+static inline int mrst_poweroff(struct device *dev)
+{
+ return mrst_suspend(dev, PMSG_HIBERNATE);
+}
+
+static int mrst_resume(struct device *dev)
+{
+ struct mrst_rtc *mrst = dev_get_drvdata(dev);
+ unsigned char tmp = mrst->suspend_ctrl;
+
+ /* Re-enable any irqs previously active */
+ if (tmp & RTC_IRQMASK) {
+ unsigned char mask;
+
+ if (mrst->enabled_wake) {
+ disable_irq_wake(mrst->irq);
+ mrst->enabled_wake = 0;
+ }
+
+ spin_lock_irq(&rtc_lock);
+ do {
+ vrtc_cmos_write(tmp, RTC_CONTROL);
+
+ mask = vrtc_cmos_read(RTC_INTR_FLAGS);
+ mask &= (tmp & RTC_IRQMASK) | RTC_IRQF;
+ if (!is_intr(mask))
+ break;
+
+ rtc_update_irq(mrst->rtc, 1, mask);
+ tmp &= ~RTC_AIE;
+ } while (mask & RTC_AIE);
+ spin_unlock_irq(&rtc_lock);
+ }
+
+ dev_dbg(&mrst_rtc.rtc->dev, "resume, ctrl %02x\n", tmp);
+
+ return 0;
+}
+
+#else
+#define mrst_suspend NULL
+#define mrst_resume NULL
+
+static inline int mrst_poweroff(struct device *dev)
+{
+ return -ENOSYS;
+}
+
+#endif
+
+static int __init vrtc_mrst_platform_probe(struct platform_device *pdev)
+{
+ return vrtc_mrst_do_probe(&pdev->dev,
+ platform_get_resource(pdev, IORESOURCE_MEM, 0),
+ platform_get_irq(pdev, 0));
+}
+
+static int __exit vrtc_mrst_platform_remove(struct platform_device *pdev)
+{
+ rtc_mrst_do_remove(&pdev->dev);
+ return 0;
+}
+
+static void vrtc_mrst_platform_shutdown(struct platform_device *pdev)
+{
+ if (system_state == SYSTEM_POWER_OFF && !mrst_poweroff(&pdev->dev))
+ return;
+
+ rtc_mrst_do_shutdown();
+}
+
+MODULE_ALIAS("platform:vrtc_mrst");
+
+static struct platform_driver vrtc_mrst_platform_driver = {
+ .probe = vrtc_mrst_platform_probe,
+ .remove = __exit_p(vrtc_mrst_platform_remove),
+ .shutdown = vrtc_mrst_platform_shutdown,
+ .driver = {
+ .name = (char *) driver_name,
+ .suspend = mrst_suspend,
+ .resume = mrst_resume,
+ }
+};
+
+static int __init vrtc_mrst_init(void)
+{
+ return platform_driver_register(&vrtc_mrst_platform_driver);
+}
+
+static void __exit vrtc_mrst_exit(void)
+{
+ platform_driver_unregister(&vrtc_mrst_platform_driver);
+}
+
+module_init(vrtc_mrst_init);
+module_exit(vrtc_mrst_exit);
+
+MODULE_AUTHOR("Jacob Pan; Feng Tang");
+MODULE_DESCRIPTION("Driver for Moorestown virtual RTC");
+MODULE_LICENSE("GPL");
msg = container_of(mcfqspi->msgq.next, struct spi_message,
queue);
- list_del_init(&mcfqspi->msgq);
+ list_del_init(&msg->queue);
spin_unlock_irqrestore(&mcfqspi->lock, flags);
spi = msg->spi;
/* work with hotplug and coldplug */
MODULE_ALIAS("platform:omap2_mcspi");
+#ifdef CONFIG_SUSPEND
+/*
+ * When SPI wake up from off-mode, CS is in activate state. If it was in
+ * unactive state when driver was suspend, then force it to unactive state at
+ * wake up.
+ */
+static int omap2_mcspi_resume(struct device *dev)
+{
+ struct spi_master *master = dev_get_drvdata(dev);
+ struct omap2_mcspi *mcspi = spi_master_get_devdata(master);
+ struct omap2_mcspi_cs *cs;
+
+ omap2_mcspi_enable_clocks(mcspi);
+ list_for_each_entry(cs, &omap2_mcspi_ctx[master->bus_num - 1].cs,
+ node) {
+ if ((cs->chconf0 & OMAP2_MCSPI_CHCONF_FORCE) == 0) {
+
+ /*
+ * We need to toggle CS state for OMAP take this
+ * change in account.
+ */
+ MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 1);
+ __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
+ MOD_REG_BIT(cs->chconf0, OMAP2_MCSPI_CHCONF_FORCE, 0);
+ __raw_writel(cs->chconf0, cs->base + OMAP2_MCSPI_CHCONF0);
+ }
+ }
+ omap2_mcspi_disable_clocks(mcspi);
+ return 0;
+}
+#else
+#define omap2_mcspi_resume NULL
+#endif
+
+static const struct dev_pm_ops omap2_mcspi_pm_ops = {
+ .resume = omap2_mcspi_resume,
+};
+
static struct platform_driver omap2_mcspi_driver = {
.driver = {
.name = "omap2_mcspi",
.owner = THIS_MODULE,
+ .pm = &omap2_mcspi_pm_ops
},
.remove = __exit_p(omap2_mcspi_remove),
};
int ret = 0;
struct zram *zram = queue->queuedata;
- if (unlikely(!zram->init_done)) {
- set_bit(BIO_UPTODATE, &bio->bi_flags);
- bio_endio(bio, 0);
- return 0;
- }
-
if (!valid_io_request(zram, bio)) {
zram_stat64_inc(zram, &zram->stats.invalid_io);
bio_io_error(bio);
goto err1;
}
- sc->kthread = kthread_run(uea_kthread, sc, "ueagle-atm");
- if (sc->kthread == ERR_PTR(-ENOMEM)) {
+ /* Create worker thread, but don't start it here. Start it after
+ * all usbatm generic initialization is done.
+ */
+ sc->kthread = kthread_create(uea_kthread, sc, "ueagle-atm");
+ if (IS_ERR(sc->kthread)) {
uea_err(INS_TO_USBDEV(sc), "failed to create thread\n");
goto err2;
}
static int uea_probe(struct usb_interface *intf, const struct usb_device_id *id)
{
struct usb_device *usb = interface_to_usbdev(intf);
+ int ret;
uea_enters(usb);
uea_info(usb, "ADSL device founded vid (%#X) pid (%#X) Rev (%#X): %s\n",
if (UEA_IS_PREFIRM(id))
return uea_load_firmware(usb, UEA_CHIP_VERSION(id));
- return usbatm_usb_probe(intf, id, &uea_usbatm_driver);
+ ret = usbatm_usb_probe(intf, id, &uea_usbatm_driver);
+ if (ret == 0) {
+ struct usbatm_data *usbatm = usb_get_intfdata(intf);
+ struct uea_softc *sc = usbatm->driver_data;
+
+ /* Ensure carrier is initialized to off as early as possible */
+ UPDATE_ATM_SIGNAL(ATM_PHY_SIG_LOST);
+
+ /* Only start the worker thread when all init is done */
+ wake_up_process(sc->kthread);
+ }
+
+ return ret;
}
static void uea_disconnect(struct usb_interface *intf)
*/
#ifdef CONFIG_HPWDT_NMI_DECODING
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#ifdef CONFIG_X86_LOCAL_APIC
static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
{
/*
* If nmi_watchdog is turned off then we can turn on
* our nmi decoding capability.
*/
- if (!nmi_watchdog_active())
- hpwdt_nmi_decoding = 1;
- else
- dev_warn(&dev->dev, "NMI decoding is disabled. To enable this "
- "functionality you must reboot with nmi_watchdog=0 "
- "and load the hpwdt driver with priority=1.\n");
+ hpwdt_nmi_decoding = 1;
}
#else
static void __devinit hpwdt_check_nmi_decoding(struct pci_dev *dev)
dev_warn(&dev->dev, "NMI decoding is disabled. "
"Your kernel does not support a NMI Watchdog.\n");
}
-#endif /* ARCH_HAS_NMI_WATCHDOG */
+#endif /* CONFIG_X86_LOCAL_APIC */
static int __devinit hpwdt_init_nmi_decoding(struct pci_dev *dev)
{
int metadata;
unsigned int revokes = 0;
int x;
- int error;
+ int error = 0;
if (!*top)
sm->sm_first = 0;
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
- error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
+ else if (!sdp->sd_rgrps)
+ error = gfs2_ri_update(ip);
+
if (error)
return error;
out_rlist:
gfs2_rlist_free(&rlist);
out:
- gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
+ if (ip != GFS2_I(sdp->sd_rindex))
+ gfs2_glock_dq_uninit(&ip->i_alloc->al_ri_gh);
return error;
}
spin_unlock(&gl->gl_spin);
}
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int req_state,
- unsigned int flags)
-{
- int ret = LM_OUT_ERROR;
-
- if (!sdp->sd_lockstruct.ls_ops->lm_lock)
- return req_state == LM_ST_UNLOCKED ? 0 : req_state;
-
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
- req_state, flags);
- return ret;
-}
-
/**
* do_xmote - Calls the DLM to change the state of a lock
* @gl: The lock state
lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
LM_FLAG_PRIORITY);
- BUG_ON(gl->gl_state == target);
- BUG_ON(gl->gl_state == gl->gl_target);
+ GLOCK_BUG_ON(gl, gl->gl_state == target);
+ GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
glops->go_inval) {
set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
do_error(gl, 0); /* Fail queued try locks */
}
+ gl->gl_req = target;
spin_unlock(&gl->gl_spin);
if (glops->go_xmote_th)
glops->go_xmote_th(gl);
gl->gl_state == LM_ST_DEFERRED) &&
!(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
lck_flags |= LM_FLAG_TRY_1CB;
- ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
- if (!(ret & LM_OUT_ASYNC)) {
- finish_xmote(gl, ret);
+ if (sdp->sd_lockstruct.ls_ops->lm_lock) {
+ /* lock_dlm */
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
+ GLOCK_BUG_ON(gl, ret);
+ } else { /* lock_nolock */
+ finish_xmote(gl, target);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
- } else {
- GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
}
+
spin_lock(&gl->gl_spin);
}
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
va_start(args, fmt);
+
if (seq) {
struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
seq_printf(seq, gi->string);
} else {
- printk(KERN_ERR " ");
- vprintk(fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk(KERN_ERR " %pV", &vaf);
}
+
va_end(args);
}
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
+ * The gl_reply field is under the gl_spin lock so that it is ok
+ * to use a bitfield shared with other glock state fields.
*/
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
- spin_lock(&gl->gl_spin);
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return;
}
- spin_unlock(&gl->gl_spin);
}
+
+ spin_unlock(&gl->gl_spin);
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ smp_wmb();
gfs2_glock_hold(gl);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
struct task_struct *gh_owner = NULL;
- char buffer[KSYM_SYMBOL_LEN];
char flags_buf[32];
- sprint_symbol(buffer, gh->gh_ip);
if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
- state2str(gh->gh_state),
- hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
- gh->gh_error,
- gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
- gh_owner ? gh_owner->comm : "(ended)", buffer);
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)",
+ (void *)gh->gh_ip);
return 0;
}
}
#endif
- glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
+ glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZEABLE, 0);
if (IS_ERR(glock_workqueue))
return PTR_ERR(glock_workqueue);
- gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
- WQ_FREEZEABLE, 0);
+ gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
+ WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+ 0);
if (IS_ERR(gfs2_delete_workqueue)) {
destroy_workqueue(glock_workqueue);
return PTR_ERR(gfs2_delete_workqueue);
#define GL_ASYNC 0x00000040
#define GL_EXACT 0x00000080
#define GL_SKIP 0x00000100
-#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
/*
- * lm_lock() and lm_async_cb return flags
+ * lm_async_cb return flags
*
* LM_OUT_ST_MASK
* Masks the lower two bits of lock state in the returned value.
* LM_OUT_CANCELED
* The lock request was canceled.
*
- * LM_OUT_ASYNC
- * The result of the request will be returned in an LM_CB_ASYNC callback.
- *
*/
#define LM_OUT_ST_MASK 0x00000003
#define LM_OUT_CANCELED 0x00000008
-#define LM_OUT_ASYNC 0x00000080
-#define LM_OUT_ERROR 0x00000100
+#define LM_OUT_ERROR 0x00000004
/*
* lm_recovery_done() messages
void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
- unsigned int (*lm_lock) (struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags);
+ int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags);
void (*lm_cancel) (struct gfs2_glock *gl);
const match_table_t *lm_tokens;
};
-#define LM_FLAG_TRY 0x00000001
-#define LM_FLAG_TRY_1CB 0x00000002
-#define LM_FLAG_NOEXP 0x00000004
-#define LM_FLAG_ANY 0x00000008
-#define LM_FLAG_PRIORITY 0x00000010
-
-#define GL_ASYNC 0x00000040
-#define GL_EXACT 0x00000080
-#define GL_SKIP 0x00000100
-#define GL_NOCACHE 0x00000400
-
-#define GLR_TRYFAILED 13
-
extern struct workqueue_struct *gfs2_delete_workqueue;
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
if (gl->gl_state != LM_ST_UNLOCKED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- flush_workqueue(gfs2_delete_workqueue);
gfs2_meta_syncfs(sdp);
gfs2_log_shutdown(sdp);
}
spinlock_t gl_spin;
- unsigned int gl_state;
- unsigned int gl_target;
- unsigned int gl_reply;
+ /* State fields protected by gl_spin */
+ unsigned int gl_state:2, /* Current state */
+ gl_target:2, /* Target state */
+ gl_demote_state:2, /* State requested by remote node */
+ gl_req:2, /* State in last dlm request */
+ gl_reply:8; /* Last reply from the dlm */
+
unsigned int gl_hash;
- unsigned int gl_req;
- unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
struct list_head gl_holders;
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
-
- gfs2_assert_warn(GFS2_SB(inode), !error);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
return lkf;
}
-static unsigned int gdlm_lock(struct gfs2_glock *gl,
- unsigned int req_state, unsigned int flags)
+static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
+ unsigned int flags)
{
struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
- int error;
int req;
u32 lkf;
- gl->gl_req = req_state;
req = make_mode(req_state);
lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
* Submit the actual lock request.
*/
- error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
- GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
- if (error == -EAGAIN)
- return 0;
- if (error)
- return LM_OUT_ERROR;
- return LM_OUT_ASYNC;
+ return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+ GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
}
static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct buffer_head *dibh;
u32 ouid, ogid, nuid, ngid;
int error;
if (error)
goto out_gunlock_q;
- error = gfs2_meta_inode_buffer(ip, &dibh);
+ error = gfs2_setattr_simple(ip, attr);
if (error)
goto out_end_trans;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(sdp, !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
gfs2_quota_change(ip, -blocks, ouid, ogid);
qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
qd->qd_qb.qb_limit = qp->qu_limit;
}
+ if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+ qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_value = qp->qu_value;
+ }
}
/* Write the quota into the quota file on disk */
}
/* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD)
+#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
struct fs_disk_quota *fdq)
if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
fdq->d_fieldmask ^= FS_DQ_BSOFT;
+
if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
fdq->d_fieldmask ^= FS_DQ_BHARD;
+
+ if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+ ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+ fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+
if (fdq->d_fieldmask == 0)
goto out_i;
.get_dqblk = gfs2_get_dqblk,
.set_dqblk = gfs2_set_dqblk,
};
-
for (rgrps = 0;; rgrps++) {
loff_t pos = rgrps * sizeof(struct gfs2_rindex);
- if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
+ if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
break;
error = gfs2_internal_read(ip, &ra_state, buf, &pos,
sizeof(struct gfs2_rindex));
* Returns: 0 on successful update, error code otherwise
*/
-static int gfs2_ri_update(struct gfs2_inode *ip)
+int gfs2_ri_update(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct inode *inode = &ip->i_inode;
return 0;
}
-/**
- * gfs2_ri_update_special - Pull in a new resource index from the disk
- *
- * This is a special version that's safe to call from gfs2_inplace_reserve_i.
- * In this case we know that we don't have any resource groups in memory yet.
- *
- * @ip: pointer to the rindex inode
- *
- * Returns: 0 on successful update, error code otherwise
- */
-static int gfs2_ri_update_special(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct inode *inode = &ip->i_inode;
- struct file_ra_state ra_state;
- struct gfs2_rgrpd *rgd;
- unsigned int max_data = 0;
- int error;
-
- file_ra_state_init(&ra_state, inode->i_mapping);
- for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
- /* Ignore partials */
- if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
- i_size_read(inode))
- break;
- error = read_rindex_entry(ip, &ra_state);
- if (error) {
- clear_rgrpdi(sdp);
- return error;
- }
- }
- list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
- if (rgd->rd_data > max_data)
- max_data = rgd->rd_data;
- sdp->sd_max_rg_data = max_data;
-
- sdp->sd_rindex_uptodate = 1;
- return 0;
-}
-
/**
* gfs2_rindex_hold - Grab a lock on the rindex
* @sdp: The GFS2 superblock
error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
else if (!sdp->sd_rgrps) /* We may not have the rindex read
in, so: */
- error = gfs2_ri_update_special(ip);
+ error = gfs2_ri_update(ip);
if (error)
return error;
}
+try_again:
do {
error = get_local_rgrp(ip, &last_unlinked);
/* If there is no space, flushing the log may release some */
- if (error)
+ if (error) {
+ if (ip == GFS2_I(sdp->sd_rindex) &&
+ !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ return error;
+ goto try_again;
+ }
gfs2_log_flush(sdp, NULL);
+ }
} while (error && tries++ < 3);
if (error) {
extern void gfs2_inplace_release(struct gfs2_inode *ip);
+extern int gfs2_ri_update(struct gfs2_inode *ip);
extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
{
- struct inode *inode = &ip->i_inode;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_ea_location el;
- struct buffer_head *dibh;
int error;
error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
if (error)
return error;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_trans_end;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- int error;
-
- error = vmtruncate(inode, attr->ia_size);
- gfs2_assert_warn(GFS2_SB(inode), !error);
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
-
-out_trans_end:
+ error = gfs2_setattr_simple(ip, attr);
gfs2_trans_end(sdp);
return error;
}
#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+/*
+ * Print out autogroup related information:
+ */
+static int sched_autogroup_show(struct seq_file *m, void *v)
+{
+ struct inode *inode = m->private;
+ struct task_struct *p;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+ proc_sched_autogroup_show_task(p, m);
+
+ put_task_struct(p);
+
+ return 0;
+}
+
+static ssize_t
+sched_autogroup_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *offset)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct task_struct *p;
+ char buffer[PROC_NUMBUF];
+ long nice;
+ int err;
+
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count))
+ return -EFAULT;
+
+ err = strict_strtol(strstrip(buffer), 0, &nice);
+ if (err)
+ return -EINVAL;
+
+ p = get_proc_task(inode);
+ if (!p)
+ return -ESRCH;
+
+ err = nice;
+ err = proc_sched_autogroup_set_nice(p, &err);
+ if (err)
+ count = err;
+
+ put_task_struct(p);
+
+ return count;
+}
+
+static int sched_autogroup_open(struct inode *inode, struct file *filp)
+{
+ int ret;
+
+ ret = single_open(filp, sched_autogroup_show, NULL);
+ if (!ret) {
+ struct seq_file *m = filp->private_data;
+
+ m->private = inode;
+ }
+ return ret;
+}
+
+static const struct file_operations proc_pid_sched_autogroup_operations = {
+ .open = sched_autogroup_open,
+ .read = seq_read,
+ .write = sched_autogroup_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
+
static ssize_t comm_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
+#endif
+#ifdef CONFIG_SCHED_AUTOGROUP
+ REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
extern int wait_for_completion_killable(struct completion *x);
extern unsigned long wait_for_completion_timeout(struct completion *x,
unsigned long timeout);
-extern unsigned long wait_for_completion_interruptible_timeout(
- struct completion *x, unsigned long timeout);
-extern unsigned long wait_for_completion_killable_timeout(
- struct completion *x, unsigned long timeout);
+extern long wait_for_completion_interruptible_timeout(
+ struct completion *x, unsigned long timeout);
+extern long wait_for_completion_killable_timeout(
+ struct completion *x, unsigned long timeout);
extern bool try_wait_for_completion(struct completion *x);
extern bool completion_done(struct completion *x);
#ifdef CONFIG_DMA_ENGINE
enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx);
void dma_issue_pending_all(void);
+struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param);
+void dma_release_channel(struct dma_chan *chan);
#else
static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx)
{
}
static inline void dma_issue_pending_all(void)
{
- do { } while (0);
+}
+static inline struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask,
+ dma_filter_fn fn, void *fn_param)
+{
+ return NULL;
+}
+static inline void dma_release_channel(struct dma_chan *chan)
+{
}
#endif
void dma_run_dependencies(struct dma_async_tx_descriptor *tx);
struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type);
#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y)
-struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param);
-void dma_release_channel(struct dma_chan *chan);
/* --- Helper iov-locking functions --- */
TRACE_EVENT_FL_ENABLED_BIT,
TRACE_EVENT_FL_FILTERED_BIT,
TRACE_EVENT_FL_RECORDED_CMD_BIT,
+ TRACE_EVENT_FL_CAP_ANY_BIT,
};
enum {
TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT),
TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT),
+ TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
};
struct ftrace_event_call {
#endif
};
+#define __TRACE_EVENT_FLAGS(name, value) \
+ static int __init trace_init_flags_##name(void) \
+ { \
+ event_##name.flags = value; \
+ return 0; \
+ } \
+ early_initcall(trace_init_flags_##name);
+
#define PERF_MAX_TRACE_SIZE 2048
#define MAX_FILTER_PRED 32
FILTER_PTR_STRING,
};
+#define EVENT_STORAGE_SIZE 128
+extern struct mutex event_storage_mutex;
+extern char event_storage[EVENT_STORAGE_SIZE];
+
extern int trace_event_raw_init(struct ftrace_event_call *call);
extern int trace_define_field(struct ftrace_event_call *call, const char *type,
const char *name, int offset, int size,
#include <linux/wait.h>
#include <linux/percpu.h>
#include <linux/timer.h>
-
+#include <linux/timerqueue.h>
struct hrtimer_clock_base;
struct hrtimer_cpu_base;
/**
* struct hrtimer - the basic hrtimer structure
- * @node: red black tree node for time ordered insertion
- * @_expires: the absolute expiry time in the hrtimers internal
+ * @node: timerqueue node, which also manages node.expires,
+ * the absolute expiry time in the hrtimers internal
* representation. The time is related to the clock on
* which the timer is based. Is setup by adding
* slack to the _softexpires value. For non range timers
* The hrtimer structure must be initialized by hrtimer_init()
*/
struct hrtimer {
- struct rb_node node;
- ktime_t _expires;
+ struct timerqueue_node node;
ktime_t _softexpires;
enum hrtimer_restart (*function)(struct hrtimer *);
struct hrtimer_clock_base *base;
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
clockid_t index;
- struct rb_root active;
- struct rb_node *first;
+ struct timerqueue_head active;
ktime_t resolution;
ktime_t (*get_time)(void);
ktime_t softirq_time;
* @lock: lock protecting the base and associated clock bases
* and timers
* @clock_base: array of clock bases for this cpu
- * @curr_timer: the timer which is executing a callback right now
* @expires_next: absolute time of the next event which was scheduled
* via clock_set_next_event()
* @hres_active: State of high resolution mode
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
- timer->_expires = time;
+ timer->node.expires = time;
timer->_softexpires = time;
}
static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta)
{
timer->_softexpires = time;
- timer->_expires = ktime_add_safe(time, delta);
+ timer->node.expires = ktime_add_safe(time, delta);
}
static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, unsigned long delta)
{
timer->_softexpires = time;
- timer->_expires = ktime_add_safe(time, ns_to_ktime(delta));
+ timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta));
}
static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64)
{
- timer->_expires.tv64 = tv64;
+ timer->node.expires.tv64 = tv64;
timer->_softexpires.tv64 = tv64;
}
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
- timer->_expires = ktime_add_safe(timer->_expires, time);
+ timer->node.expires = ktime_add_safe(timer->node.expires, time);
timer->_softexpires = ktime_add_safe(timer->_softexpires, time);
}
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
- timer->_expires = ktime_add_ns(timer->_expires, ns);
+ timer->node.expires = ktime_add_ns(timer->node.expires, ns);
timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
}
static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer)
{
- return timer->_expires;
+ return timer->node.expires;
}
static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer)
static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer)
{
- return timer->_expires.tv64;
+ return timer->node.expires.tv64;
}
static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer)
{
static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer)
{
- return ktime_to_ns(timer->_expires);
+ return ktime_to_ns(timer->node.expires);
}
static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
{
- return ktime_sub(timer->_expires, timer->base->get_time());
+ return ktime_sub(timer->node.expires, timer->base->get_time());
}
#ifdef CONFIG_HIGH_RES_TIMERS
#include <linux/securebits.h>
#include <net/net_namespace.h>
+#ifdef CONFIG_SMP
+# define INIT_PUSHABLE_TASKS(tsk) \
+ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
+#else
+# define INIT_PUSHABLE_TASKS(tsk)
+#endif
+
extern struct files_struct init_files;
extern struct fs_struct init_fs;
*/
# define CAP_INIT_BSET CAP_FULL_SET
+#ifdef CONFIG_RCU_BOOST
+#define INIT_TASK_RCU_BOOST() \
+ .rcu_boost_mutex = NULL,
+#else
+#define INIT_TASK_RCU_BOOST()
+#endif
#ifdef CONFIG_TREE_PREEMPT_RCU
#define INIT_TASK_RCU_TREE_PREEMPT() \
.rcu_blocked_node = NULL,
.rcu_read_lock_nesting = 0, \
.rcu_read_unlock_special = 0, \
.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
- INIT_TASK_RCU_TREE_PREEMPT()
+ INIT_TASK_RCU_TREE_PREEMPT() \
+ INIT_TASK_RCU_BOOST()
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
- .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
+ INIT_PUSHABLE_TASKS(tsk) \
.ptraced = LIST_HEAD_INIT(tsk.ptraced), \
.ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
struct irqaction {
irq_handler_t handler;
unsigned long flags;
- const char *name;
void *dev_id;
struct irqaction *next;
int irq;
- struct proc_dir_entry *dir;
irq_handler_t thread_fn;
struct task_struct *thread;
unsigned long thread_flags;
-};
+ const char *name;
+ struct proc_dir_entry *dir;
+} ____cacheline_internodealigned_in_smp;
extern irqreturn_t no_action(int cpl, void *dev_id);
extern int arch_check_optimized_kprobe(struct optimized_kprobe *op);
extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op);
extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op);
-extern int arch_optimize_kprobe(struct optimized_kprobe *op);
+extern void arch_optimize_kprobes(struct list_head *oplist);
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list);
extern void arch_unoptimize_kprobe(struct optimized_kprobe *op);
extern kprobe_opcode_t *get_optinsn_slot(void);
extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty);
/* The size of the executable code in each section. */
unsigned int init_text_size, core_text_size;
+ /* Size of RO sections of the module (text+rodata) */
+ unsigned int init_ro_size, core_ro_size;
+
/* Arch-specific module values */
struct mod_arch_specific arch;
{
return 0;
}
-
#endif /* CONFIG_MODULES */
#ifdef CONFIG_SYSFS
#define __MODULE_STRING(x) __stringify(x)
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+extern void set_all_modules_text_rw(void);
+extern void set_all_modules_text_ro(void);
+#else
+static inline void set_all_modules_text_rw(void) { }
+static inline void set_all_modules_text_ro(void) { }
+#endif
#ifdef CONFIG_GENERIC_BUG
void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *,
extern void mutex_unlock(struct mutex *lock);
extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+#ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
+#define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
#endif
* may be used to reset the timeout - for code which intentionally
* disables interrupts for a long time. This call is stateless.
*/
-#ifdef ARCH_HAS_NMI_WATCHDOG
+#if defined(ARCH_HAS_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
#include <asm/nmi.h>
extern void touch_nmi_watchdog(void);
-extern void acpi_nmi_disable(void);
-extern void acpi_nmi_enable(void);
#else
-#ifndef CONFIG_HARDLOCKUP_DETECTOR
static inline void touch_nmi_watchdog(void)
{
touch_softlockup_watchdog();
}
-#else
-extern void touch_nmi_watchdog(void);
-#endif
-static inline void acpi_nmi_disable(void) { }
-static inline void acpi_nmi_enable(void) { }
#endif
/*
*/
precise_ip : 2, /* skid constraint */
mmap_data : 1, /* non-exec mmap data */
+ sample_id_all : 1, /* sample_type all events */
- __reserved_1 : 46;
+ __reserved_1 : 45;
union {
__u32 wakeup_events; /* wakeup every n events */
enum perf_event_type {
/*
+ * If perf_event_attr.sample_id_all is set then all event types will
+ * have the sample_type selected fields related to where/when
+ * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID)
+ * described in PERF_RECORD_SAMPLE below, it will be stashed just after
+ * the perf_event_header and the fields already present for the existing
+ * fields, i.e. at the end of the payload. That way a newer perf.data
+ * file will be supported by older perf tools, with these new optional
+ * fields being ignored.
+ *
* The MMAP events record the PROT_EXEC mappings so that we can
* correlate userspace IPs to code. They have the following structure:
*
struct pmu {
struct list_head entry;
+ struct device *dev;
+ char *name;
+ int type;
+
int * __percpu pmu_disable_count;
struct perf_cpu_context * __percpu pmu_cpu_context;
int task_ctx_nr;
u64 shadow_ctx_time;
struct perf_event_attr attr;
+ u16 header_size;
+ u16 id_header_size;
+ u16 read_size;
struct hw_perf_event hw;
struct perf_event_context *ctx;
#ifdef CONFIG_PERF_EVENTS
-extern int perf_pmu_register(struct pmu *pmu);
+extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);
extern int perf_num_counters(void);
struct perf_sample_data *data,
struct pt_regs *regs);
+static inline bool is_sampling_event(struct perf_event *event)
+{
+ return event->attr.sample_period != 0;
+}
+
/*
* Return 1 for a software event, 0 for a hardware event
*/
#define list_first_entry_rcu(ptr, type, member) \
list_entry_rcu((ptr)->next, type, member)
-#define __list_for_each_rcu(pos, head) \
- for (pos = rcu_dereference_raw(list_next_rcu(head)); \
- pos != (head); \
- pos = rcu_dereference_raw(list_next_rcu((pos)))
-
/**
* list_for_each_entry_rcu - iterate over rcu list of given type
* @pos: the type * to use as a loop cursor.
extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
+#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
extern void synchronize_sched(void);
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);
-extern void synchronize_sched_expedited(void);
extern int sched_expedited_torture_stats(char *page);
static inline void __rcu_read_lock_bh(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/* Internal to kernel */
-extern void rcu_init(void);
extern void rcu_sched_qs(int cpu);
extern void rcu_bh_qs(int cpu);
extern void rcu_check_callbacks(int cpu, int user);
#include <linux/cache.h>
-#define rcu_init_sched() do { } while (0)
+static inline void rcu_init(void)
+{
+}
#ifdef CONFIG_TINY_RCU
synchronize_sched();
}
+static inline void synchronize_sched_expedited(void)
+{
+ synchronize_sched();
+}
+
#ifdef CONFIG_TINY_RCU
static inline void rcu_preempt_note_context_switch(void)
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
extern int rcu_scheduler_active __read_mostly;
extern void rcu_scheduler_starting(void);
-
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
static inline void rcu_scheduler_starting(void)
{
}
-
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
#endif /* __LINUX_RCUTINY_H */
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H
+extern void rcu_init(void);
extern void rcu_note_context_switch(int cpu);
extern int rcu_needs_cpu(int cpu);
extern void rcu_cpu_stall_reset(void);
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
extern void synchronize_rcu_bh(void);
+extern void synchronize_sched_expedited(void);
extern void synchronize_rcu_expedited(void);
static inline void synchronize_rcu_bh_expedited(void)
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
extern int softlockup_thresh;
+void lockup_detector_init(void);
#else
static inline void touch_softlockup_watchdog(void)
{
static inline void touch_all_softlockup_watchdogs(void)
{
}
+static inline void lockup_detector_init(void)
+{
+}
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
spinlock_t lock;
};
+struct autogroup;
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
struct tty_struct *tty; /* NULL if no tty */
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rcu_boost_mutex;
+#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif
struct list_head tasks;
+#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
+#endif
struct mm_struct *mm, *active_mm;
#if defined(SPLIT_RSS_COUNTING)
#ifdef CONFIG_PREEMPT_RCU
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
-#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */
+#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */
static inline void rcu_copy_process(struct task_struct *p)
{
p->rcu_read_unlock_special = 0;
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
-#endif
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+ p->rcu_boost_mutex = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
extern void idle_task_exit(void);
#else
static inline void idle_task_exit(void) {}
#endif
-extern void sched_idle_next(void);
-
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
extern void wake_up_idle_cpu(int cpu);
#else
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
extern unsigned int sysctl_sched_child_runs_first;
enum sched_tunable_scaling {
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
extern unsigned int sysctl_sched_compat_yield;
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+
+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);
+#ifdef CONFIG_PROC_FS
+extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
+extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice);
+#endif
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
-extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
+extern int sched_setscheduler(struct task_struct *, int,
+ const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
- struct sched_param *);
+ const struct sched_param *);
extern struct task_struct *idle_task(int cpu);
extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
#define SFI_OEM_ID_SIZE 6
#define SFI_OEM_TABLE_ID_SIZE 8
+#define SFI_NAME_LEN 16
+
#define SFI_SYST_SEARCH_BEGIN 0x000E0000
#define SFI_SYST_SEARCH_END 0x000FFFFF
u16 addr;
u8 irq;
u32 max_freq;
- char name[16];
+ char name[SFI_NAME_LEN];
} __packed;
struct sfi_gpio_table_entry {
- char controller_name[16];
+ char controller_name[SFI_NAME_LEN];
u16 pin_no;
- char pin_name[16];
+ char pin_name[SFI_NAME_LEN];
} __packed;
typedef int (*sfi_table_handler) (struct sfi_table_header *table);
#define __LINUX_STACKTRACE_H
struct task_struct;
+struct pt_regs;
#ifdef CONFIG_STACKTRACE
struct task_struct;
};
extern void save_stack_trace(struct stack_trace *trace);
-extern void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp);
+extern void save_stack_trace_regs(struct stack_trace *trace,
+ struct pt_regs *regs);
extern void save_stack_trace_tsk(struct task_struct *tsk,
struct stack_trace *trace);
#define SYSCALL_TRACE_ENTER_EVENT(sname) \
static struct syscall_metadata \
__attribute__((__aligned__(4))) __syscall_meta_##sname; \
- static struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_enter_##sname; \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) \
.class = &event_class_syscall_enter, \
.event.funcs = &enter_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
- }
+ }; \
+ __TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
#define SYSCALL_TRACE_EXIT_EVENT(sname) \
static struct syscall_metadata \
__attribute__((__aligned__(4))) __syscall_meta_##sname; \
- static struct ftrace_event_call \
- __attribute__((__aligned__(4))) event_exit_##sname; \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) \
.class = &event_class_syscall_exit, \
.event.funcs = &exit_syscall_print_funcs, \
.data = (void *)&__syscall_meta_##sname,\
- }
+ }; \
+ __TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
#define SYSCALL_METADATA(sname, nb) \
SYSCALL_TRACE_ENTER_EVENT(sname); \
int slack;
#ifdef CONFIG_TIMER_STATS
+ int start_pid;
void *start_site;
char start_comm[16];
- int start_pid;
#endif
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#define __TIMER_LOCKDEP_MAP_INITIALIZER(_kn)
#endif
+/*
+ * Note that all tvec_bases are 2 byte aligned and lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB to
+ * indicate whether the timer is deferrable.
+ *
+ * A deferrable timer will work normally when the system is busy, but
+ * will not cause a CPU to come out of idle just to service it; instead,
+ * the timer will be serviced when the CPU eventually wakes up with a
+ * subsequent non-deferrable timer.
+ */
+#define TBASE_DEFERRABLE_FLAG (0x1)
+
#define TIMER_INITIALIZER(_function, _expires, _data) { \
.entry = { .prev = TIMER_ENTRY_STATIC }, \
.function = (_function), \
.expires = (_expires), \
.data = (_data), \
.base = &boot_tvec_bases, \
+ .slack = -1, \
+ __TIMER_LOCKDEP_MAP_INITIALIZER( \
+ __FILE__ ":" __stringify(__LINE__)) \
+ }
+
+#define TBASE_MAKE_DEFERRED(ptr) ((struct tvec_base *) \
+ ((unsigned char *)(ptr) + TBASE_DEFERRABLE_FLAG))
+
+#define TIMER_DEFERRED_INITIALIZER(_function, _expires, _data) {\
+ .entry = { .prev = TIMER_ENTRY_STATIC }, \
+ .function = (_function), \
+ .expires = (_expires), \
+ .data = (_data), \
+ .base = TBASE_MAKE_DEFERRED(&boot_tvec_bases), \
__TIMER_LOCKDEP_MAP_INITIALIZER( \
__FILE__ ":" __stringify(__LINE__)) \
}
extern void add_timer(struct timer_list *timer);
+extern int try_to_del_timer_sync(struct timer_list *timer);
+
#ifdef CONFIG_SMP
- extern int try_to_del_timer_sync(struct timer_list *timer);
extern int del_timer_sync(struct timer_list *timer);
#else
-# define try_to_del_timer_sync(t) del_timer(t)
# define del_timer_sync(t) del_timer(t)
#endif
--- /dev/null
+#ifndef _LINUX_TIMERQUEUE_H
+#define _LINUX_TIMERQUEUE_H
+
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+
+
+struct timerqueue_node {
+ struct rb_node node;
+ ktime_t expires;
+};
+
+struct timerqueue_head {
+ struct rb_root head;
+ struct timerqueue_node *next;
+};
+
+
+extern void timerqueue_add(struct timerqueue_head *head,
+ struct timerqueue_node *node);
+extern void timerqueue_del(struct timerqueue_head *head,
+ struct timerqueue_node *node);
+extern struct timerqueue_node *timerqueue_iterate_next(
+ struct timerqueue_node *node);
+
+/**
+ * timerqueue_getnext - Returns the timer with the earlies expiration time
+ *
+ * @head: head of timerqueue
+ *
+ * Returns a pointer to the timer node that has the
+ * earliest expiration time.
+ */
+static inline
+struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
+{
+ return head->next;
+}
+
+static inline void timerqueue_init(struct timerqueue_node *node)
+{
+ RB_CLEAR_NODE(&node->node);
+}
+
+static inline void timerqueue_init_head(struct timerqueue_head *head)
+{
+ head->head = RB_ROOT;
+ head->next = NULL;
+}
+#endif /* _LINUX_TIMERQUEUE_H */
#define TP_PROTO(args...) args
#define TP_ARGS(args...) args
+#define TP_CONDITION(args...) args
#ifdef CONFIG_TRACEPOINTS
* as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just
* "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto".
*/
-#define __DO_TRACE(tp, proto, args) \
+#define __DO_TRACE(tp, proto, args, cond) \
do { \
struct tracepoint_func *it_func_ptr; \
void *it_func; \
void *__data; \
\
+ if (!(cond)) \
+ return; \
rcu_read_lock_sched_notrace(); \
it_func_ptr = rcu_dereference_sched((tp)->funcs); \
if (it_func_ptr) { \
* not add unwanted padding between the beginning of the section and the
* structure. Force alignment to the same alignment as the section start.
*/
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
extern struct tracepoint __tracepoint_##name; \
static inline void trace_##name(proto) \
{ \
do_trace: \
__DO_TRACE(&__tracepoint_##name, \
TP_PROTO(data_proto), \
- TP_ARGS(data_args)); \
+ TP_ARGS(data_args), \
+ TP_CONDITION(cond)); \
} \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
EXPORT_SYMBOL(__tracepoint_##name)
#else /* !CONFIG_TRACEPOINTS */
-#define __DECLARE_TRACE(name, proto, args, data_proto, data_args) \
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
static inline void trace_##name(proto) \
{ } \
static inline int \
* "void *__data, proto" as the callback prototype.
*/
#define DECLARE_TRACE_NOARGS(name) \
- __DECLARE_TRACE(name, void, , void *__data, __data)
+ __DECLARE_TRACE(name, void, , 1, void *__data, __data)
#define DECLARE_TRACE(name, proto, args) \
- __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \
PARAMS(void *__data, proto), \
PARAMS(__data, args))
+#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \
+ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \
+ PARAMS(void *__data, proto), \
+ PARAMS(__data, args))
+
+#define TRACE_EVENT_FLAGS(event, flag)
+
#endif /* DECLARE_TRACE */
#ifndef TRACE_EVENT
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_CONDITION(template, name, proto, \
+ args, cond) \
+ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \
+ PARAMS(args), PARAMS(cond))
#define TRACE_EVENT(name, proto, args, struct, assign, print) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
#define TRACE_EVENT_FN(name, proto, args, struct, \
assign, print, reg, unreg) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define TRACE_EVENT_CONDITION(name, proto, args, cond, \
+ struct, assign, print) \
+ DECLARE_TRACE_CONDITION(name, PARAMS(proto), \
+ PARAMS(args), PARAMS(cond))
+
+#define TRACE_EVENT_FLAGS(event, flag)
#endif /* ifdef TRACE_EVENT (see note above) */
.timer = TIMER_INITIALIZER(NULL, 0, 0), \
}
+#define __DEFERRED_WORK_INITIALIZER(n, f) { \
+ .work = __WORK_INITIALIZER((n).work, (f)), \
+ .timer = TIMER_DEFERRED_INITIALIZER(NULL, 0, 0), \
+ }
+
#define DECLARE_WORK(n, f) \
struct work_struct n = __WORK_INITIALIZER(n, f)
#define DECLARE_DELAYED_WORK(n, f) \
struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f)
+#define DECLARE_DEFERRED_WORK(n, f) \
+ struct delayed_work n = __DEFERRED_WORK_INITIALIZER(n, f)
+
/*
* initialize a work item's function pointer
*/
#define WM8775_AIN3 4
#define WM8775_AIN4 8
-/* subdev group ID */
-#define WM8775_GID (1 << 0)
-
#endif
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DEFINE_TRACE(name)
+#undef TRACE_EVENT_CONDITION
+#define TRACE_EVENT_CONDITION(name, proto, args, cond, tstruct, assign, print) \
+ TRACE_EVENT(name, \
+ PARAMS(proto), \
+ PARAMS(args), \
+ PARAMS(tstruct), \
+ PARAMS(assign), \
+ PARAMS(print))
+
#undef TRACE_EVENT_FN
#define TRACE_EVENT_FN(name, proto, args, tstruct, \
assign, print, reg, unreg) \
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DEFINE_TRACE(name)
+#undef DEFINE_EVENT_CONDITION
+#define DEFINE_EVENT_CONDITION(template, name, proto, args, cond) \
+ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
#undef DECLARE_TRACE
#define DECLARE_TRACE(name, proto, args) \
DEFINE_TRACE(name)
#undef TRACE_EVENT
#undef TRACE_EVENT_FN
+#undef TRACE_EVENT_CONDITION
#undef DECLARE_EVENT_CLASS
#undef DEFINE_EVENT
#undef DEFINE_EVENT_PRINT
+#undef DEFINE_EVENT_CONDITION
#undef TRACE_HEADER_MULTI_READ
#undef DECLARE_TRACE
#include <linux/ktime.h>
#include <linux/tracepoint.h>
-#ifndef _TRACE_POWER_ENUM_
-#define _TRACE_POWER_ENUM_
-enum {
- POWER_NONE = 0,
- POWER_CSTATE = 1, /* C-State */
- POWER_PSTATE = 2, /* Fequency change or DVFS */
- POWER_SSTATE = 3, /* Suspend */
-};
+DECLARE_EVENT_CLASS(cpu,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ __field( u32, cpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->cpu_id = cpu_id;
+ ),
+
+ TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
+ (unsigned long)__entry->cpu_id)
+);
+
+DEFINE_EVENT(cpu, cpu_idle,
+
+ TP_PROTO(unsigned int state, unsigned int cpu_id),
+
+ TP_ARGS(state, cpu_id)
+);
+
+/* This file can get included multiple times, TRACE_HEADER_MULTI_READ at top */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING
+
+#define PWR_EVENT_EXIT -1
#endif
+DEFINE_EVENT(cpu, cpu_frequency,
+
+ TP_PROTO(unsigned int frequency, unsigned int cpu_id),
+
+ TP_ARGS(frequency, cpu_id)
+);
+
+TRACE_EVENT(machine_suspend,
+
+ TP_PROTO(unsigned int state),
+
+ TP_ARGS(state),
+
+ TP_STRUCT__entry(
+ __field( u32, state )
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ ),
+
+ TP_printk("state=%lu", (unsigned long)__entry->state)
+);
+
+/* This code will be removed after deprecation time exceeded (2.6.41) */
+#ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
+
/*
* The power events are used for cpuidle & suspend (power_start, power_end)
* and for cpufreq (power_frequency)
);
+/* Deprecated dummy functions must be protected against multi-declartion */
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#else /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
+#ifndef _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+#define _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED
+enum {
+ POWER_NONE = 0,
+ POWER_CSTATE = 1,
+ POWER_PSTATE = 2,
+};
+
+/* These dummy declaration have to be ripped out when the deprecated
+ events get removed */
+static inline void trace_power_start(u64 type, u64 state, u64 cpuid) {};
+static inline void trace_power_end(u64 cpuid) {};
+static inline void trace_power_frequency(u64 type, u64 state, u64 cpuid) {};
+#endif /* _PWR_EVENT_AVOID_DOUBLE_DEFINING_DEPRECATED */
+
+#endif /* CONFIG_EVENT_POWER_TRACING_DEPRECATED */
+
/*
* The clock events are used for clock enable/disable and for
* clock rate change
TP_ARGS(name, state, cpu_id)
);
-
#endif /* _TRACE_POWER_H */
/* This part must be outside protection */
syscall_regfunc, syscall_unregfunc
);
+TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
+
TRACE_EVENT_FN(sys_exit,
TP_PROTO(struct pt_regs *regs, long ret),
syscall_regfunc, syscall_unregfunc
);
+TRACE_EVENT_FLAGS(sys_exit, TRACE_EVENT_FL_CAP_ANY)
+
#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */
#endif /* _TRACE_EVENTS_SYSCALLS_H */
TRACE_EVENT(name, PARAMS(proto), PARAMS(args), \
PARAMS(tstruct), PARAMS(assign), PARAMS(print)) \
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(name, value) \
+ __TRACE_EVENT_FLAGS(name, value)
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#undef TRACE_EVENT_FLAGS
+#define TRACE_EVENT_FLAGS(event, flag)
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
/*
#undef __array
#define __array(type, item, len) \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ do { \
+ mutex_lock(&event_storage_mutex); \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ snprintf(event_storage, sizeof(event_storage), \
+ "%s[%d]", #type, len); \
+ ret = trace_define_field(event_call, event_storage, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), FILTER_OTHER); \
- if (ret) \
- return ret;
+ mutex_unlock(&event_storage_mutex); \
+ if (ret) \
+ return ret; \
+ } while (0);
#undef __dynamic_array
#define __dynamic_array(type, item, len) \
config RCU_TRACE
bool "Enable tracing for RCU"
- depends on TREE_RCU || TREE_PREEMPT_RCU
help
This option provides tracing in RCU which presents stats
in debugfs for debugging RCU implementation.
TREE_PREEMPT_RCU implementations, permitting Makefile to
trivially select kernel/rcutree_trace.c.
+config RCU_BOOST
+ bool "Enable RCU priority boosting"
+ depends on RT_MUTEXES && TINY_PREEMPT_RCU
+ default n
+ help
+ This option boosts the priority of preempted RCU readers that
+ block the current preemptible RCU grace period for too long.
+ This option also prevents heavy loads from blocking RCU
+ callback invocation for all flavors of RCU.
+
+ Say Y here if you are working with real-time apps or heavy loads
+ Say N here if you are unsure.
+
+config RCU_BOOST_PRIO
+ int "Real-time priority to boost RCU readers to"
+ range 1 99
+ depends on RCU_BOOST
+ default 1
+ help
+ This option specifies the real-time priority to which preempted
+ RCU readers are to be boosted. If you are working with CPU-bound
+ real-time applications, you should specify a priority higher then
+ the highest-priority CPU-bound application.
+
+ Specify the real-time priority, or take the default if unsure.
+
+config RCU_BOOST_DELAY
+ int "Milliseconds to delay boosting after RCU grace-period start"
+ range 0 3000
+ depends on RCU_BOOST
+ default 500
+ help
+ This option specifies the time to wait after the beginning of
+ a given grace period before priority-boosting preempted RCU
+ readers blocking that grace period. Note that any RCU reader
+ blocking an expedited RCU grace period is boosted immediately.
+
+ Accept the default if unsure.
+
+config SRCU_SYNCHRONIZE_DELAY
+ int "Microseconds to delay before waiting for readers"
+ range 0 20
+ default 10
+ help
+ This option controls how long SRCU delays before entering its
+ loop waiting on SRCU readers. The purpose of this loop is
+ to avoid the unconditional context-switch penalty that would
+ otherwise be incurred if there was an active SRCU reader,
+ in a manner similar to adaptive locking schemes. This should
+ be set to be a bit longer than the common-case SRCU read-side
+ critical-section overhead.
+
+ Accept the default if unsure.
+
endmenu # "RCU Subsystem"
config IKCONFIG
endif # NAMESPACES
+config SCHED_AUTOGROUP
+ bool "Automatic process group scheduling"
+ select EVENTFD
+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED
+ help
+ This option optimizes the scheduler for common desktop workloads by
+ automatically creating and populating task groups. This separation
+ of workloads isolates aggressive CPU burners (like build jobs) from
+ desktop applications. Task group autogeneration is currently based
+ upon task session.
+
config MM_OWNER
bool
*
* Returns the matching dev_t on success or 0 on failure.
*/
-static dev_t __init devt_from_partuuid(char *uuid_str)
+static dev_t devt_from_partuuid(char *uuid_str)
{
dev_t res = 0;
struct device *dev = NULL;
#include <linux/sfi.h>
#include <linux/shmem_fs.h>
#include <linux/slab.h>
+#include <linux/perf_event.h>
#include <asm/io.h>
#include <asm/bugs.h>
"enabled *very* early, fixing it\n");
local_irq_disable();
}
+ idr_init_cache();
+ perf_event_init();
rcu_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
enable_debug_pagealloc();
kmemleak_init();
debug_objects_mem_init();
- idr_init_cache();
setup_per_cpu_pageset();
numa_policy_init();
if (late_time_init)
smp_prepare_cpus(setup_max_cpus);
do_pre_smp_initcalls();
+ lockup_detector_init();
smp_init();
sched_init_smp();
}
struct take_cpu_down_param {
- struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
- unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
cpu_notify(CPU_DYING | param->mod, param->hcpu);
- if (task_cpu(param->caller) == cpu)
- move_task_off_dead_cpu(cpu, param->caller);
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
return 0;
}
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
- .caller = current,
.mod = mod,
.hcpu = hcpu,
};
}
BUG_ON(cpu_online(cpu));
- /* Wait for it to sleep (leaving idle task). */
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
while (!idle_cpu(cpu))
- yield();
+ cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
+ arch_disable_nonboot_cpus_begin();
printk("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
}
}
+ arch_disable_nonboot_cpus_end();
+
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (atomic_dec_and_test(&sig->sigcnt)) {
+ sched_autogroup_exit(sig);
free_signal_struct(sig);
+ }
}
void __put_task_struct(struct task_struct *tsk)
posix_cpu_timers_init_group(sig);
tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
}
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
- free_signal_struct(p->signal);
+ put_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+/*
+ * Futex flags used to encode options to functions and preserve them across
+ * restarts.
+ */
+#define FLAGS_SHARED 0x01
+#define FLAGS_CLOCKRT 0x02
+#define FLAGS_HAS_TIMEOUT 0x04
+
/*
* Priority Inheritance state:
*/
u32 bitset;
};
+static const struct futex_q futex_q_init = {
+ /* list gets initialized in queue_me()*/
+ .key = FUTEX_KEY_INIT,
+ .bitset = FUTEX_BITSET_MATCH_ANY
+};
+
/*
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
return 0;
}
-static inline
-void put_futex_key(int fshared, union futex_key *key)
+static inline void put_futex_key(union futex_key *key)
{
drop_futex_key_refs(key);
}
/*
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
-static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
}
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
}
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
double_unlock_hb(hb1, hb2);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
return ret;
}
/**
* futex_requeue() - Requeue waiters from uaddr1 to uaddr2
* @uaddr1: source futex user address
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @uaddr2: target futex user address
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
* @nr_requeue: number of waiters to requeue (0-INT_MAX)
* @cmpval: @uaddr1 expected value (or %NULL)
* @requeue_pi: if we are attempting to requeue from a non-pi futex to a
- * pi futex (pi to pi requeue is not supported)
+ * pi futex (pi to pi requeue is not supported)
*
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
* >=0 - on success, the number of tasks requeued or woken
* <0 - on error
*/
-static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
- int nr_wake, int nr_requeue, u32 *cmpval,
- int requeue_pi)
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
+ u32 *cmpval, int requeue_pi)
{
union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
int drop_count = 0, task_count = 0, ret;
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
if (ret)
goto out_put_keys;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
goto retry;
}
if (curval != *cmpval) {
break;
case -EFAULT:
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
case -EAGAIN:
/* The owner was exiting, try again. */
double_unlock_hb(hb1, hb2);
- put_futex_key(fshared, &key2);
- put_futex_key(fshared, &key1);
+ put_futex_key(&key2);
+ put_futex_key(&key1);
cond_resched();
goto retry;
default:
drop_futex_key_refs(&key1);
out_put_keys:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out_put_key1:
- put_futex_key(fshared, &key1);
+ put_futex_key(&key1);
out:
if (pi_state != NULL)
free_pi_state(pi_state);
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner, int fshared)
+ struct task_struct *newowner)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
goto retry;
}
-/*
- * In case we must use restart_block to restart a futex_wait,
- * we encode in the 'flags' shared capability
- */
-#define FLAGS_SHARED 0x01
-#define FLAGS_CLOCKRT 0x02
-#define FLAGS_HAS_TIMEOUT 0x04
-
static long futex_wait_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @uaddr: user address of the futex
- * @fshared: whether the futex is shared (1) or not (0)
* @q: futex_q (contains pi_state and access to the rt_mutex)
* @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
*
* 0 - success, lock not taken
* <0 - on error (-EFAULT)
*/
-static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
- int locked)
+static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
struct task_struct *owner;
int ret = 0;
* did a lock-steal - fix up the PI-state in that case:
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, current);
goto out;
}
* lock. Fix the state up.
*/
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+ ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
* futex_wait_setup() - Prepare to wait on a futex
* @uaddr: the futex userspace address
* @val: the expected value
- * @fshared: whether the futex is shared (1) or not (0)
+ * @flags: futex flags (FLAGS_SHARED, etc.)
* @q: the associated futex_q
* @hb: storage for hash_bucket pointer to be returned to caller
*
* 0 - uaddr contains val and hb has been locked
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
*/
-static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
{
u32 uval;
* rare, but normal.
*/
retry:
- q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
if (unlikely(ret != 0))
return ret;
if (ret)
goto out;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
goto retry;
}
out:
if (ret)
- put_futex_key(fshared, &q->key);
+ put_futex_key(&q->key);
return ret;
}
-static int futex_wait(u32 __user *uaddr, int fshared,
- u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
+ ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
-
- q.pi_state = NULL;
q.bitset = bitset;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
* Prepare to wait on uaddr. On success, holds hb lock and increments
* q.key refs.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out;
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = FLAGS_HAS_TIMEOUT;
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ restart->futex.flags = flags;
ret = -ERESTART_RESTARTBLOCK;
static long futex_wait_restart(struct restart_block *restart)
{
u32 __user *uaddr = restart->futex.uaddr;
- int fshared = 0;
ktime_t t, *tp = NULL;
if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
tp = &t;
}
restart->fn = do_no_restart_syscall;
- if (restart->futex.flags & FLAGS_SHARED)
- fshared = 1;
- return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
- restart->futex.bitset,
- restart->futex.flags & FLAGS_CLOCKRT);
+
+ return (long)futex_wait(uaddr, restart->futex.flags,
+ restart->futex.val, tp, restart->futex.bitset);
}
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int fshared,
- int detect, ktime_t *time, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
+ ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- struct futex_q q;
+ struct futex_q q = futex_q_init;
int res, ret;
if (refill_pi_state_cache())
hrtimer_set_expires(&to->timer, *time);
}
- q.pi_state = NULL;
- q.rt_waiter = NULL;
- q.requeue_pi_key = NULL;
retry:
- q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key);
if (unlikely(ret != 0))
goto out;
* exit to complete.
*/
queue_unlock(&q, hb);
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
cond_resched();
goto retry;
default:
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr, fshared, &q, !ret);
+ res = fixup_owner(uaddr, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it acquired
* the lock, clear our -ETIMEDOUT or -EINTR.
queue_unlock(&q, hb);
out_put_key:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out:
if (to)
destroy_hrtimer_on_stack(&to->timer);
if (ret)
goto out_put_key;
- if (!fshared)
+ if (!(flags & FLAGS_SHARED))
goto retry_private;
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
goto retry;
}
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr, int fshared)
+static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
if (unlikely(ret != 0))
goto out;
out_unlock:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
out:
return ret;
pi_faulted:
spin_unlock(&hb->lock);
- put_futex_key(fshared, &key);
+ put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
if (!ret)
/**
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
- * @fshared: whether the futexes are shared (1) or not (0). They must be
+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
* the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* 0 - On success
* <0 - On error
*/
-static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
- int clockrt, u32 __user *uaddr2)
+ u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
- union futex_key key2;
- struct futex_q q;
+ union futex_key key2 = FUTEX_KEY_INIT;
+ struct futex_q q = futex_q_init;
int res, ret;
if (!bitset)
if (abs_time) {
to = &timeout;
- hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
debug_rt_mutex_init_waiter(&rt_waiter);
rt_waiter.task = NULL;
- key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2);
if (unlikely(ret != 0))
goto out;
- q.pi_state = NULL;
q.bitset = bitset;
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
* Prepare to wait on uaddr. On success, increments q.key (key1) ref
* count.
*/
- ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
if (ret)
goto out_key2;
*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
- ret = fixup_pi_state_owner(uaddr2, &q, current,
- fshared);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
spin_unlock(q.lock_ptr);
}
} else {
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
- res = fixup_owner(uaddr2, fshared, &q, !ret);
+ res = fixup_owner(uaddr2, &q, !ret);
/*
* If fixup_owner() returned an error, proprogate that. If it
* acquired the lock, clear -ETIMEDOUT or -EINTR.
}
out_put_keys:
- put_futex_key(fshared, &q.key);
+ put_futex_key(&q.key);
out_key2:
- put_futex_key(fshared, &key2);
+ put_futex_key(&key2);
out:
if (to) {
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int clockrt, ret = -ENOSYS;
- int cmd = op & FUTEX_CMD_MASK;
- int fshared = 0;
+ int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK;
+ unsigned int flags = 0;
if (!(op & FUTEX_PRIVATE_FLAG))
- fshared = 1;
+ flags |= FLAGS_SHARED;
- clockrt = op & FUTEX_CLOCK_REALTIME;
- if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
- return -ENOSYS;
+ if (op & FUTEX_CLOCK_REALTIME) {
+ flags |= FLAGS_CLOCKRT;
+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ return -ENOSYS;
+ }
switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAIT_BITSET:
- ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
+ ret = futex_wait(uaddr, flags, val, timeout, val3);
break;
case FUTEX_WAKE:
val3 = FUTEX_BITSET_MATCH_ANY;
case FUTEX_WAKE_BITSET:
- ret = futex_wake(uaddr, fshared, val, val3);
+ ret = futex_wake(uaddr, flags, val, val3);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 0);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_unlock_pi(uaddr, fshared);
+ ret = futex_unlock_pi(uaddr, flags);
break;
case FUTEX_TRYLOCK_PI:
if (futex_cmpxchg_enabled)
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
break;
case FUTEX_WAIT_REQUEUE_PI:
val3 = FUTEX_BITSET_MATCH_ANY;
- ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
- clockrt, uaddr2);
+ ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
+ uaddr2);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
- 1);
+ ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
break;
default:
ret = -ENOSYS;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
+ struct timerqueue_node *next;
- if (!base->first)
+ next = timerqueue_getnext(&base->active);
+ if (!next)
continue;
- timer = rb_entry(base->first, struct hrtimer, node);
+ timer = container_of(next, struct hrtimer, node);
+
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
/*
* clock_was_set() has changed base->offset so the
static int enqueue_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base)
{
- struct rb_node **link = &base->active.rb_node;
- struct rb_node *parent = NULL;
- struct hrtimer *entry;
- int leftmost = 1;
-
debug_activate(timer);
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct hrtimer, node);
- /*
- * We dont care about collisions. Nodes with
- * the same expiry time stay together.
- */
- if (hrtimer_get_expires_tv64(timer) <
- hrtimer_get_expires_tv64(entry)) {
- link = &(*link)->rb_left;
- } else {
- link = &(*link)->rb_right;
- leftmost = 0;
- }
- }
-
- /*
- * Insert the timer to the rbtree and check whether it
- * replaces the first pending timer
- */
- if (leftmost)
- base->first = &timer->node;
+ timerqueue_add(&base->active, &timer->node);
- rb_link_node(&timer->node, parent, link);
- rb_insert_color(&timer->node, &base->active);
/*
* HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
* state of a possibly running callback.
*/
timer->state |= HRTIMER_STATE_ENQUEUED;
- return leftmost;
+ return (&timer->node == base->active.next);
}
/*
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
goto out;
- /*
- * Remove the timer from the rbtree and replace the first
- * entry pointer if necessary.
- */
- if (base->first == &timer->node) {
- base->first = rb_next(&timer->node);
+ if (&timer->node == timerqueue_getnext(&base->active)) {
#ifdef CONFIG_HIGH_RES_TIMERS
/* Reprogram the clock event device. if enabled */
if (reprogram && hrtimer_hres_active()) {
}
#endif
}
- rb_erase(&timer->node, &base->active);
+ timerqueue_del(&base->active, &timer->node);
out:
timer->state = newstate;
}
if (!hrtimer_hres_active()) {
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
struct hrtimer *timer;
+ struct timerqueue_node *next;
- if (!base->first)
+ next = timerqueue_getnext(&base->active);
+ if (!next)
continue;
- timer = rb_entry(base->first, struct hrtimer, node);
+ timer = container_of(next, struct hrtimer, node);
delta.tv64 = hrtimer_get_expires_tv64(timer);
delta = ktime_sub(delta, base->get_time());
if (delta.tv64 < mindelta.tv64)
timer->base = &cpu_base->clock_base[clock_id];
hrtimer_init_timer_hres(timer);
+ timerqueue_init(&timer->node);
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
ktime_t basenow;
- struct rb_node *node;
+ struct timerqueue_node *node;
basenow = ktime_add(now, base->offset);
- while ((node = base->first)) {
+ while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
- timer = rb_entry(node, struct hrtimer, node);
+ timer = container_of(node, struct hrtimer, node);
/*
* The immediate goal for using the softexpires is
*/
void hrtimer_run_queues(void)
{
- struct rb_node *node;
+ struct timerqueue_node *node;
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
int index, gettime = 1;
for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
base = &cpu_base->clock_base[index];
-
- if (!base->first)
+ if (!timerqueue_getnext(&base->active))
continue;
if (gettime) {
raw_spin_lock(&cpu_base->lock);
- while ((node = base->first)) {
+ while ((node = timerqueue_getnext(&base->active))) {
struct hrtimer *timer;
- timer = rb_entry(node, struct hrtimer, node);
+ timer = container_of(node, struct hrtimer, node);
if (base->softirq_time.tv64 <=
hrtimer_get_expires_tv64(timer))
break;
raw_spin_lock_init(&cpu_base->lock);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
+ timerqueue_init_head(&cpu_base->clock_base[i].active);
+ }
hrtimer_init_hres(cpu_base);
}
struct hrtimer_clock_base *new_base)
{
struct hrtimer *timer;
- struct rb_node *node;
+ struct timerqueue_node *node;
- while ((node = rb_first(&old_base->active))) {
- timer = rb_entry(node, struct hrtimer, node);
+ while ((node = timerqueue_getnext(&old_base->active))) {
+ timer = container_of(node, struct hrtimer, node);
BUG_ON(hrtimer_callback_running(timer));
debug_deactivate(timer);
constraints_initialized = 1;
- perf_pmu_register(&perf_breakpoint);
+ perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
*/
static int irq_thread(void *data)
{
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+ static struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
int wake, oneshot = desc->status & IRQ_ONESHOT;
return p->pre_handler == aggr_pre_handler;
}
+/* Return true(!0) if the kprobe is unused */
+static inline int kprobe_unused(struct kprobe *p)
+{
+ return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
+ list_empty(&p->list);
+}
+
/*
* Keep all fields in the kprobe consistent
*/
-static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
- memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
- memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+ memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}
#ifdef CONFIG_OPTPROBES
}
}
+/* Free optimized instructions and optimized_kprobe */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ arch_remove_optimized_kprobe(op);
+ arch_remove_kprobe(p);
+ kfree(op);
+}
+
/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
return 0;
}
+/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
+static inline int kprobe_disarmed(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ /* If kprobe is not aggr/opt probe, just return kprobe is disabled */
+ if (!kprobe_aggrprobe(p))
+ return kprobe_disabled(p);
+
+ op = container_of(p, struct optimized_kprobe, kp);
+
+ return kprobe_disabled(p) && list_empty(&op->list);
+}
+
+/* Return true(!0) if the probe is queued on (un)optimizing lists */
+static int __kprobes kprobe_queued(struct kprobe *p)
+{
+ struct optimized_kprobe *op;
+
+ if (kprobe_aggrprobe(p)) {
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!list_empty(&op->list))
+ return 1;
+ }
+ return 0;
+}
+
/*
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
+static LIST_HEAD(unoptimizing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
+static DECLARE_COMPLETION(optimizer_comp);
#define OPTIMIZE_DELAY 5
-/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+/*
+ * Optimize (replace a breakpoint with a jump) kprobes listed on
+ * optimizing_list.
+ */
+static __kprobes void do_optimize_kprobes(void)
{
- struct optimized_kprobe *op, *tmp;
-
- /* Lock modules while optimizing kprobes */
- mutex_lock(&module_mutex);
- mutex_lock(&kprobe_mutex);
- if (kprobes_all_disarmed || !kprobes_allow_optimization)
- goto end;
-
- /*
- * Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
- */
- synchronize_sched();
+ /* Optimization never be done when disarmed */
+ if (kprobes_all_disarmed || !kprobes_allow_optimization ||
+ list_empty(&optimizing_list))
+ return;
/*
* The optimization/unoptimization refers online_cpus via
*/
get_online_cpus();
mutex_lock(&text_mutex);
- list_for_each_entry_safe(op, tmp, &optimizing_list, list) {
- WARN_ON(kprobe_disabled(&op->kp));
- if (arch_optimize_kprobe(op) < 0)
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- list_del_init(&op->list);
+ arch_optimize_kprobes(&optimizing_list);
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+}
+
+/*
+ * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
+ * if need) kprobes listed on unoptimizing_list.
+ */
+static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ /* Unoptimization must be done anytime */
+ if (list_empty(&unoptimizing_list))
+ return;
+
+ /* Ditto to do_optimize_kprobes */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+ arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+ /* Loop free_list for disarming */
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ /* Disarm probes if marked disabled */
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
+ if (kprobe_unused(&op->kp)) {
+ /*
+ * Remove unused probes from hash list. After waiting
+ * for synchronization, these probes are reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes.)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ } else
+ list_del_init(&op->list);
}
mutex_unlock(&text_mutex);
put_online_cpus();
-end:
+}
+
+/* Reclaim all kprobes on the free_list */
+static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+{
+ struct optimized_kprobe *op, *tmp;
+
+ list_for_each_entry_safe(op, tmp, free_list, list) {
+ BUG_ON(!kprobe_unused(&op->kp));
+ list_del_init(&op->list);
+ free_aggr_kprobe(&op->kp);
+ }
+}
+
+/* Start optimizer after OPTIMIZE_DELAY passed */
+static __kprobes void kick_kprobe_optimizer(void)
+{
+ if (!delayed_work_pending(&optimizing_work))
+ schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+}
+
+/* Kprobe jump optimizer */
+static __kprobes void kprobe_optimizer(struct work_struct *work)
+{
+ LIST_HEAD(free_list);
+
+ /* Lock modules while optimizing kprobes */
+ mutex_lock(&module_mutex);
+ mutex_lock(&kprobe_mutex);
+
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiesence period.
+ */
+ do_unoptimize_kprobes(&free_list);
+
+ /*
+ * Step 2: Wait for quiesence period to ensure all running interrupts
+ * are done. Because optprobe may modify multiple instructions
+ * there is a chance that Nth instruction is interrupted. In that
+ * case, running interrupt can return to 2nd-Nth byte of jump
+ * instruction. This wait is for avoiding it.
+ */
+ synchronize_sched();
+
+ /* Step 3: Optimize kprobes after quiesence period */
+ do_optimize_kprobes();
+
+ /* Step 4: Free cleaned kprobes after quiesence period */
+ do_free_cleaned_kprobes(&free_list);
+
mutex_unlock(&kprobe_mutex);
mutex_unlock(&module_mutex);
+
+ /* Step 5: Kick optimizer again if needed */
+ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
+ kick_kprobe_optimizer();
+ else
+ /* Wake up all waiters */
+ complete_all(&optimizer_comp);
+}
+
+/* Wait for completing optimization and unoptimization */
+static __kprobes void wait_for_kprobe_optimizer(void)
+{
+ if (delayed_work_pending(&optimizing_work))
+ wait_for_completion(&optimizer_comp);
}
/* Optimize kprobe if p is ready to be optimized */
/* Check if it is already optimized. */
if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
return;
-
op->kp.flags |= KPROBE_FLAG_OPTIMIZED;
- list_add(&op->list, &optimizing_list);
- if (!delayed_work_pending(&optimizing_work))
- schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
+
+ if (!list_empty(&op->list))
+ /* This is under unoptimizing. Just dequeue the probe */
+ list_del_init(&op->list);
+ else {
+ list_add(&op->list, &optimizing_list);
+ kick_kprobe_optimizer();
+ }
+}
+
+/* Short cut to direct unoptimizing */
+static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+ get_online_cpus();
+ arch_unoptimize_kprobe(op);
+ put_online_cpus();
+ if (kprobe_disabled(&op->kp))
+ arch_disarm_kprobe(&op->kp);
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p)
+static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
- if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) {
- op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list))
- /* Dequeue from the optimization queue */
+ if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
+ return; /* This is not an optprobe nor optimized */
+
+ op = container_of(p, struct optimized_kprobe, kp);
+ if (!kprobe_optimized(p)) {
+ /* Unoptimized or unoptimizing case */
+ if (force && !list_empty(&op->list)) {
+ /*
+ * Only if this is unoptimizing kprobe and forced,
+ * forcibly unoptimize it. (No need to unoptimize
+ * unoptimized kprobe again :)
+ */
list_del_init(&op->list);
- else
- /* Replace jump with break */
- arch_unoptimize_kprobe(op);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ force_unoptimize_kprobe(op);
+ }
+ return;
+ }
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ if (!list_empty(&op->list)) {
+ /* Dequeue from the optimization queue */
+ list_del_init(&op->list);
+ return;
+ }
+ /* Optimized kprobe case */
+ if (force)
+ /* Forcibly update the code: this is a special case */
+ force_unoptimize_kprobe(op);
+ else {
+ list_add(&op->list, &unoptimizing_list);
+ kick_kprobe_optimizer();
}
}
+/* Cancel unoptimizing for reusing */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ struct optimized_kprobe *op;
+
+ BUG_ON(!kprobe_unused(ap));
+ /*
+ * Unused kprobe MUST be on the way of delayed unoptimizing (means
+ * there is still a relative jump) and disabled.
+ */
+ op = container_of(ap, struct optimized_kprobe, kp);
+ if (unlikely(list_empty(&op->list)))
+ printk(KERN_WARNING "Warning: found a stray unused "
+ "aggrprobe@%p\n", ap->addr);
+ /* Enable the probe again */
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ /* Optimize it again (remove from op->list) */
+ BUG_ON(!kprobe_optready(ap));
+ optimize_kprobe(ap);
+}
+
/* Remove optimized instructions */
static void __kprobes kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- if (!list_empty(&op->list)) {
- /* Dequeue from the optimization queue */
+ if (!list_empty(&op->list))
+ /* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
- op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- }
- /* Don't unoptimize, because the target code will be freed. */
+
+ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+ /* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
arch_prepare_optimized_kprobe(op);
}
-/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
-{
- struct optimized_kprobe *op;
-
- op = container_of(p, struct optimized_kprobe, kp);
- arch_remove_optimized_kprobe(op);
- kfree(op);
-}
-
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
- free_aggr_kprobe(ap);
+ arch_remove_optimized_kprobe(op);
+ kfree(op);
return;
}
return;
kprobes_allow_optimization = false;
- printk(KERN_INFO "Kprobes globally unoptimized\n");
- get_online_cpus(); /* For avoiding text_mutex deadlock */
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!kprobe_disabled(p))
- unoptimize_kprobe(p);
+ unoptimize_kprobe(p, false);
}
}
-
- mutex_unlock(&text_mutex);
- put_online_cpus();
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
+ /* Wait for unoptimizing completion */
+ wait_for_kprobe_optimizer();
+ printk(KERN_INFO "Kprobes globally unoptimized\n");
}
int sysctl_kprobes_optimization;
}
#endif /* CONFIG_SYSCTL */
+/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __kprobes __arm_kprobe(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
/* Check collision with other optimized kprobes */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p))
+ /* Fallback to unoptimized kprobe */
+ unoptimize_kprobe(_p, true);
arch_arm_kprobe(p);
optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */
}
-static void __kprobes __disarm_kprobe(struct kprobe *p)
+/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
{
- struct kprobe *old_p;
+ struct kprobe *_p;
- unoptimize_kprobe(p); /* Try to unoptimize */
- arch_disarm_kprobe(p);
+ unoptimize_kprobe(p, false); /* Try to unoptimize */
- /* If another kprobe was blocked, optimize it. */
- old_p = get_optimized_kprobe((unsigned long)p->addr);
- if (unlikely(old_p))
- optimize_kprobe(old_p);
+ if (!kprobe_queued(p)) {
+ arch_disarm_kprobe(p);
+ /* If another kprobe was blocked, optimize it. */
+ _p = get_optimized_kprobe((unsigned long)p->addr);
+ if (unlikely(_p) && reopt)
+ optimize_kprobe(_p);
+ }
+ /* TODO: reoptimize others after unoptimized this probe */
}
#else /* !CONFIG_OPTPROBES */
#define optimize_kprobe(p) do {} while (0)
-#define unoptimize_kprobe(p) do {} while (0)
+#define unoptimize_kprobe(p, f) do {} while (0)
#define kill_optimized_kprobe(p) do {} while (0)
#define prepare_optimized_kprobe(p) do {} while (0)
#define try_to_optimize_kprobe(p) do {} while (0)
#define __arm_kprobe(p) arch_arm_kprobe(p)
-#define __disarm_kprobe(p) arch_disarm_kprobe(p)
+#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
+#define kprobe_disarmed(p) kprobe_disabled(p)
+#define wait_for_kprobe_optimizer() do {} while (0)
+
+/* There should be no unused kprobes can be reused without optimization */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+ printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+ BUG_ON(kprobe_unused(ap));
+}
static __kprobes void free_aggr_kprobe(struct kprobe *p)
{
+ arch_remove_kprobe(p);
kfree(p);
}
/* Disarm a kprobe with text_mutex */
static void __kprobes disarm_kprobe(struct kprobe *kp)
{
- get_online_cpus(); /* For avoiding text_mutex deadlock */
+ /* Ditto */
mutex_lock(&text_mutex);
- __disarm_kprobe(kp);
+ __disarm_kprobe(kp, true);
mutex_unlock(&text_mutex);
- put_online_cpus();
}
/*
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
if (p->break_handler || p->post_handler)
- unoptimize_kprobe(ap); /* Fall back to normal kprobe */
+ unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */
if (p->break_handler) {
if (ap->break_handler)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
struct kprobe *p)
{
int ret = 0;
- struct kprobe *ap = old_p;
+ struct kprobe *ap = orig_p;
- if (!kprobe_aggrprobe(old_p)) {
- /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */
- ap = alloc_aggr_kprobe(old_p);
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+ ap = alloc_aggr_kprobe(orig_p);
if (!ap)
return -ENOMEM;
- init_aggr_kprobe(ap, old_p);
- }
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap))
+ /* This probe is going to die. Rescue it */
+ reuse_unused_kprobe(ap);
if (kprobe_gone(ap)) {
/*
return add_new_kprobe(ap, p);
}
-/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
-static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
-{
- struct kprobe *kp;
-
- list_for_each_entry_rcu(kp, &p->list, list) {
- if (!kprobe_disabled(kp))
- /*
- * There is an active probe on the list.
- * We can't disable aggr_kprobe.
- */
- return 0;
- }
- p->flags |= KPROBE_FLAG_DISABLED;
- return 1;
-}
-
static int __kprobes in_kprobes_functions(unsigned long addr)
{
struct kprobe_blackpoint *kb;
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = get_kprobe(p->addr);
- if (unlikely(!old_p))
+ ap = get_kprobe(p->addr);
+ if (unlikely(!ap))
return NULL;
- if (p != old_p) {
- list_for_each_entry_rcu(list_p, &old_p->list, list)
+ if (p != ap) {
+ list_for_each_entry_rcu(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
return NULL;
}
valid:
- return old_p;
+ return ap;
}
/* Return error if the kprobe is being re-registered */
static inline int check_kprobe_rereg(struct kprobe *p)
{
int ret = 0;
- struct kprobe *old_p;
mutex_lock(&kprobe_mutex);
- old_p = __get_valid_kprobe(p);
- if (old_p)
+ if (__get_valid_kprobe(p))
ret = -EINVAL;
mutex_unlock(&kprobe_mutex);
+
return ret;
}
}
EXPORT_SYMBOL_GPL(register_kprobe);
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+ struct kprobe *kp;
+
+ list_for_each_entry_rcu(kp, &ap->list, list)
+ if (!kprobe_disabled(kp))
+ /*
+ * There is an active probe on the list.
+ * We can't disable this ap.
+ */
+ return 0;
+
+ return 1;
+}
+
+/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+ struct kprobe *orig_p;
+
+ /* Get an original kprobe for return */
+ orig_p = __get_valid_kprobe(p);
+ if (unlikely(orig_p == NULL))
+ return NULL;
+
+ if (!kprobe_disabled(p)) {
+ /* Disable probe if it is a child probe */
+ if (p != orig_p)
+ p->flags |= KPROBE_FLAG_DISABLED;
+
+ /* Try to disarm and disable this/parent probe */
+ if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+ disarm_kprobe(orig_p);
+ orig_p->flags |= KPROBE_FLAG_DISABLED;
+ }
+ }
+
+ return orig_p;
+}
+
/*
* Unregister a kprobe without a scheduler synchronization.
*/
static int __kprobes __unregister_kprobe_top(struct kprobe *p)
{
- struct kprobe *old_p, *list_p;
+ struct kprobe *ap, *list_p;
- old_p = __get_valid_kprobe(p);
- if (old_p == NULL)
+ /* Disable kprobe. This will disarm it if needed. */
+ ap = __disable_kprobe(p);
+ if (ap == NULL)
return -EINVAL;
- if (old_p == p ||
- (kprobe_aggrprobe(old_p) &&
- list_is_singular(&old_p->list))) {
+ if (ap == p)
/*
- * Only probe on the hash list. Disarm only if kprobes are
- * enabled and not gone - otherwise, the breakpoint would
- * already have been removed. We save on flushing icache.
+ * This probe is an independent(and non-optimized) kprobe
+ * (not an aggrprobe). Remove from the hash list.
*/
- if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- hlist_del_rcu(&old_p->hlist);
- } else {
+ goto disarmed;
+
+ /* Following process expects this probe is an aggrprobe */
+ WARN_ON(!kprobe_aggrprobe(ap));
+
+ if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * !disarmed could be happen if the probe is under delayed
+ * unoptimizing.
+ */
+ goto disarmed;
+ else {
+ /* If disabling probe has special handlers, update aggrprobe */
if (p->break_handler && !kprobe_gone(p))
- old_p->break_handler = NULL;
+ ap->break_handler = NULL;
if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &old_p->list, list) {
+ list_for_each_entry_rcu(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- old_p->post_handler = NULL;
+ ap->post_handler = NULL;
}
noclean:
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
list_del_rcu(&p->list);
- if (!kprobe_disabled(old_p)) {
- try_to_disable_aggr_kprobe(old_p);
- if (!kprobes_all_disarmed) {
- if (kprobe_disabled(old_p))
- disarm_kprobe(old_p);
- else
- /* Try to optimize this probe again */
- optimize_kprobe(old_p);
- }
- }
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+ /*
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
+ */
+ optimize_kprobe(ap);
}
return 0;
+
+disarmed:
+ BUG_ON(!kprobe_disarmed(ap));
+ hlist_del_rcu(&ap->hlist);
+ return 0;
}
static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
{
- struct kprobe *old_p;
+ struct kprobe *ap;
if (list_empty(&p->list))
+ /* This is an independent kprobe */
arch_remove_kprobe(p);
else if (list_is_singular(&p->list)) {
- /* "p" is the last child of an aggr_kprobe */
- old_p = list_entry(p->list.next, struct kprobe, list);
+ /* This is the last child of an aggrprobe */
+ ap = list_entry(p->list.next, struct kprobe, list);
list_del(&p->list);
- arch_remove_kprobe(old_p);
- free_aggr_kprobe(old_p);
+ free_aggr_kprobe(ap);
}
+ /* Otherwise, do nothing. */
}
int __kprobes register_kprobes(struct kprobe **kps, int num)
int __kprobes disable_kprobe(struct kprobe *kp)
{
int ret = 0;
- struct kprobe *p;
mutex_lock(&kprobe_mutex);
- /* Check whether specified probe is valid. */
- p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
+ /* Disable this kprobe */
+ if (__disable_kprobe(kp) == NULL)
ret = -EINVAL;
- goto out;
- }
- /* If the probe is already disabled (or gone), just return */
- if (kprobe_disabled(kp))
- goto out;
-
- kp->flags |= KPROBE_FLAG_DISABLED;
- if (p != kp)
- /* When kp != p, p is always enabled. */
- try_to_disable_aggr_kprobe(p);
-
- if (!kprobes_all_disarmed && kprobe_disabled(p))
- disarm_kprobe(p);
-out:
mutex_unlock(&kprobe_mutex);
return ret;
}
mutex_lock(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed)
- goto already_disabled;
+ if (kprobes_all_disarmed) {
+ mutex_unlock(&kprobe_mutex);
+ return;
+ }
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
- /*
- * Here we call get_online_cpus() for avoiding text_mutex deadlock,
- * because disarming may also unoptimize kprobes.
- */
- get_online_cpus();
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- __disarm_kprobe(p);
+ __disarm_kprobe(p, false);
}
}
-
mutex_unlock(&text_mutex);
- put_online_cpus();
mutex_unlock(&kprobe_mutex);
- /* Allow all currently running kprobes to complete */
- synchronize_sched();
- return;
-already_disabled:
- mutex_unlock(&kprobe_mutex);
- return;
+ /* Wait for disarming all kprobes by optimizer */
+ wait_for_kprobe_optimizer();
}
/*
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
- struct sched_param param = { .sched_priority = 0 };
+ static struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
namelen += 2;
for (i = 0; i < LOCKSTAT_POINTS; i++) {
- char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contention_point[i] == 0)
if (!i)
seq_line(m, '-', 40-namelen, namelen);
- sprint_symbol(sym, class->contention_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contention_point[i]);
- seq_printf(m, "%40s %14lu %29s %s\n", name,
- stats->contention_point[i],
- ip, sym);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contention_point[i],
+ ip, (void *)class->contention_point[i]);
}
for (i = 0; i < LOCKSTAT_POINTS; i++) {
- char sym[KSYM_SYMBOL_LEN];
char ip[32];
if (class->contending_point[i] == 0)
if (!i)
seq_line(m, '-', 40-namelen, namelen);
- sprint_symbol(sym, class->contending_point[i]);
snprintf(ip, sizeof(ip), "[<%p>]",
(void *)class->contending_point[i]);
- seq_printf(m, "%40s %14lu %29s %s\n", name,
- stats->contending_point[i],
- ip, sym);
+ seq_printf(m, "%40s %14lu %29s %pS\n",
+ name, stats->contending_point[i],
+ ip, (void *)class->contending_point[i]);
}
if (i) {
seq_puts(m, "\n");
#include <linux/percpu.h>
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
+#include <linux/pfn.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
#define ARCH_SHF_SMALL 0
#endif
+/*
+ * Modules' sections will be aligned on page boundaries
+ * to ensure complete separation of code and data, but
+ * only when CONFIG_DEBUG_SET_MODULE_RONX=y
+ */
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+# define debug_align(X) (X)
+#endif
+
+/*
+ * Given BASE and SIZE this macro calculates the number of pages the
+ * memory regions occupies
+ */
+#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \
+ (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \
+ PFN_DOWN((unsigned long)BASE) + 1) \
+ : (0UL))
+
/* If this is set, the section belongs in the init part of the module */
#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
return 0;
}
+#ifdef CONFIG_DEBUG_SET_MODULE_RONX
+/*
+ * LKM RO/NX protection: protect module's text/ro-data
+ * from modification and any data from execution.
+ */
+void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages))
+{
+ unsigned long begin_pfn = PFN_DOWN((unsigned long)start);
+ unsigned long end_pfn = PFN_DOWN((unsigned long)end);
+
+ if (end_pfn > begin_pfn)
+ set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+}
+
+static void set_section_ro_nx(void *base,
+ unsigned long text_size,
+ unsigned long ro_size,
+ unsigned long total_size)
+{
+ /* begin and end PFNs of the current subsection */
+ unsigned long begin_pfn;
+ unsigned long end_pfn;
+
+ /*
+ * Set RO for module text and RO-data:
+ * - Always protect first page.
+ * - Do not protect last partial page.
+ */
+ if (ro_size > 0)
+ set_page_attributes(base, base + ro_size, set_memory_ro);
+
+ /*
+ * Set NX permissions for module data:
+ * - Do not protect first partial page.
+ * - Always protect last page.
+ */
+ if (total_size > text_size) {
+ begin_pfn = PFN_UP((unsigned long)base + text_size);
+ end_pfn = PFN_UP((unsigned long)base + total_size);
+ if (end_pfn > begin_pfn)
+ set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn);
+ }
+}
+
+/* Setting memory back to RW+NX before releasing it */
+void unset_section_ro_nx(struct module *mod, void *module_region)
+{
+ unsigned long total_pages;
+
+ if (mod->module_core == module_region) {
+ /* Set core as NX+RW */
+ total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
+ set_memory_nx((unsigned long)mod->module_core, total_pages);
+ set_memory_rw((unsigned long)mod->module_core, total_pages);
+
+ } else if (mod->module_init == module_region) {
+ /* Set init as NX+RW */
+ total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
+ set_memory_nx((unsigned long)mod->module_init, total_pages);
+ set_memory_rw((unsigned long)mod->module_init, total_pages);
+ }
+}
+
+/* Iterate through all modules and set each module's text as RW */
+void set_all_modules_text_rw()
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_rw);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_rw);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+
+/* Iterate through all modules and set each module's text as RO */
+void set_all_modules_text_ro()
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if ((mod->module_core) && (mod->core_text_size)) {
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_text_size,
+ set_memory_ro);
+ }
+ if ((mod->module_init) && (mod->init_text_size)) {
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_text_size,
+ set_memory_ro);
+ }
+ }
+ mutex_unlock(&module_mutex);
+}
+#else
+static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
+static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
+#endif
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
destroy_params(mod->kp, mod->num_kp);
/* This may be NULL, but that's OK */
+ unset_section_ro_nx(mod, mod->module_init);
module_free(mod, mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
+ unset_section_ro_nx(mod, mod->module_core);
module_free(mod, mod->module_core);
#ifdef CONFIG_MPU
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
DEBUGP("\t%s\n", name);
}
- if (m == 0)
+ switch (m) {
+ case 0: /* executable */
+ mod->core_size = debug_align(mod->core_size);
mod->core_text_size = mod->core_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->core_size = debug_align(mod->core_size);
+ mod->core_ro_size = mod->core_size;
+ break;
+ case 3: /* whole core */
+ mod->core_size = debug_align(mod->core_size);
+ break;
+ }
}
DEBUGP("Init section allocation order:\n");
| INIT_OFFSET_MASK);
DEBUGP("\t%s\n", sname);
}
- if (m == 0)
+ switch (m) {
+ case 0: /* executable */
+ mod->init_size = debug_align(mod->init_size);
mod->init_text_size = mod->init_size;
+ break;
+ case 1: /* RO: text and ro-data */
+ mod->init_size = debug_align(mod->init_size);
+ mod->init_ro_size = mod->init_size;
+ break;
+ case 3: /* whole init */
+ mod->init_size = debug_align(mod->init_size);
+ break;
+ }
}
}
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
+ unset_section_ro_nx(mod, mod->module_init);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax();
+ arch_mutex_cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
+#include <linux/reboot.h>
#include <linux/vmstat.h>
+#include <linux/device.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
}
}
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ /*
+ * only top level events have the pid namespace they were created in
+ */
+ if (event->parent)
+ event = event->parent;
+
+ return task_pid_nr_ns(p, event->ns);
+}
+
/*
* If we inherit events we want to return the parent event id
* to userspace.
ctx->nr_stat++;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+ int entry = sizeof(u64); /* value */
+ int size = 0;
+ int nr = 1;
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ size += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_ID)
+ entry += sizeof(u64);
+
+ if (event->attr.read_format & PERF_FORMAT_GROUP) {
+ nr += event->group_leader->nr_siblings;
+ size += sizeof(u64);
+ }
+
+ size += entry * nr;
+ event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ perf_event__read_size(event);
+
+ if (sample_type & PERF_SAMPLE_IP)
+ size += sizeof(data->ip);
+
+ if (sample_type & PERF_SAMPLE_ADDR)
+ size += sizeof(data->addr);
+
+ if (sample_type & PERF_SAMPLE_PERIOD)
+ size += sizeof(data->period);
+
+ if (sample_type & PERF_SAMPLE_READ)
+ size += event->read_size;
+
+ event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+ struct perf_sample_data *data;
+ u64 sample_type = event->attr.sample_type;
+ u16 size = 0;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu_entry);
+
+ event->id_header_size = size;
+}
+
static void perf_group_attach(struct perf_event *event)
{
- struct perf_event *group_leader = event->group_leader;
+ struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
+
+ perf_event__header_size(group_leader);
+
+ list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+ perf_event__header_size(pos);
}
/*
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
- return;
+ goto out;
}
if (!list_empty(&event->group_entry))
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
}
+
+out:
+ perf_event__header_size(event->group_leader);
+
+ list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+ perf_event__header_size(tmp);
}
static inline int
/*
* not supported on inherited events
*/
- if (event->attr.inherit)
+ if (event->attr.inherit || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
return perf_event_release_kernel(event);
}
-static int perf_event_read_size(struct perf_event *event)
-{
- int entry = sizeof(u64); /* value */
- int size = 0;
- int nr = 1;
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- size += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_ID)
- entry += sizeof(u64);
-
- if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
- size += sizeof(u64);
- }
-
- size += entry * nr;
-
- return size;
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
if (event->state == PERF_EVENT_STATE_ERROR)
return 0;
- if (count < perf_event_read_size(event))
+ if (count < event->read_size)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
int ret = 0;
u64 value;
- if (!event->attr.sample_period)
+ if (!is_sampling_event(event))
return -EINVAL;
if (copy_from_user(&value, arg, sizeof(value)))
} while (len);
}
+static void __perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ u64 sample_type = event->attr.sample_type;
+
+ data->type = sample_type;
+ header->size += event->id_header_size;
+
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_event_pid(event, current);
+ data->tid_entry.tid = perf_event_tid(event, current);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ data->time = perf_clock();
+
+ if (sample_type & PERF_SAMPLE_ID)
+ data->id = primary_event_id(event);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ data->stream_id = event->id;
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+ }
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ if (event->attr.sample_id_all)
+ __perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+ struct perf_sample_data *data)
+{
+ u64 sample_type = data->type;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ perf_output_put(handle, data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ perf_output_put(handle, data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
+{
+ if (event->attr.sample_id_all)
+ __perf_event__output_id_sample(handle, sample);
+}
+
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size,
int nmi, int sample)
struct perf_buffer *buffer;
unsigned long tail, offset, head;
int have_lost;
+ struct perf_sample_data sample_data;
struct {
struct perf_event_header header;
u64 id;
goto out;
have_lost = local_read(&buffer->lost);
- if (have_lost)
- size += sizeof(lost_event);
+ if (have_lost) {
+ lost_event.header.size = sizeof(lost_event);
+ perf_event_header__init_id(&lost_event.header, &sample_data,
+ event);
+ size += lost_event.header.size;
+ }
perf_output_get_handle(handle);
if (have_lost) {
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
- lost_event.header.size = sizeof(lost_event);
lost_event.id = event->id;
lost_event.lost = local_xchg(&buffer->lost, 0);
perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
}
return 0;
rcu_read_unlock();
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
-
- return task_pid_nr_ns(p, event->ns);
-}
-
static void perf_output_read_one(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
{
u64 sample_type = event->attr.sample_type;
- data->type = sample_type;
-
header->type = PERF_RECORD_SAMPLE;
- header->size = sizeof(*header);
+ header->size = sizeof(*header) + event->header_size;
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP) {
- data->ip = perf_instruction_pointer(regs);
-
- header->size += sizeof(data->ip);
- }
-
- if (sample_type & PERF_SAMPLE_TID) {
- /* namespace issues */
- data->tid_entry.pid = perf_event_pid(event, current);
- data->tid_entry.tid = perf_event_tid(event, current);
-
- header->size += sizeof(data->tid_entry);
- }
-
- if (sample_type & PERF_SAMPLE_TIME) {
- data->time = perf_clock();
-
- header->size += sizeof(data->time);
- }
-
- if (sample_type & PERF_SAMPLE_ADDR)
- header->size += sizeof(data->addr);
-
- if (sample_type & PERF_SAMPLE_ID) {
- data->id = primary_event_id(event);
-
- header->size += sizeof(data->id);
- }
-
- if (sample_type & PERF_SAMPLE_STREAM_ID) {
- data->stream_id = event->id;
-
- header->size += sizeof(data->stream_id);
- }
-
- if (sample_type & PERF_SAMPLE_CPU) {
- data->cpu_entry.cpu = raw_smp_processor_id();
- data->cpu_entry.reserved = 0;
-
- header->size += sizeof(data->cpu_entry);
- }
-
- if (sample_type & PERF_SAMPLE_PERIOD)
- header->size += sizeof(data->period);
+ __perf_event_header__init_id(header, data, event);
- if (sample_type & PERF_SAMPLE_READ)
- header->size += perf_event_read_size(event);
+ if (sample_type & PERF_SAMPLE_IP)
+ data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
struct task_struct *task)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct perf_read_event read_event = {
.header = {
.type = PERF_RECORD_READ,
.misc = 0,
- .size = sizeof(read_event) + perf_event_read_size(event),
+ .size = sizeof(read_event) + event->read_size,
},
.pid = perf_event_pid(event, task),
.tid = perf_event_tid(event, task),
};
int ret;
+ perf_event_header__init_id(&read_event.header, &sample, event);
ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
if (ret)
return;
perf_output_put(&handle, read_event);
perf_output_read(&handle, event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
struct perf_task_event *task_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
struct task_struct *task = task_event->task;
- int size, ret;
+ int ret, size = task_event->event_id.header.size;
- size = task_event->event_id.header.size;
- ret = perf_output_begin(&handle, event, size, 0, 0);
+ perf_event_header__init_id(&task_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ task_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
perf_output_put(&handle, task_event->event_id);
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ task_event->event_id.header.size = size;
}
static int perf_event_task_match(struct perf_event *event)
struct perf_comm_event *comm_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+
+ perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ comm_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
perf_output_put(&handle, comm_event->event_id);
perf_output_copy(&handle, comm_event->comm,
comm_event->comm_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ comm_event->event_id.header.size = size;
}
static int perf_event_comm_match(struct perf_event *event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
struct perf_mmap_event *mmap_event)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
- int ret = perf_output_begin(&handle, event, size, 0, 0);
+ int ret;
+ perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+ ret = perf_output_begin(&handle, event,
+ mmap_event->event_id.header.size, 0, 0);
if (ret)
- return;
+ goto out;
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
perf_output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
perf_output_end(&handle);
+out:
+ mmap_event->event_id.header.size = size;
}
static int perf_event_mmap_match(struct perf_event *event,
static void perf_log_throttle(struct perf_event *event, int enable)
{
struct perf_output_handle handle;
+ struct perf_sample_data sample;
int ret;
struct {
if (enable)
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
- ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+ perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event,
+ throttle_event.header.size, 1, 0);
if (ret)
return;
perf_output_put(&handle, throttle_event);
+ perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
struct hw_perf_event *hwc = &event->hw;
int ret = 0;
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
if (!throttle) {
hwc->interrupts++;
} else {
if (!regs)
return;
- if (!hwc->sample_period)
+ if (!is_sampling_event(event))
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
- /*
- * Raw tracepoint data is a severe data leak, only allow root to
- * have these.
- */
- if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
- perf_paranoid_tracepoint_raw() &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
err = perf_trace_init(event);
if (err)
return err;
static inline void perf_tp_register(void)
{
- perf_pmu_register(&perf_tracepoint);
+ perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
+ s64 period;
+
+ if (!is_sampling_event(event))
+ return;
hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hwc->hrtimer.function = perf_swevent_hrtimer;
- if (hwc->sample_period) {
- s64 period = local64_read(&hwc->period_left);
- if (period) {
- if (period < 0)
- period = 10000;
+ period = local64_read(&hwc->period_left);
+ if (period) {
+ if (period < 0)
+ period = 10000;
- local64_set(&hwc->period_left, 0);
- } else {
- period = max_t(u64, 10000, hwc->sample_period);
- }
- __hrtimer_start_range_ns(&hwc->hrtimer,
+ local64_set(&hwc->period_left, 0);
+ } else {
+ period = max_t(u64, 10000, hwc->sample_period);
+ }
+ __hrtimer_start_range_ns(&hwc->hrtimer,
ns_to_ktime(period), 0,
HRTIMER_MODE_REL_PINNED, 0);
- }
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- if (hwc->sample_period) {
+ if (is_sampling_event(event)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
out:
mutex_unlock(&pmus_lock);
}
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+ __ATTR_RO(type),
+ __ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+ .name = "event_source",
+ .dev_attrs = pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+ int ret = -ENOMEM;
+
+ pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+ if (!pmu->dev)
+ goto out;
+
+ device_initialize(pmu->dev);
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
+ dev_set_drvdata(pmu->dev, pmu);
+ pmu->dev->bus = &pmu_bus;
+ pmu->dev->release = pmu_dev_release;
+ ret = device_add(pmu->dev);
+ if (ret)
+ goto free_dev;
+
+out:
+ return ret;
+
+free_dev:
+ put_device(pmu->dev);
+ goto out;
+}
-int perf_pmu_register(struct pmu *pmu)
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
int cpu, ret;
if (!pmu->pmu_disable_count)
goto unlock;
+ pmu->type = -1;
+ if (!name)
+ goto skip_type;
+ pmu->name = name;
+
+ if (type < 0) {
+ int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+ if (!err)
+ goto free_pdc;
+
+ err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+ if (err) {
+ ret = err;
+ goto free_pdc;
+ }
+ }
+ pmu->type = type;
+
+ if (pmu_bus_running) {
+ ret = pmu_dev_alloc(pmu);
+ if (ret)
+ goto free_idr;
+ }
+
+skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
- goto free_pdc;
+ goto free_dev;
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
return ret;
+free_dev:
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+
+free_idr:
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+
free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
+ if (pmu->type >= PERF_TYPE_MAX)
+ idr_remove(&pmu_idr, pmu->type);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
free_pmu_context(pmu);
}
int idx;
idx = srcu_read_lock(&pmus_srcu);
+
+ rcu_read_lock();
+ pmu = idr_find(&pmu_idr, event->attr.type);
+ rcu_read_unlock();
+ if (pmu)
+ goto unlock;
+
list_for_each_entry_rcu(pmu, &pmus, entry) {
int ret = pmu->event_init(event);
if (!ret)
list_add_tail(&event->owner_entry, ¤t->perf_event_list);
mutex_unlock(¤t->perf_event_mutex);
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
/*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
+ /*
+ * Precalculate sample_data sizes
+ */
+ perf_event__header_size(child_event);
+ perf_event__id_header_size(child_event);
+
/*
* Link it up in the child's context:
*/
mutex_unlock(&swhash->hlist_mutex);
}
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
static void perf_pmu_rotate_stop(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
static inline void perf_event_exit_cpu(int cpu) { }
#endif
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ perf_event_exit_cpu(cpu);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+ .notifier_call = perf_reboot,
+ .priority = INT_MIN,
+};
+
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
{
int ret;
+ idr_init(&pmu_idr);
+
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
- perf_pmu_register(&perf_swevent);
- perf_pmu_register(&perf_cpu_clock);
- perf_pmu_register(&perf_task_clock);
+ perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+ perf_pmu_register(&perf_cpu_clock, NULL, -1);
+ perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
+ register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
}
+
+static int __init perf_event_sysfs_init(void)
+{
+ struct pmu *pmu;
+ int ret;
+
+ mutex_lock(&pmus_lock);
+
+ ret = bus_register(&pmu_bus);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(pmu, &pmus, entry) {
+ if (!pmu->name || pmu->type < 0)
+ continue;
+
+ ret = pmu_dev_alloc(pmu);
+ WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+ }
+ pmu_bus_running = 1;
+ ret = 0;
+
+unlock:
+ mutex_unlock(&pmus_lock);
+
+ return ret;
+}
+device_initcall(perf_event_sysfs_init);
static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+
+#define lock_timer(tid, flags) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
+ __timr; \
+})
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
* the find to the timer lock. To avoid a dead lock, the timer id MUST
* be release with out holding the timer lock.
*/
-static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
/*
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/suspend.h>
+#include <trace/events/power.h>
#include "power.h"
if (!suspend_ops)
return -ENOSYS;
+ trace_machine_suspend(state);
if (suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
Close:
if (suspend_ops->end)
suspend_ops->end();
+ trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
void printk_tick(void)
{
- if (__get_cpu_var(printk_pending)) {
- __get_cpu_var(printk_pending) = 0;
+ if (__this_cpu_read(printk_pending)) {
+ __this_cpu_write(printk_pending, 0);
wake_up_interruptible(&log_wait);
}
}
int printk_needs_cpu(int cpu)
{
- if (unlikely(cpu_is_offline(cpu)))
+ if (cpu_is_offline(cpu))
printk_tick();
- return per_cpu(printk_pending, cpu);
+ return __this_cpu_read(printk_pending);
}
void wake_up_klogd(void)
#include <linux/time.h>
#include <linux/cpu.h>
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
- struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
- struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
- struct rcu_head **curtail; /* ->next pointer of last CB. */
-};
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_sched_ctrlblk = {
- .donetail = &rcu_sched_ctrlblk.rcucblist,
- .curtail = &rcu_sched_ctrlblk.rcucblist,
-};
-
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
- .donetail = &rcu_bh_ctrlblk.rcucblist,
- .curtail = &rcu_bh_ctrlblk.rcucblist,
-};
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+struct rcu_ctrlblk;
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
{
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
void rcu_bh_qs(int cpu)
{
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
}
/*
- * Helper function for rcu_process_callbacks() that operates on the
- * specified rcu_ctrlkblk structure.
+ * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
+ * whose grace period has elapsed.
*/
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
struct rcu_head *next, *list;
unsigned long flags;
+ RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ local_bh_disable();
list->func(list);
+ local_bh_enable();
list = next;
+ RCU_TRACE(cb_count++);
}
+ RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
}
/*
- * Invoke any callbacks whose grace period has completed.
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that was used previously for this purpose.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static int rcu_kthread(void *arg)
{
- __rcu_process_callbacks(&rcu_sched_ctrlblk);
- __rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work) {
+ rcu_process_callbacks(&rcu_sched_ctrlblk);
+ rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
+ }
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
+ }
+
+ return 0; /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+ local_irq_restore(flags);
}
/*
local_irq_save(flags);
*rcp->curtail = head;
rcp->curtail = &head->next;
+ RCU_TRACE(rcp->qlen++);
local_irq_restore(flags);
}
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-void __init rcu_init(void)
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
{
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ struct sched_param sp;
+
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
}
+early_initcall(rcu_spawn_kthreads);
* Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
*/
+#include <linux/kthread.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+ struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
+ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
+ struct rcu_head **curtail; /* ->next pointer of last CB. */
+ RCU_TRACE(long qlen); /* Number of pending CBs. */
+};
+
+/* Definition for rcupdate control block. */
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+ .donetail = &rcu_sched_ctrlblk.rcucblist,
+ .curtail = &rcu_sched_ctrlblk.rcucblist,
+};
+
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+ .donetail = &rcu_bh_ctrlblk.rcucblist,
+ .curtail = &rcu_bh_ctrlblk.rcucblist,
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
#ifdef CONFIG_TINY_PREEMPT_RCU
#include <linux/delay.h>
struct list_head *gp_tasks;
/* Pointer to the first task blocking the */
/* current grace period, or NULL if there */
- /* is not such task. */
+ /* is no such task. */
struct list_head *exp_tasks;
/* Pointer to first task blocking the */
/* current expedited grace period, or NULL */
/* if there is no such task. If there */
/* is no current expedited grace period, */
/* then there cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority-boosted, or NULL if no priority */
+ /* boosting is needed. If there is no */
+ /* current or expedited grace period, there */
+ /* can be no such task. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
u8 gpnum; /* Current grace period. */
u8 gpcpu; /* Last grace period blocked by the CPU. */
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
+#ifdef CONFIG_RCU_BOOST
+ s8 boosted_this_gp; /* Has boosting already happened? */
+ unsigned long boost_time; /* When to start boosting (jiffies) */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_TRACE
+ unsigned long n_grace_periods;
+#ifdef CONFIG_RCU_BOOST
+ unsigned long n_tasks_boosted;
+ unsigned long n_exp_boosts;
+ unsigned long n_normal_boosts;
+ unsigned long n_normal_balk_blkd_tasks;
+ unsigned long n_normal_balk_gp_tasks;
+ unsigned long n_normal_balk_boost_tasks;
+ unsigned long n_normal_balk_boosted;
+ unsigned long n_normal_balk_notyet;
+ unsigned long n_normal_balk_nos;
+ unsigned long n_exp_balk_blkd_tasks;
+ unsigned long n_exp_balk_nos;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+#endif /* #ifdef CONFIG_RCU_TRACE */
};
static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
}
+/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rcu_preempt_ctrlblk.blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+static void rcu_initiate_boost_trace(void);
+static void rcu_initiate_exp_boost_trace(void);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Dump additional statistice for TINY_PREEMPT_RCU.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+ seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
+ rcu_preempt_ctrlblk.rcb.qlen,
+ rcu_preempt_ctrlblk.n_grace_periods,
+ rcu_preempt_ctrlblk.gpnum,
+ rcu_preempt_ctrlblk.gpcpu,
+ rcu_preempt_ctrlblk.completed,
+ "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
+ "N."[!rcu_preempt_ctrlblk.gp_tasks],
+ "E."[!rcu_preempt_ctrlblk.exp_tasks]);
+#ifdef CONFIG_RCU_BOOST
+ seq_printf(m, " ttb=%c btg=",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks]);
+ switch (rcu_preempt_ctrlblk.boosted_this_gp) {
+ case -1:
+ seq_puts(m, "exp");
+ break;
+ case 0:
+ seq_puts(m, "no");
+ break;
+ case 1:
+ seq_puts(m, "begun");
+ break;
+ case 2:
+ seq_puts(m, "done");
+ break;
+ default:
+ seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
+ }
+ seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ rcu_preempt_ctrlblk.n_tasks_boosted,
+ rcu_preempt_ctrlblk.n_exp_boosts,
+ rcu_preempt_ctrlblk.n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
+ seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
+ "normal balk",
+ rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_normal_balk_boosted,
+ rcu_preempt_ctrlblk.n_normal_balk_notyet,
+ rcu_preempt_ctrlblk.n_normal_balk_nos);
+ seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_exp_balk_nos);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
+ * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
+ */
+static int rcu_boost(void)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct list_head *np;
+ struct task_struct *t;
+
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ return 0; /* Nothing to boost. */
+ raw_local_irq_save(flags);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
+ rcu_node_entry);
+ np = rcu_next_node_entry(t);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_local_irq_restore(flags);
+ rt_mutex_lock(&mtx);
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+ rcu_preempt_ctrlblk.boosted_this_gp++;
+ rt_mutex_unlock(&mtx);
+ return rcu_preempt_ctrlblk.boost_tasks != NULL;
+}
+
+/*
+ * Check to see if it is now time to start boosting RCU readers blocking
+ * the current grace period, and, if so, tell the rcu_kthread_task to
+ * start boosting them. If there is an expedited boost in progress,
+ * we wait for it to complete.
+ *
+ * If there are no blocked readers blocking the current grace period,
+ * return 0 to let the caller know, otherwise return 1. Note that this
+ * return value is independent of whether or not boosting was done.
+ */
+static int rcu_initiate_boost(void)
+{
+ if (!rcu_preempt_blocked_readers_cgp()) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ return 0;
+ }
+ if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
+ rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_boost_trace());
+ return 1;
+}
+
+/*
+ * Initiate boosting for an expedited grace period.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.blkd_tasks.next;
+ rcu_preempt_ctrlblk.boosted_this_gp = -1;
+ invoke_rcu_kthread();
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else
+ RCU_TRACE(rcu_initiate_exp_boost_trace());
+ raw_local_irq_restore(flags);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+ rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+ if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
+ rcu_preempt_ctrlblk.boosted_this_gp = 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * If there is no RCU priority boosting, we don't boost.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate boosting,
+ * but we do indicate whether there are blocked readers blocking the
+ * current grace period.
+ */
+static int rcu_initiate_boost(void)
+{
+ return rcu_preempt_blocked_readers_cgp();
+}
+
+/*
+ * If there is no RCU priority boosting, we don't initiate expedited boosting.
+ */
+static void rcu_initiate_expedited_boost(void)
+{
+}
+
+/*
+ * If there is no RCU priority boosting, nothing to do at grace-period start.
+ */
+static void rcu_preempt_boost_start_gp(void)
+{
+}
+
+#endif /* else #ifdef CONFIG_RCU_BOOST */
+
/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+ /* If there is no GP then there is nothing more to do. */
+ if (!rcu_preempt_gp_in_progress())
+ return;
/*
- * If there is no GP, or if blocked readers are still blocking GP,
- * then there is nothing more to do.
+ * Check up on boosting. If there are no readers blocking the
+ * current grace period, leave.
*/
- if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
+ if (rcu_initiate_boost())
return;
/* Advance callbacks. */
if (!rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
- /* If there are done callbacks, make RCU_SOFTIRQ process them. */
+ /* If there are done callbacks, cause them to be invoked. */
if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
}
/*
/* Official start of GP. */
rcu_preempt_ctrlblk.gpnum++;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
/* Any blocked RCU readers block new GP. */
if (rcu_preempt_blocked_readers_any())
rcu_preempt_ctrlblk.gp_tasks =
rcu_preempt_ctrlblk.blkd_tasks.next;
+ /* Set up for RCU priority boosting. */
+ rcu_preempt_boost_start_gp();
+
/* If there is no running reader, CPU is done with GP. */
if (!rcu_preempt_running_reader())
rcu_preempt_cpu_qs();
*/
empty = !rcu_preempt_blocked_readers_cgp();
empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
- np = t->rcu_node_entry.next;
- if (np == &rcu_preempt_ctrlblk.blkd_tasks)
- np = NULL;
+ np = rcu_next_node_entry(t);
list_del(&t->rcu_node_entry);
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
rcu_preempt_ctrlblk.gp_tasks = np;
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
rcu_preempt_ctrlblk.exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
+ rcu_preempt_ctrlblk.boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&t->rcu_node_entry);
/*
if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
rcu_report_exp_done();
}
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost self if was boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
local_irq_restore(flags);
}
rcu_preempt_cpu_qs();
if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
rcu_preempt_ctrlblk.rcb.donetail)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_kthread();
if (rcu_preempt_gp_in_progress() &&
rcu_cpu_blocking_cur_gp() &&
rcu_preempt_running_reader())
/*
* TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from __rcu_process_callbacks() to
+ * update, so this is invoked from rcu_process_callbacks() to
* handle that case. Of course, it is invoked for all flavors of
* RCU, but RCU callbacks can appear only on one of the lists, and
* neither ->nexttail nor ->donetail can possibly be NULL, so there
*/
static void rcu_preempt_process_callbacks(void)
{
- __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+ rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
}
/*
local_irq_save(flags);
*rcu_preempt_ctrlblk.nexttail = head;
rcu_preempt_ctrlblk.nexttail = &head->next;
+ RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
rcu_preempt_start_gp(); /* checks to see if GP needed. */
local_irq_restore(flags);
}
/* Wait for tail of ->blkd_tasks list to drain. */
if (rcu_preempted_readers_exp())
+ rcu_initiate_expedited_boost();
wait_event(sync_rcu_preempt_exp_wq,
!rcu_preempted_readers_exp());
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Because preemptible RCU does not exist, it is not necessary to
+ * dump out its statistics.
+ */
+static void show_tiny_preempt_stats(struct seq_file *m)
+{
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Because preemptible RCU does not exist, it is never necessary to
+ * boost preempted RCU readers.
+ */
+static int rcu_boost(void)
+{
+ return 0;
+}
+
/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
#include <linux/kernel_stat.h>
/*
* During boot, we forgive RCU lockdep issues. After this function is
* invoked, we start taking RCU lockdep issues seriously.
*/
-void rcu_scheduler_starting(void)
+void __init rcu_scheduler_starting(void)
{
WARN_ON(nr_context_switches() > 0);
rcu_scheduler_active = 1;
}
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define RCU_BOOST_PRIO 1
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+#ifdef CONFIG_RCU_TRACE
+
+#ifdef CONFIG_RCU_BOOST
+
+static void rcu_initiate_boost_trace(void)
+{
+ if (rcu_preempt_ctrlblk.gp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
+ rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
+ else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
+ rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
+ rcu_preempt_ctrlblk.n_normal_balk_notyet++;
+ else
+ rcu_preempt_ctrlblk.n_normal_balk_nos++;
+}
+
+static void rcu_initiate_exp_boost_trace(void)
+{
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ else
+ rcu_preempt_ctrlblk.n_exp_balk_nos++;
+}
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ rcp->qlen -= n;
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Dump statistics for TINY_RCU, such as they are.
+ */
+static int show_tiny_stats(struct seq_file *m, void *unused)
+{
+ show_tiny_preempt_stats(m);
+ seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen);
+ seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen);
+ return 0;
+}
+
+static int show_tiny_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_tiny_stats, NULL);
+}
+
+static const struct file_operations show_tiny_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = show_tiny_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *rcudir;
+
+static int __init rcutiny_trace_init(void)
+{
+ struct dentry *retval;
+
+ rcudir = debugfs_create_dir("rcu", NULL);
+ if (!rcudir)
+ goto free_out;
+ retval = debugfs_create_file("rcudata", 0444, rcudir,
+ NULL, &show_tiny_stats_fops);
+ if (!retval)
+ goto free_out;
+ return 0;
+free_out:
+ debugfs_remove_recursive(rcudir);
+ return 1;
+}
+
+static void __exit rcutiny_trace_cleanup(void)
+{
+ debugfs_remove_recursive(rcudir);
+}
+
+module_init(rcutiny_trace_init);
+module_exit(rcutiny_trace_cleanup);
+
+MODULE_AUTHOR("Paul E. McKenney");
+MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
+MODULE_LICENSE("GPL");
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
#include <linux/srcu.h>
#include <linux/slab.h>
#include <asm/byteorder.h>
+#include <linux/sched.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
static int fqs_holdoff = 0; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
+static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
+static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
static char *torture_type = "rcu"; /* What RCU implementation to torture. */
module_param(nreaders, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(test_boost, int, 0444);
+MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+module_param(test_boost_interval, int, 0444);
+MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
+module_param(test_boost_duration, int, 0444);
+MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
static struct task_struct *shuffler_task;
static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
+static struct task_struct *boost_tasks[NR_CPUS];
#define RCU_TORTURE_PIPE_LEN 10
static atomic_t n_rcu_torture_free;
static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
+static long n_rcu_torture_boost_ktrerror;
+static long n_rcu_torture_boost_rterror;
+static long n_rcu_torture_boost_allocerror;
+static long n_rcu_torture_boost_afferror;
+static long n_rcu_torture_boost_failure;
+static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+#ifdef CONFIG_RCU_BOOST
+#define rcu_can_boost() 1
+#else /* #ifdef CONFIG_RCU_BOOST */
+#define rcu_can_boost() 0
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
+static unsigned long boost_starttime; /* jiffies of next boost test start. */
+DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
+ /* and boost task create/destroy. */
+
/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
void (*fqs)(void);
int (*stats)(char *page);
int irq_capable;
+ int can_boost;
char *name;
};
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu"
};
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_sync"
};
.fqs = rcu_force_quiescent_state,
.stats = NULL,
.irq_capable = 1,
+ .can_boost = rcu_can_boost(),
.name = "rcu_expedited"
};
.name = "sched_expedited"
};
+/*
+ * RCU torture priority-boost testing. Runs one real-time thread per
+ * CPU for moderate bursts, repeatedly registering RCU callbacks and
+ * spinning waiting for them to be invoked. If a given callback takes
+ * too long to be invoked, we assume that priority inversion has occurred.
+ */
+
+struct rcu_boost_inflight {
+ struct rcu_head rcu;
+ int inflight;
+};
+
+static void rcu_torture_boost_cb(struct rcu_head *head)
+{
+ struct rcu_boost_inflight *rbip =
+ container_of(head, struct rcu_boost_inflight, rcu);
+
+ smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */
+ rbip->inflight = 0;
+}
+
+static int rcu_torture_boost(void *arg)
+{
+ unsigned long call_rcu_time;
+ unsigned long endtime;
+ unsigned long oldstarttime;
+ struct rcu_boost_inflight rbi = { .inflight = 0 };
+ struct sched_param sp;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+
+ /* Set real-time priority. */
+ sp.sched_priority = 1;
+ if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ n_rcu_torture_boost_rterror++;
+ }
+
+ /* Each pass through the following loop does one boost-test cycle. */
+ do {
+ /* Wait for the next test interval. */
+ oldstarttime = boost_starttime;
+ while (jiffies - oldstarttime > ULONG_MAX / 2) {
+ schedule_timeout_uninterruptible(1);
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /* Do one boost-test interval. */
+ endtime = oldstarttime + test_boost_duration * HZ;
+ call_rcu_time = jiffies;
+ while (jiffies - endtime > ULONG_MAX / 2) {
+ /* If we don't have a callback in flight, post one. */
+ if (!rbi.inflight) {
+ smp_mb(); /* RCU core before ->inflight = 1. */
+ rbi.inflight = 1;
+ call_rcu(&rbi.rcu, rcu_torture_boost_cb);
+ if (jiffies - call_rcu_time >
+ test_boost_duration * HZ - HZ / 2) {
+ VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ n_rcu_torture_boost_failure++;
+ }
+ call_rcu_time = jiffies;
+ }
+ cond_resched();
+ rcu_stutter_wait("rcu_torture_boost");
+ if (kthread_should_stop() ||
+ fullstop != FULLSTOP_DONTSTOP)
+ goto checkwait;
+ }
+
+ /*
+ * Set the start time of the next test interval.
+ * Yes, this is vulnerable to long delays, but such
+ * delays simply cause a false negative for the next
+ * interval. Besides, we are running at RT priority,
+ * so delays should be relatively rare.
+ */
+ while (oldstarttime == boost_starttime) {
+ if (mutex_trylock(&boost_mutex)) {
+ boost_starttime = jiffies +
+ test_boost_interval * HZ;
+ n_rcu_torture_boosts++;
+ mutex_unlock(&boost_mutex);
+ break;
+ }
+ schedule_timeout_uninterruptible(1);
+ }
+
+ /* Go do the stutter. */
+checkwait: rcu_stutter_wait("rcu_torture_boost");
+ } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+
+ /* Clean up and exit. */
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ rcutorture_shutdown_absorb("rcu_torture_boost");
+ while (!kthread_should_stop() || rbi.inflight)
+ schedule_timeout_uninterruptible(1);
+ smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ return 0;
+}
+
/*
* RCU torture force-quiescent-state kthread. Repeatedly induces
* bursts of calls to force_quiescent_state(), increasing the probability
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
"rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d nt: %ld",
+ "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free),
atomic_read(&n_rcu_torture_mberror),
+ n_rcu_torture_boost_ktrerror,
+ n_rcu_torture_boost_rterror,
+ n_rcu_torture_boost_allocerror,
+ n_rcu_torture_boost_afferror,
+ n_rcu_torture_boost_failure,
+ n_rcu_torture_boosts,
n_rcu_torture_timers);
- if (atomic_read(&n_rcu_torture_mberror) != 0)
+ if (atomic_read(&n_rcu_torture_mberror) != 0 ||
+ n_rcu_torture_boost_ktrerror != 0 ||
+ n_rcu_torture_boost_rterror != 0 ||
+ n_rcu_torture_boost_allocerror != 0 ||
+ n_rcu_torture_boost_afferror != 0 ||
+ n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
if (i > 1) {
}
static inline void
-rcu_torture_print_module_parms(char *tag)
+rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
{
printk(KERN_ALERT "%s" TORTURE_FLAG
"--- %s: nreaders=%d nfakewriters=%d "
"stat_interval=%d verbose=%d test_no_idle_hz=%d "
"shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration);
}
-static struct notifier_block rcutorture_nb = {
+static struct notifier_block rcutorture_shutdown_nb = {
.notifier_call = rcutorture_shutdown_notify,
};
+static void rcutorture_booster_cleanup(int cpu)
+{
+ struct task_struct *t;
+
+ if (boost_tasks[cpu] == NULL)
+ return;
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
+ t = boost_tasks[cpu];
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+
+ /* This must be outside of the mutex, otherwise deadlock! */
+ kthread_stop(t);
+}
+
+static int rcutorture_booster_init(int cpu)
+{
+ int retval;
+
+ if (boost_tasks[cpu] != NULL)
+ return 0; /* Already created, nothing more to do. */
+
+ /* Don't allow time recalculation while creating a new task. */
+ mutex_lock(&boost_mutex);
+ VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
+ "rcu_torture_boost");
+ if (IS_ERR(boost_tasks[cpu])) {
+ retval = PTR_ERR(boost_tasks[cpu]);
+ VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ n_rcu_torture_boost_ktrerror++;
+ boost_tasks[cpu] = NULL;
+ mutex_unlock(&boost_mutex);
+ return retval;
+ }
+ kthread_bind(boost_tasks[cpu], cpu);
+ wake_up_process(boost_tasks[cpu]);
+ mutex_unlock(&boost_mutex);
+ return 0;
+}
+
+static int rcutorture_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ (void)rcutorture_booster_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcutorture_booster_cleanup(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcutorture_cpu_nb = {
+ .notifier_call = rcutorture_cpu_notify,
+};
+
static void
rcu_torture_cleanup(void)
{
}
fullstop = FULLSTOP_RMMOD;
mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_nb);
+ unregister_reboot_notifier(&rcutorture_shutdown_nb);
if (stutter_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
kthread_stop(stutter_task);
kthread_stop(fqs_task);
}
fqs_task = NULL;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ unregister_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i)
+ rcutorture_booster_cleanup(i);
+ }
/* Wait for all RCU callbacks to fire. */
if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
- rcu_torture_print_module_parms("End of test: FAILURE");
+ rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
else
- rcu_torture_print_module_parms("End of test: SUCCESS");
+ rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
}
static int __init
nrealreaders = nreaders;
else
nrealreaders = 2 * num_online_cpus();
- rcu_torture_print_module_parms("Start of test");
+ rcu_torture_print_module_parms(cur_ops, "Start of test");
fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
atomic_set(&n_rcu_torture_free, 0);
atomic_set(&n_rcu_torture_mberror, 0);
atomic_set(&n_rcu_torture_error, 0);
+ n_rcu_torture_boost_ktrerror = 0;
+ n_rcu_torture_boost_rterror = 0;
+ n_rcu_torture_boost_allocerror = 0;
+ n_rcu_torture_boost_afferror = 0;
+ n_rcu_torture_boost_failure = 0;
+ n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
atomic_set(&rcu_torture_wcount[i], 0);
for_each_possible_cpu(cpu) {
goto unwind;
}
}
- register_reboot_notifier(&rcutorture_nb);
+ if (test_boost_interval < 1)
+ test_boost_interval = 1;
+ if (test_boost_duration < 2)
+ test_boost_duration = 2;
+ if ((test_boost == 1 && cur_ops->can_boost) ||
+ test_boost == 2) {
+ int retval;
+
+ boost_starttime = jiffies + test_boost_interval * HZ;
+ register_cpu_notifier(&rcutorture_cpu_nb);
+ for_each_possible_cpu(i) {
+ if (cpu_is_offline(i))
+ continue; /* Heuristic: CPU can go offline. */
+ retval = rcutorture_booster_init(i);
+ if (retval < 0) {
+ firsterr = retval;
+ goto unwind;
+ }
+ }
+ }
+ register_reboot_notifier(&rcutorture_shutdown_nb);
mutex_unlock(&fullstop_mutex);
return 0;
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .orphan_cbs_list = NULL, \
- .orphan_cbs_tail = &structname.orphan_cbs_list, \
- .orphan_qlen = 0, \
.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
if (rdp->gpnum != rnp->gpnum) {
- rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
+ /*
+ * If the current grace period is waiting for this CPU,
+ * set up to detect a quiescent state, otherwise don't
+ * go looking for one.
+ */
rdp->gpnum = rnp->gpnum;
+ if (rnp->qsmask & rdp->grpmask) {
+ rdp->qs_pending = 1;
+ rdp->passed_quiesc = 0;
+ } else
+ rdp->qs_pending = 0;
}
}
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
+
+ /*
+ * If we were in an extended quiescent state, we may have
+ * missed some grace periods that others CPUs handled on
+ * our behalf. Catch up with this state to avoid noting
+ * spurious new grace periods. If another grace period
+ * has started, then rnp->gpnum will have advanced, so
+ * we will detect this later on.
+ */
+ if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+ rdp->gpnum = rdp->completed;
+
+ /*
+ * If RCU does not need a quiescent state from this CPU,
+ * then make sure that this CPU doesn't go looking for one.
+ */
+ if ((rnp->qsmask & rdp->grpmask) == 0)
+ rdp->qs_pending = 0;
}
}
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
- * specified flavor of RCU. The callbacks will be adopted by the next
- * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
- * comes first. Because this is invoked from the CPU_DYING notifier,
- * irqs are already disabled.
+ * Move a dying CPU's RCU callbacks to online CPU's callback list.
+ * Synchronization is not required because this function executes
+ * in stop_machine() context.
*/
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
int i;
+ /* current DYING CPU is cleared in the cpu_online_mask */
+ int receive_cpu = cpumask_any(cpu_online_mask);
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
if (rdp->nxtlist == NULL)
return; /* irqs disabled, so comparison is stable. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
- *rsp->orphan_cbs_tail = rdp->nxtlist;
- rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+
+ *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+ receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+ receive_rdp->qlen += rdp->qlen;
+ receive_rdp->n_cbs_adopted += rdp->qlen;
+ rdp->n_cbs_orphaned += rdp->qlen;
+
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- rsp->orphan_qlen += rdp->qlen;
- rdp->n_cbs_orphaned += rdp->qlen;
rdp->qlen = 0;
- raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
-}
-
-/*
- * Adopt previously orphaned RCU callbacks.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
- unsigned long flags;
- struct rcu_data *rdp;
-
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rdp = this_cpu_ptr(rsp->rda);
- if (rsp->orphan_cbs_list == NULL) {
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- return;
- }
- *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
- rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
- rdp->qlen += rsp->orphan_qlen;
- rdp->n_cbs_adopted += rsp->orphan_qlen;
- rsp->orphan_cbs_list = NULL;
- rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
- rsp->orphan_qlen = 0;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
/*
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
-
- rcu_adopt_orphan_cbs(rsp);
}
/*
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
-{
-}
-
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_send_cbs_to_online(struct rcu_state *rsp)
{
}
*/
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);
- rcu_process_gp_end(rsp, rdp);
- check_for_new_grace_period(rsp, rdp);
/* Add the callback to our list. */
*rdp->nxttail[RCU_NEXT_TAIL] = head;
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
- /* Start a new grace period if one not already started. */
- if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
- struct rcu_node *rnp_root = rcu_get_root(rsp);
-
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
- }
-
/*
* Force the grace period if too many callbacks or too long waiting.
* Enforce hysteresis, and don't invoke force_quiescent_state()
* is the only one waiting for a grace period to complete.
*/
if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
- rdp->blimit = LONG_MAX;
- if (rsp->n_force_qs == rdp->n_force_qs_snap &&
- *rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
- rdp->n_force_qs_snap = rsp->n_force_qs;
- rdp->qlen_last_fqs_check = rdp->qlen;
+
+ /* Are we ignoring a completed grace period? */
+ rcu_process_gp_end(rsp, rdp);
+ check_for_new_grace_period(rsp, rdp);
+
+ /* Start a new grace period if one not already started. */
+ if (!rcu_gp_in_progress(rsp)) {
+ unsigned long nestflag;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+ rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ } else {
+ /* Give the grace period a kick. */
+ rdp->blimit = LONG_MAX;
+ if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+ *rdp->nxttail[RCU_DONE_TAIL] != head)
+ force_quiescent_state(rsp, 0);
+ rdp->n_force_qs_snap = rsp->n_force_qs;
+ rdp->qlen_last_fqs_check = rdp->qlen;
+ }
} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
force_quiescent_state(rsp, 1);
local_irq_restore(flags);
* decrement rcu_barrier_cpu_count -- otherwise the first CPU
* might complete its grace period before all of the other CPUs
* did their increment, causing this function to return too
- * early.
+ * early. Note that on_each_cpu() disables irqs, which prevents
+ * any CPUs from coming online or going offline until each online
+ * CPU has queued its RCU-barrier callback.
*/
atomic_set(&rcu_barrier_cpu_count, 1);
- preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
- rcu_adopt_orphan_cbs(rsp);
on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
- preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
if (atomic_dec_and_test(&rcu_barrier_cpu_count))
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
case CPU_DYING:
case CPU_DYING_FROZEN:
/*
- * preempt_disable() in _rcu_barrier() prevents stop_machine(),
- * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
- * returns, all online cpus have queued rcu_barrier_func().
- * The dying CPU clears its cpu_online_mask bit and
- * moves all of its RCU callbacks to ->orphan_cbs_list
- * in the context of stop_machine(), so subsequent calls
- * to _rcu_barrier() will adopt these callbacks and only
- * then queue rcu_barrier_func() on all remaining CPUs.
+ * The whole machine is "stopped" except this CPU, so we can
+ * touch any data without introducing corruption. We send the
+ * dying CPU's callbacks to an arbitrarily chosen online CPU.
*/
- rcu_send_cbs_to_orphanage(&rcu_bh_state);
- rcu_send_cbs_to_orphanage(&rcu_sched_state);
- rcu_preempt_send_cbs_to_orphanage();
+ rcu_send_cbs_to_online(&rcu_bh_state);
+ rcu_send_cbs_to_online(&rcu_sched_state);
+ rcu_preempt_send_cbs_to_online();
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
{
int i;
- for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+ for (i = NUM_RCU_LVLS - 1; i > 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+ rsp->levelspread[0] = RCU_FANOUT_LEAF;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)
/*
* Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
* In theory, it should be possible to add more levels straightforwardly.
- * In practice, this has not been tested, so there is probably some
- * bug somewhere.
+ * In practice, this did work well going from three levels to four.
+ * Of course, your mileage may vary.
*/
#define MAX_RCU_LVLS 4
-#define RCU_FANOUT (CONFIG_RCU_FANOUT)
-#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
-#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
-#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT)
-
-#if NR_CPUS <= RCU_FANOUT
+#if CONFIG_RCU_FANOUT > 16
+#define RCU_FANOUT_LEAF 16
+#else /* #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT)
+#endif /* #else #if CONFIG_RCU_FANOUT > 16 */
+#define RCU_FANOUT_1 (RCU_FANOUT_LEAF)
+#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
+
+#if NR_CPUS <= RCU_FANOUT_1
# define NUM_RCU_LVLS 1
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 (NR_CPUS)
# define NUM_RCU_LVL_2 0
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_SQ
+#elif NR_CPUS <= RCU_FANOUT_2
# define NUM_RCU_LVLS 2
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
# define NUM_RCU_LVL_2 (NR_CPUS)
# define NUM_RCU_LVL_3 0
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_CUBE
+#elif NR_CPUS <= RCU_FANOUT_3
# define NUM_RCU_LVLS 3
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_3 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_3 (NR_CPUS)
# define NUM_RCU_LVL_4 0
-#elif NR_CPUS <= RCU_FANOUT_FOURTH
+#elif NR_CPUS <= RCU_FANOUT_4
# define NUM_RCU_LVLS 4
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE)
-# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
-# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
-# define NUM_RCU_LVL_4 NR_CPUS
+# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
+# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
+# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
+# define NUM_RCU_LVL_4 (NR_CPUS)
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
-#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
long qlen_last_fqs_check;
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
+ unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
+ unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
/* End of fields guarded by root rcu_node's lock. */
raw_spinlock_t onofflock; /* exclude on/offline and */
- /* starting new GP. Also */
- /* protects the following */
- /* orphan_cbs fields. */
- struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
- /* orphaned by all CPUs in */
- /* a given leaf rcu_node */
- /* going offline. */
- struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
- long orphan_qlen; /* Number of orphaned cbs. */
+ /* starting new GP. */
raw_spinlock_t fqslock; /* Only one task forcing */
/* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void rcu_preempt_send_cbs_to_orphanage(void);
+static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
*/
#include <linux/delay.h>
+#include <linux/stop_machine.h>
/*
* Check the RCU kernel configuration parameters and print informative
}
/*
- * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ * Move preemptable RCU's callbacks from dying CPU to other online CPU.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
- rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+ rcu_send_cbs_to_online(&rcu_preempt_state);
}
/*
/*
* Because there is no preemptable RCU, there are no callbacks to move.
*/
-static void rcu_preempt_send_cbs_to_orphanage(void)
+static void rcu_preempt_send_cbs_to_online(void)
{
}
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifndef CONFIG_SMP
+
+void synchronize_sched_expedited(void)
+{
+ cond_resched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ /*
+ * There must be a full memory barrier on each affected CPU
+ * between the time that try_stop_cpus() is called and the
+ * time that it returns.
+ *
+ * In the current initial implementation of cpu_stop, the
+ * above condition is already met when the control reaches
+ * this point and the following smp_mb() is not strictly
+ * necessary. Do smp_mb() anyway for documentation and
+ * robustness against future implementation changes.
+ */
+ smp_mb(); /* See above comment block. */
+ return 0;
+}
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ *
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
+ */
+void synchronize_sched_expedited(void)
+{
+ int firstsnap, s, snap, trycount = 0;
+
+ /* Note that atomic_inc_return() implies full memory barrier. */
+ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
+ get_online_cpus();
+
+ /*
+ * Each pass through the following loop attempts to force a
+ * context switch on each CPU.
+ */
+ while (try_stop_cpus(cpu_online_mask,
+ synchronize_sched_expedited_cpu_stop,
+ NULL) == -EAGAIN) {
+ put_online_cpus();
+
+ /* No joy, try again later. Or just synchronize_sched(). */
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+
+ /* Check to see if someone else did our work for us. */
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+
+ /*
+ * Refetching sync_sched_expedited_started allows later
+ * callers to piggyback on our grace period. We subtract
+ * 1 to get the same token that the last incrementer got.
+ * We retry after they started, so our grace period works
+ * for them, and they started after our first try, so their
+ * grace period works for us.
+ */
+ get_online_cpus();
+ snap = atomic_read(&sync_sched_expedited_started) - 1;
+ smp_mb(); /* ensure read is before try_stop_cpus(). */
+ }
+
+ /*
+ * Everyone up to our most recent fetch is covered by our grace
+ * period. Update the counter, but only if our work is still
+ * relevant -- which it won't be if someone who started later
+ * than we did beat us to the punch.
+ */
+ do {
+ s = atomic_read(&sync_sched_expedited_done);
+ if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ break;
+ }
+ } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
+
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
- "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
+ "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
rsp->completed, gpnum, rsp->signaled,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->orphan_qlen);
+ rsp->n_force_qs_lh);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
static struct dentry *rcudir;
-static int __init rcuclassic_trace_init(void)
+static int __init rcutree_trace_init(void)
{
struct dentry *retval;
return 1;
}
-static void __exit rcuclassic_trace_cleanup(void)
+static void __exit rcutree_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
-module_init(rcuclassic_trace_init);
-module_exit(rcuclassic_trace_cleanup);
+module_init(rcutree_trace_init);
+module_exit(rcutree_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
struct task_group *parent;
struct list_head siblings;
struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
};
#define root_task_group init_task_group
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
- return list_empty(&root_task_group.children);
-}
-#endif
-
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
/*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
+ int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
unsigned long h_load;
/*
- * this cpu's part of tg->shares
+ * Maintaining per-cpu shares distribution for group scheduling
+ *
+ * load_stamp is the last time we updated the load average
+ * load_last is the last time we updated the load average and saw load
+ * load_unacc_exec_time is currently unaccounted execution time
*/
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp, load_last, load_unacc_exec_time;
- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
/*
* period over which we average the RT time consumption, measured
* in ms.
lw->inv_weight = 0;
}
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
- unsigned long flags;
- int i;
-
- if (!tg->se[0])
- return 0;
-
- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
-
- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
-
- if (!rq_weight)
- rq_weight = sum_weight;
-
- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
-
- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}
return 0;
}
-static void update_shares(struct sched_domain *sd)
-{
- s64 elapsed;
- u64 now;
-
- if (root_task_group_empty())
- return;
-
- now = local_clock();
- elapsed = now - sd->last_update;
-
- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
-}
-
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
#endif
#ifdef CONFIG_PREEMPT
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
-}
-#endif
-
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
{
- struct rq *rq = task_rq(p);
-
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (unlikely(dest_cpu >= nr_cpu_ids)) {
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
- }
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
}
return dest_cpu;
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
put_cpu();
}
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
if (task_thread_info(rq->curr) != owner || need_resched())
return 0;
- cpu_relax();
+ arch_mutex_cpu_relax();
}
return 1;
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
unsigned long timeout)
{
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
*/
-unsigned long __sched
+long __sched
wait_for_completion_killable_timeout(struct completion *x,
unsigned long timeout)
{
}
static int __sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param, bool user)
+ const struct sched_param *param, bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, true);
}
* but our caller might not have that capability.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, false);
}
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, dest_cpu)) {
+ if (migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
}
#ifdef CONFIG_HOTPLUG_CPU
+
/*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
*/
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
{
- struct rq *rq = cpu_rq(dead_cpu);
- int needs_cpu, uninitialized_var(dest_cpu);
- unsigned long flags;
+ struct mm_struct *mm = current->active_mm;
- local_irq_save(flags);
+ BUG_ON(cpu_online(smp_processor_id()));
- raw_spin_lock(&rq->lock);
- needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
- if (needs_cpu)
- dest_cpu = select_fallback_rq(dead_cpu, p);
- raw_spin_unlock(&rq->lock);
- /*
- * It can only fail if we race with set_cpus_allowed(),
- * in the racer should migrate the task anyway.
- */
- if (needs_cpu)
- __migrate_task(p, dead_cpu, dest_cpu);
- local_irq_restore(flags);
+ if (mm != &init_mm)
+ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
}
/*
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
- unsigned long flags;
- local_irq_save(flags);
- double_rq_lock(rq_src, rq_dest);
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
-
- do_each_thread(t, p) {
- if (p == current)
- continue;
-
- if (task_cpu(p) == src_cpu)
- move_task_off_dead_cpu(src_cpu, p);
- } while_each_thread(t, p);
-
- read_unlock(&tasklist_lock);
}
/*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
*/
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
{
- int this_cpu = smp_processor_id();
- struct rq *rq = cpu_rq(this_cpu);
- struct task_struct *p = rq->idle;
- unsigned long flags;
-
- /* cpu has to be offline */
- BUG_ON(cpu_online(this_cpu));
-
- /*
- * Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on the current cpu.
- */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
- activate_task(rq, p, 0);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
*/
-void idle_task_exit(void)
-{
- struct mm_struct *mm = current->active_mm;
-
- BUG_ON(cpu_online(smp_processor_id()));
-
- if (mm != &init_mm)
- switch_mm(mm, &init_mm, current);
- mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
-
- /* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(!p->exit_state);
-
- /* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->state == TASK_DEAD);
-
- get_task_struct(p);
+ struct task_struct *next, *stop = rq->stop;
+ int dest_cpu;
/*
- * Drop lock around migration; if someone else moves it,
- * that's OK. No task can be added to this CPU, so iteration is
- * fine.
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+ * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ * either way we should never end up calling schedule() until we're
+ * done here.
*/
- raw_spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, p);
- raw_spin_lock_irq(&rq->lock);
-
- put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
- struct rq *rq = cpu_rq(dead_cpu);
- struct task_struct *next;
+ rq->stop = NULL;
for ( ; ; ) {
- if (!rq->nr_running)
+ /*
+ * There's this thread running, bail when that's the only
+ * remaining thread.
+ */
+ if (rq->nr_running == 1)
break;
+
next = pick_next_task(rq);
- if (!next)
- break;
+ BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
- migrate_dead(dead_cpu, next);
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_cpu, next);
+ raw_spin_unlock(&rq->lock);
+
+ __migrate_task(next, dead_cpu, dest_cpu);
+
+ raw_spin_lock(&rq->lock);
}
-}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ rq->stop = stop;
}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_live_tasks(cpu);
- /* Idle task back to normal (off runqueue, low prio) */
- raw_spin_lock_irq(&rq->lock);
- deactivate_task(rq, rq->idle, 0);
- __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
- rq->idle->sched_class = &idle_sched_class;
- migrate_dead_tasks(cpu);
- raw_spin_unlock_irq(&rq->lock);
- migrate_nr_uninterruptible(rq);
- BUG_ON(rq->nr_running != 0);
- calc_global_load_remove(rq);
- break;
-
case CPU_DYING:
- case CPU_DYING_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ migrate_tasks(cpu);
+ BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ migrate_nr_uninterruptible(rq);
+ calc_global_load_remove(rq);
break;
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- struct sched_entity *se, int cpu, int add,
+ struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
- if (add)
- list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[cpu] = se;
/* se could be NULL for init_task_group */
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, 0);
se->parent = parent;
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu, int add,
+ struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (add)
- list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
tg->rt_se[cpu] = rt_se;
if (!rt_se)
#ifdef CONFIG_CGROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
-
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.shares = init_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
/*
* How much cpu bandwidth does init_task_group get?
*
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
*/
- init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
- init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+ init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */
- perf_event_init();
-
scheduler_running = 1;
}
if (!se)
goto err_free_rq;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
return 1;
return 0;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
- &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
- list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fair_sched_group(struct task_group *tg)
return 1;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}
if (!rt_se)
goto err_free_rq;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}
return 1;
err:
return 0;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
- &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
- list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
#else /* !CONFIG_RT_GROUP_SCHED */
static inline void free_rt_sched_group(struct task_group *tg)
{
{
return 1;
}
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
{
struct task_group *tg;
unsigned long flags;
- int i;
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
if (!tg)
goto err;
spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
- register_fair_sched_group(tg, i);
- register_rt_sched_group(tg, i);
- }
list_add_rcu(&tg->list, &task_groups);
WARN_ON(!parent); /* root should already exist */
unsigned long flags;
int i;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i) {
+ /* end participation in shares distribution */
+ for_each_possible_cpu(i)
unregister_fair_sched_group(tg, i);
- unregister_rt_sched_group(tg, i);
- }
+
+ spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- int on_rq;
-
- on_rq = se->on_rq;
- if (on_rq)
- dequeue_entity(cfs_rq, se, 0);
-
- se->load.weight = shares;
- se->load.inv_weight = 0;
-
- if (on_rq)
- enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
if (tg->shares == shares)
goto done;
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- unregister_fair_sched_group(tg, i);
- list_del_rcu(&tg->siblings);
- spin_unlock_irqrestore(&task_group_lock, flags);
-
- /* wait for any ongoing reference to this group to finish */
- synchronize_sched();
-
- /*
- * Now we are free to modify the group's share on each cpu
- * w/o tripping rebalance_share or load_balance_fair.
- */
tg->shares = shares;
for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
- set_se_shares(tg->se[i], shares);
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se), 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
- /*
- * Enable load balance activity on this group, by inserting it back on
- * each cpu's rq->leaf_cfs_rq_list.
- */
- spin_lock_irqsave(&task_group_lock, flags);
- for_each_possible_cpu(i)
- register_fair_sched_group(tg, i);
- list_add_rcu(&tg->siblings, &tg->parent->children);
- spin_unlock_irqrestore(&task_group_lock, flags);
done:
mutex_unlock(&shares_mutex);
return 0;
};
#endif /* CONFIG_CGROUP_CPUACCT */
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
- barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
- /*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
- */
- smp_mb(); /* See above comment block. */
- return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
- int snap, trycount = 0;
-
- smp_mb(); /* ensure prior mod happens before capturing snap. */
- snap = atomic_read(&synchronize_sched_expedited_count) + 1;
- get_online_cpus();
- while (try_stop_cpus(cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- if (trycount++ < 10)
- udelay(trycount * num_online_cpus());
- else {
- synchronize_sched();
- return;
- }
- if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
- smp_mb(); /* ensure test happens before caller kfree */
- return;
- }
- get_online_cpus();
- }
- atomic_inc(&synchronize_sched_expedited_count);
- smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
- put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
--- /dev/null
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &init_task_group;
+ init_task_group.autogroup = &autogroup_default;
+ kref_init(&autogroup_default.kref);
+ init_rwsem(&autogroup_default.lock);
+ init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+ kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+ sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+ kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+ kref_get(&ag->kref);
+ return ag;
+}
+
+static inline struct autogroup *autogroup_task_get(struct task_struct *p)
+{
+ struct autogroup *ag;
+ unsigned long flags;
+
+ if (!lock_task_sighand(p, &flags))
+ return autogroup_kref_get(&autogroup_default);
+
+ ag = autogroup_kref_get(p->signal->autogroup);
+ unlock_task_sighand(p, &flags);
+
+ return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+ struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+ struct task_group *tg;
+
+ if (!ag)
+ goto out_fail;
+
+ tg = sched_create_group(&init_task_group);
+
+ if (IS_ERR(tg))
+ goto out_free;
+
+ kref_init(&ag->kref);
+ init_rwsem(&ag->lock);
+ ag->id = atomic_inc_return(&autogroup_seq_nr);
+ ag->tg = tg;
+ tg->autogroup = ag;
+
+ return ag;
+
+out_free:
+ kfree(ag);
+out_fail:
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING "autogroup_create: %s failure.\n",
+ ag ? "sched_create_group()" : "kmalloc()");
+ }
+
+ return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ /*
+ * We can only assume the task group can't go away on us if
+ * autogroup_move_group() can see us on ->thread_group list.
+ */
+ if (p->flags & PF_EXITING)
+ return false;
+
+ return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct task_struct *t;
+ unsigned long flags;
+
+ BUG_ON(!lock_task_sighand(p, &flags));
+
+ prev = p->signal->autogroup;
+ if (prev == ag) {
+ unlock_task_sighand(p, &flags);
+ return;
+ }
+
+ p->signal->autogroup = autogroup_kref_get(ag);
+
+ t = p;
+ do {
+ sched_move_task(t);
+ } while_each_thread(p, t);
+
+ unlock_task_sighand(p, &flags);
+ autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+ /* drop extra refrence added by autogroup_create() */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock. Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+ sig->autogroup = autogroup_task_get(current);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+ autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+ static unsigned long next = INITIAL_JIFFIES;
+ struct autogroup *ag;
+ int err;
+
+ if (*nice < -20 || *nice > 19)
+ return -EINVAL;
+
+ err = security_task_setnice(current, *nice);
+ if (err)
+ return err;
+
+ if (*nice < 0 && !can_nice(current, *nice))
+ return -EPERM;
+
+ /* this is a heavy operation taking global locks.. */
+ if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+ return -EAGAIN;
+
+ next = HZ / 10 + jiffies;
+ ag = autogroup_task_get(p);
+
+ down_write(&ag->lock);
+ err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+ if (!err)
+ ag->nice = *nice;
+ up_write(&ag->lock);
+
+ autogroup_kref_put(ag);
+
+ return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+ struct autogroup *ag = autogroup_task_get(p);
+
+ down_read(&ag->lock);
+ seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+ up_read(&ag->lock);
+
+ autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
--- /dev/null
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+ struct kref kref;
+ struct task_group *tg;
+ struct rw_semaphore lock;
+ unsigned long id;
+ int nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+ return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
}
EXPORT_SYMBOL_GPL(sched_clock);
-static __read_mostly int sched_clock_running;
+__read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
__read_mostly int sched_clock_stable;
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
- struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
struct sched_entity *se = tg->se[cpu];
if (!se)
0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
#endif
-#ifdef CONFIG_CGROUP_SCHED
- {
- char path[64];
-
- rcu_read_lock();
- cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
- rcu_read_unlock();
- SEQ_printf(m, " %s", path);
- }
-#endif
SEQ_printf(m, "\n");
}
read_unlock_irqrestore(&tasklist_lock, flags);
}
-#if defined(CONFIG_CGROUP_SCHED) && \
- (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
- /* may be NULL if the underlying cgroup isn't fully-created yet */
- if (!tg->css.cgroup) {
- buf[0] = '\0';
- return;
- }
- cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
struct sched_entity *last;
unsigned long flags;
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
- char path[128];
- struct task_group *tg = cfs_rq->tg;
-
- task_group_path(tg, path, sizeof(path));
-
- SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));
spread0 = min_vruntime - rq0_min_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
SPLIT_NS(spread0));
- SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
- SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
+ SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
+ SPLIT_NS(cfs_rq->load_avg));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
+ SPLIT_NS(cfs_rq->load_period));
+ SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
+ cfs_rq->load_contribution);
+ SEQ_printf(m, " .%-30s: %d\n", "load_tg",
+ atomic_read(&cfs_rq->tg->load_weight));
#endif
+
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
}
void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
- char path[128];
- struct task_group *tg = rt_rq->tg;
-
- task_group_path(tg, path, sizeof(path));
-
- SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
#undef P
}
+extern __read_mostly int sched_clock_running;
+
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
static int sched_debug_show(struct seq_file *m, void *v)
{
- u64 now = ktime_to_ns(ktime_get());
+ u64 ktime, sched_clk, cpu_clk;
+ unsigned long flags;
int cpu;
- SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
+ local_irq_save(flags);
+ ktime = ktime_to_ns(ktime_get());
+ sched_clk = sched_clock();
+ cpu_clk = local_clock();
+ local_irq_restore(flags);
+
+ SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
+#define P(x) \
+ SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
+#define PN(x) \
+ SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+ PN(ktime);
+ PN(sched_clk);
+ PN(cpu_clk);
+ P(jiffies);
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ P(sched_clock_stable);
+#endif
+#undef PN
+#undef P
+
+ SEQ_printf(m, "\n");
+ SEQ_printf(m, "sysctl_sched\n");
#define P(x) \
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
- P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
static const struct sched_class fair_sched_class;
/**************************************************************
return cfs_rq->tg->cfs_rq[this_cpu];
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases.
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ } else {
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ }
+
+ cfs_rq->on_list = 1;
+ }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
return &cpu_rq(this_cpu)->cfs;
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->load_unacc_exec_time += delta_exec;
+#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}
static void
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ int global_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long load_avg;
+
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;
+
+ if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;
+ }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+ u64 period = sysctl_sched_shares_window;
+ u64 now, delta;
+ unsigned long load = cfs_rq->load.weight;
+
+ if (!cfs_rq)
+ return;
+
+ now = rq_of(cfs_rq)->clock;
+ delta = now - cfs_rq->load_stamp;
+
+ /* truncate load history at 4 idle periods */
+ if (cfs_rq->load_stamp > cfs_rq->load_last &&
+ now - cfs_rq->load_last > 4 * period) {
+ cfs_rq->load_period = 0;
+ cfs_rq->load_avg = 0;
+ }
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_unacc_exec_time = 0;
+ cfs_rq->load_period += delta;
+ if (load) {
+ cfs_rq->load_last = now;
+ cfs_rq->load_avg += delta * load;
+ }
+
+ /* consider updating load contribution on each fold or truncate */
+ if (global_update || cfs_rq->load_period > period
+ || !cfs_rq->load_period)
+ update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+
+ if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+ list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ }
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ long load_weight, load, shares;
+
+ if (!cfs_rq)
+ return;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+
+ load = cfs_rq->load.weight + weight_delta;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP) {
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq, 0);
/*
* Normalize the entity after updating the min_vruntime because the
*/
update_curr(cfs_rq);
+ /*
+ * Update share accounting for long-running entities.
+ */
+ update_entity_shares_tick(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
flags = ENQUEUE_WAKEUP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+
hrtick_update(rq);
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+
hrtick_update(rq);
}
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
long S, rw, s, a, b;
- long more_w;
-
- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ s = se->load.weight;
+ rw = se->my_q->load.weight;
a = S*(rw + wl);
b = S*rw + s*wg;
sd = tmp;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
- tmp = affine_sd;
-
- if (tmp) {
- raw_spin_unlock(&rq->lock);
- update_shares(tmp);
- raw_spin_lock(&rq->lock);
- }
- }
-#endif
-
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (!tg->se[cpu])
+ return 0;
+
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq, 1);
+
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq, 0);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return 0;
+}
+
+static void update_shares(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ struct rq *rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(rq, cfs_rq)
+ update_shares_cpu(cfs_rq->tg, cpu);
+ rcu_read_unlock();
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
return max_load_move - rem_load_move;
}
#else
+static inline void update_shares(int cpu)
+{
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
schedstat_inc(sd, lb_count[idle]);
redo:
- update_shares(sd);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);
else
ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}
*/
raw_spin_unlock(&this_rq->lock);
+ update_shares(this_cpu);
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
int update_next_balance = 0;
int need_serialize;
+ update_shares(cpu);
+
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
SCHED_FEAT(HRTICK, 0)
SCHED_FEAT(DOUBLE_TICK, 0)
SCHED_FEAT(LB_BIAS, 1)
-SCHED_FEAT(LB_SHARES_UPDATE, 1)
-SCHED_FEAT(ASYM_EFF_LOAD, 1)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+ list_add_rcu(&rt_rq->leaf_rt_rq_list,
+ &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+ list_del_rcu(&rt_rq->leaf_rt_rq_list);
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
return ktime_to_ns(def_rt_bandwidth.rt_period);
}
+static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
+static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
+{
+}
+
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
+ if (!rt_rq->rt_nr_running)
+ list_add_leaf_rt_rq(rt_rq);
+
if (head)
list_add(&rt_se->run_list, queue);
else
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
+ if (!rt_rq->rt_nr_running)
+ list_del_leaf_rt_rq(rt_rq);
}
/*
cpumask_any(cpu_online_mask));
case CPU_DEAD:
case CPU_DEAD_FROZEN: {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ static struct sched_param param = {
+ .sched_priority = MAX_RT_PRIO-1
+ };
p = per_cpu(ksoftirqd, hotcpu);
per_cpu(ksoftirqd, hotcpu) = NULL;
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/delay.h>
#include <linux/srcu.h>
static int init_srcu_struct_fields(struct srcu_struct *sp)
* all srcu_read_lock() calls using the old counters have completed.
* Their corresponding critical sections might well be still
* executing, but the srcu_read_lock() primitives themselves
- * will have finished executing.
+ * will have finished executing. We initially give readers
+ * an arbitrarily chosen 10 microseconds to get out of their
+ * SRCU read-side critical sections, then loop waiting 1/HZ
+ * seconds per iteration.
*/
+ if (srcu_readers_active_idx(sp, idx))
+ udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
while (srcu_readers_active_idx(sp, idx))
schedule_timeout_interruptible(1);
err = session;
out:
write_unlock_irq(&tasklist_lock);
- if (err > 0)
+ if (err > 0) {
proc_sid_connector(group_leader);
+ sched_autogroup_create_attach(group_leader);
+ }
return err;
}
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-static int min_sched_shares_ratelimit = 100000; /* 100 usec */
-static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
#endif
#ifdef CONFIG_COMPACTION
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
- {
- .procname = "sched_shares_ratelimit",
- .data = &sysctl_sched_shares_ratelimit,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_proc_update_handler,
- .extra1 = &min_sched_shares_ratelimit,
- .extra2 = &max_sched_shares_ratelimit,
- },
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
.extra1 = &min_sched_tunable_scaling,
.extra2 = &max_sched_tunable_scaling,
},
- {
- .procname = "sched_shares_thresh",
- .data = &sysctl_sched_shares_thresh,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
{
.procname = "sched_migration_cost",
.data = &sysctl_sched_migration_cost,
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "sched_shares_window",
+ .data = &sysctl_sched_shares_window,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{
.procname = "timer_migration",
.data = &sysctl_timer_migration,
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_AUTOGROUP
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
.extra1 = &zero,
.extra2 = &one,
},
-#endif
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
{
- .procname = "unknown_nmi_panic",
- .data = &unknown_nmi_panic,
+ .procname = "nmi_watchdog",
+ .data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dowatchdog_enabled,
},
+#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
- .procname = "nmi_watchdog",
- .data = &nmi_watchdog_enabled,
+ .procname = "unknown_nmi_panic",
+ .data = &unknown_nmi_panic,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_nmi_enabled,
+ .proc_handler = proc_dointvec,
},
#endif
#if defined(CONFIG_X86)
{ CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" },
{ CTL_INT, KERN_COMPAT_LOG, "compat-log" },
{ CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" },
- { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" },
{ CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" },
{}
};
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/math64.h>
+#include <linux/kernel.h>
/*
* fixed point arithmetic scale factor for skew
int index;
int num_samples = sync->num_samples;
- if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+ if (num_samples > ARRAY_SIZE(buffer)) {
samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
if (!samples) {
samples = buffer;
- num_samples = sizeof(buffer)/sizeof(buffer[0]);
+ num_samples = ARRAY_SIZE(buffer);
}
} else {
samples = buffer;
cycle_t cycle_interval;
/* Number of clock shifted nano seconds in one NTP interval. */
u64 xtime_interval;
+ /* shifted nano seconds left over when rounding cycle_interval */
+ s64 xtime_remainder;
/* Raw nano seconds accumulated per NTP interval. */
u32 raw_interval;
static void timekeeper_setup_internals(struct clocksource *clock)
{
cycle_t interval;
- u64 tmp;
+ u64 tmp, ntpinterval;
timekeeper.clock = clock;
clock->cycle_last = clock->read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
tmp <<= clock->shift;
+ ntpinterval = tmp;
tmp += clock->mult/2;
do_div(tmp, clock->mult);
if (tmp == 0)
/* Go back from cycles -> shifted ns */
timekeeper.xtime_interval = (u64) interval * clock->mult;
+ timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
timekeeper.raw_interval =
((u64) interval * clock->mult) >> clock->shift;
/* Accumulate error between NTP and clock interval */
timekeeper.ntp_error += tick_length << shift;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ timekeeper.ntp_error -=
+ (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
(timekeeper.ntp_error_shift + shift);
return offset;
{
struct hrtimer *timer, tmp;
unsigned long next = 0, i;
- struct rb_node *curr;
+ struct timerqueue_node *curr;
unsigned long flags;
next_one:
i = 0;
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
- curr = base->first;
+ curr = timerqueue_getnext(&base->active);
/*
* Crude but we have to do this O(N*N) thing, because
* we have to unlock the base when printing:
*/
while (curr && i < next) {
- curr = rb_next(curr);
+ curr = timerqueue_iterate_next(curr);
i++;
}
if (curr) {
- timer = rb_entry(curr, struct hrtimer, node);
+ timer = container_of(curr, struct hrtimer, node);
tmp = *timer;
raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
-/*
- * Note that all tvec_bases are 2 byte aligned and lower bit of
- * base in timer_list is guaranteed to be zero. Use the LSB to
- * indicate whether the timer is deferrable.
- *
- * A deferrable timer will work normally when the system is busy, but
- * will not cause a CPU to come out of idle just to service it; instead,
- * the timer will be serviced when the CPU eventually wakes up with a
- * subsequent non-deferrable timer.
- */
-#define TBASE_DEFERRABLE_FLAG (0x1)
-
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
static inline void timer_set_deferrable(struct timer_list *timer)
{
- timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
- TBASE_DEFERRABLE_FLAG));
+ timer->base = TBASE_MAKE_DEFERRED(timer->base);
}
static inline void
}
EXPORT_SYMBOL_GPL(set_timer_slack);
-
-static inline void set_running_timer(struct tvec_base *base,
- struct timer_list *timer)
-{
-#ifdef CONFIG_SMP
- base->running_timer = timer;
-#endif
-}
-
static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
unsigned long expires = timer->expires;
}
EXPORT_SYMBOL(del_timer);
-#ifdef CONFIG_SMP
/**
* try_to_del_timer_sync - Try to deactivate a timer
* @timer: timer do del
*
* This function tries to deactivate a timer. Upon successful (ret >= 0)
* exit the timer is not queued and the handler is not running on any CPU.
- *
- * It must not be called from interrupt contexts.
*/
int try_to_del_timer_sync(struct timer_list *timer)
{
}
EXPORT_SYMBOL(try_to_del_timer_sync);
+#ifdef CONFIG_SMP
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
* Synchronization rules: Callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
+ * hardirq contexts. The caller must not hold locks which would prevent
* completion of the timer's handler. The timer's handler must not call
* add_timer_on(). Upon exit the timer is not queued and the handler is
* not running on any CPU.
int del_timer_sync(struct timer_list *timer)
{
#ifdef CONFIG_LOCKDEP
- unsigned long flags;
-
- local_irq_save(flags);
+ local_bh_disable();
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
- local_irq_restore(flags);
+ local_bh_enable();
#endif
-
+ /*
+ * don't use it in hardirq context, because it
+ * could lead to deadlock.
+ */
+ WARN_ON(in_irq());
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
timer_stats_account_timer(timer);
- set_running_timer(base, timer);
+ base->running_timer = timer;
detach_timer(timer, 1);
spin_unlock_irq(&base->lock);
spin_lock_irq(&base->lock);
}
}
- set_running_timer(base, NULL);
+ base->running_timer = NULL;
spin_unlock_irq(&base->lock);
}
*/
unsigned long get_next_timer_interrupt(unsigned long now)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
unsigned long expires;
/*
*/
static void run_timer_softirq(struct softirq_action *h)
{
- struct tvec_base *base = __get_cpu_var(tvec_bases);
+ struct tvec_base *base = __this_cpu_read(tvec_bases);
hrtimer_run_pending();
select CONTEXT_SWITCH_TRACER
bool
+config EVENT_POWER_TRACING_DEPRECATED
+ depends on EVENT_TRACING
+ bool "Deprecated power event trace API, to be removed"
+ default y
+ help
+ Provides old power event types:
+ C-state/idle accounting events:
+ power:power_start
+ power:power_end
+ and old cpufreq accounting event:
+ power:power_frequency
+ This is for userspace compatibility
+ and will vanish after 5 kernel iterations,
+ namely 2.6.41.
+
config CONTEXT_SWITCH_TRACER
bool
#define CREATE_TRACE_POINTS
#include <trace/events/power.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+#ifdef EVENT_POWER_TRACING_DEPRECATED
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+#endif
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
/* Count the events in use (per event id, not per instance) */
static int total_ref_count;
+static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
+ struct perf_event *p_event)
+{
+ /* No tracing, just counting, so no obvious leak */
+ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
+ return 0;
+
+ /* Some events are ok to be traced by non-root users... */
+ if (p_event->attach_state == PERF_ATTACH_TASK) {
+ if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
+ return 0;
+ }
+
+ /*
+ * ...otherwise raw tracepoint data can be a severe data leak,
+ * only allow root to have these.
+ */
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+}
+
static int perf_trace_event_init(struct ftrace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
- int ret = -ENOMEM;
+ int ret;
int cpu;
+ ret = perf_trace_event_perm(tp_event, p_event);
+ if (ret)
+ return ret;
+
p_event->tp_event = tp_event;
if (tp_event->perf_refcount++ > 0)
return 0;
+ ret = -ENOMEM;
+
list = alloc_percpu(struct hlist_head);
if (!list)
goto fail;
DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_storage_mutex);
+EXPORT_SYMBOL_GPL(event_storage_mutex);
+
+char event_storage[EVENT_STORAGE_SIZE];
+EXPORT_SYMBOL_GPL(event_storage);
+
LIST_HEAD(ftrace_events);
LIST_HEAD(ftrace_common_fields);
#undef __array
#define __array(type, item, len) \
- BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+ do { \
+ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
+ mutex_lock(&event_storage_mutex); \
+ snprintf(event_storage, sizeof(event_storage), \
+ "%s[%d]", #type, len); \
+ ret = trace_define_field(event_call, event_storage, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), FILTER_OTHER); \
- if (ret) \
- return ret;
+ mutex_unlock(&event_storage_mutex); \
+ if (ret) \
+ return ret; \
+ } while (0);
#undef __array_desc
#define __array_desc(type, container, item, len) \
static int trace_wakeup_test_thread(void *data)
{
/* Make this a RT thread, doesn't need to be too high */
- struct sched_param param = { .sched_priority = 5 };
+ static struct sched_param param = { .sched_priority = 5 };
struct completion *x = data;
sched_setscheduler(current, SCHED_FIFO, ¶m);
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
+ else if (!strncmp(str, "0", 1))
+ no_watchdog = 1;
return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);
*/
static int watchdog(void *unused)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
sched_setscheduler(current, SCHED_FIFO, ¶m);
goto out_save;
}
- printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+ printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */
.notifier_call = cpu_callback
};
-static int __init spawn_watchdog_task(void)
+void __init lockup_detector_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
if (no_watchdog)
- return 0;
+ return;
err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
WARN_ON(notifier_to_errno(err));
cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
- return 0;
+ return;
}
-early_initcall(spawn_watchdog_task);
An NMI is generated every 60 seconds or so to check for hardlockups.
config HARDLOCKUP_DETECTOR
- def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI
+ def_bool LOCKUP_DETECTOR && PERF_EVENTS && HAVE_PERF_EVENTS_NMI && \
+ !ARCH_HAS_NMI_WATCHDOG
config BOOTPARAM_SOFTLOCKUP_PANIC
bool "Panic (Reboot) On Soft Lockups"
endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o dump_stack.o \
+ rbtree.o radix-tree.o dump_stack.o timerqueue.o\
idr.o int_sqrt.o extable.o prio_tree.o \
sha1.o irq_regs.o reciprocal_div.o argv_split.o \
proportions.o prio_heap.o ratelimit.o show_mem.o \
--- /dev/null
+/*
+ * Generic Timer-queue
+ *
+ * Manages a simple queue of timers, ordered by expiration time.
+ * Uses rbtrees for quick list adds and expiration.
+ *
+ * NOTE: All of the following functions need to be serialized
+ * to avoid races. No locking is done by this libary code.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/timerqueue.h>
+#include <linux/rbtree.h>
+#include <linux/module.h>
+
+/**
+ * timerqueue_add - Adds timer to timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be added
+ *
+ * Adds the timer node to the timerqueue, sorted by the
+ * node's expires value.
+ */
+void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+ struct rb_node **p = &head->head.rb_node;
+ struct rb_node *parent = NULL;
+ struct timerqueue_node *ptr;
+
+ /* Make sure we don't add nodes that are already added */
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));
+
+ while (*p) {
+ parent = *p;
+ ptr = rb_entry(parent, struct timerqueue_node, node);
+ if (node->expires.tv64 < ptr->expires.tv64)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&node->node, parent, p);
+ rb_insert_color(&node->node, &head->head);
+
+ if (!head->next || node->expires.tv64 < head->next->expires.tv64)
+ head->next = node;
+}
+EXPORT_SYMBOL_GPL(timerqueue_add);
+
+/**
+ * timerqueue_del - Removes a timer from the timerqueue.
+ *
+ * @head: head of timerqueue
+ * @node: timer node to be removed
+ *
+ * Removes the timer node from the timerqueue.
+ */
+void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
+{
+ WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));
+
+ /* update next pointer */
+ if (head->next == node) {
+ struct rb_node *rbn = rb_next(&node->node);
+
+ head->next = rbn ?
+ rb_entry(rbn, struct timerqueue_node, node) : NULL;
+ }
+ rb_erase(&node->node, &head->head);
+ RB_CLEAR_NODE(&node->node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_del);
+
+/**
+ * timerqueue_iterate_next - Returns the timer after the provided timer
+ *
+ * @node: Pointer to a timer.
+ *
+ * Provides the timer that is after the given node. This is used, when
+ * necessary, to iterate through the list of timers in a timer list
+ * without modifying the list.
+ */
+struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
+{
+ struct rb_node *next;
+
+ if (!node)
+ return NULL;
+ next = rb_next(&node->node);
+ if (!next)
+ return NULL;
+ return container_of(next, struct timerqueue_node, node);
+}
+EXPORT_SYMBOL_GPL(timerqueue_iterate_next);
rcu_read_lock();
p = rcu_dereference(mm->owner);
- VM_BUG_ON(!p);
/*
- * because we don't have task_lock(), "p" can exit while
- * we're here. In that case, "mem" can point to root
- * cgroup but never be NULL. (and task_struct itself is freed
- * by RCU, cgroup itself is RCU safe.) Then, we have small
- * risk here to get wrong cgroup. But such kind of mis-account
- * by race always happens because we don't have cgroup_mutex().
- * It's overkill and we allow that small race, here.
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "mem" can point to root or p can be NULL with
+ * race with swapoff. Then, we have small risk of mis-accouning.
+ * But such kind of mis-account by race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allo that
+ * small race, here.
+ * (*) swapoff at el will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
*/
mem = mem_cgroup_from_task(p);
- VM_BUG_ON(!mem);
- if (mem_cgroup_is_root(mem)) {
+ if (!mem || mem_cgroup_is_root(mem)) {
rcu_read_unlock();
goto done;
}
struct net_bridge_port *port,
struct sk_buff *skb)
{
- struct sk_buff *skb2 = skb;
+ struct sk_buff *skb2;
struct ipv6hdr *ip6h;
struct icmp6hdr *icmp6h;
u8 nexthdr;
if (!skb2)
return -ENOMEM;
+ err = -EINVAL;
+ if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr)))
+ goto out;
+
len -= offset - skb_network_offset(skb2);
__skb_pull(skb2, offset);
skb_reset_transport_header(skb2);
- err = -EINVAL;
- if (!pskb_may_pull(skb2, sizeof(*icmp6h)))
- goto out;
-
icmp6h = icmp6_hdr(skb2);
switch (icmp6h->icmp6_type) {
switch (icmp6h->icmp6_type) {
case ICMPV6_MGM_REPORT:
{
- struct mld_msg *mld = (struct mld_msg *)icmp6h;
+ struct mld_msg *mld;
+ if (!pskb_may_pull(skb2, sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *)skb_transport_header(skb2);
BR_INPUT_SKB_CB(skb2)->mrouters_only = 1;
err = br_ip6_multicast_add_group(br, port, &mld->mld_mca);
break;
break;
case ICMPV6_MGM_REDUCTION:
{
- struct mld_msg *mld = (struct mld_msg *)icmp6h;
+ struct mld_msg *mld;
+ if (!pskb_may_pull(skb2, sizeof(*mld))) {
+ err = -EINVAL;
+ goto out;
+ }
+ mld = (struct mld_msg *)skb_transport_header(skb2);
br_ip6_multicast_leave_group(br, port, &mld->mld_mca);
}
}
out:
- __skb_push(skb2, offset);
- if (skb2 != skb)
- kfree_skb(skb2);
+ kfree_skb(skb2);
return err;
}
#endif
llc_mac_hdr_init(skb, p->dev->dev_addr, p->br->group_addr);
+ skb_reset_mac_header(skb);
+
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
dev_queue_xmit);
}
struct list_head tx_ops;
unsigned long dropped_usr_msgs;
struct proc_dir_entry *bcm_proc_read;
- char procname [20]; /* pointer printed in ASCII with \0 */
+ char procname [32]; /* inode number in decimal with \0 */
};
static inline struct bcm_sock *bcm_sk(const struct sock *sk)
if (proc_dir) {
/* unique socket address as filename */
- sprintf(bo->procname, "%p", sock);
+ sprintf(bo->procname, "%lu", sock_i_ino(sk));
bo->bcm_proc_read = proc_create_data(bo->procname, 0644,
proc_dir,
&bcm_proc_fops, sk);
}
if (res.type == RTN_LOCAL) {
- if (!fl.fl4_src)
- fl.fl4_src = fl.fl4_dst;
+ if (!fl.fl4_src) {
+ if (res.fi->fib_prefsrc)
+ fl.fl4_src = res.fi->fib_prefsrc;
+ else
+ fl.fl4_src = fl.fl4_dst;
+ }
dev_out = net->loopback_dev;
fl.oif = dev_out->ifindex;
res.fi = NULL;
# The empty.o file is created in the make process in order to determine
# the target endianness and word size. It is made before all other C
# files, including recordmcount.
-cmd_record_mcount = if [ $(@) != "scripts/mod/empty.o" ]; then \
- $(objtree)/scripts/recordmcount "$(@)"; \
- fi;
+sub_cmd_record_mcount = \
+ if [ $(@) != "scripts/mod/empty.o" ]; then \
+ $(objtree)/scripts/recordmcount "$(@)"; \
+ fi;
else
-cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
+sub_cmd_record_mcount = set -e ; perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
"$(if $(CONFIG_CPU_BIG_ENDIAN),big,little)" \
"$(if $(CONFIG_64BIT),64,32)" \
"$(OBJDUMP)" "$(OBJCOPY)" "$(CC) $(KBUILD_CFLAGS)" \
"$(LD)" "$(NM)" "$(RM)" "$(MV)" \
"$(if $(part-of-module),1,0)" "$(@)";
endif
+cmd_record_mcount = \
+ if [ "$(findstring -pg,$(_c_flags))" = "-pg" ]; then \
+ $(sub_cmd_record_mcount) \
+ fi;
endif
define rule_cc_o_c
}
if (current_entry->prompt && current_entry != &rootmenu)
prop_warn(prop, "prompt redefined");
+
+ /* Apply all upper menus' visibilities to actual prompts. */
+ if(type == P_PROMPT) {
+ struct menu *menu = current_entry;
+
+ while ((menu = menu->parent) != NULL) {
+ if (!menu->visibility)
+ continue;
+ prop->visible.expr
+ = expr_alloc_and(prop->visible.expr,
+ menu->visibility);
+ }
+ }
+
current_entry->prompt = prop;
}
prop->text = prompt;
# '@parameter' - name of a parameter
# '%CONST' - name of a constant.
+## init lots of data
+
my $errors = 0;
my $warnings = 0;
my $anon_struct_union = 0;
$type_param, "\$1" );
my $blankline_list = "";
-sub usage {
- print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
- print " [ -no-doc-sections ]\n";
- print " [ -function funcname [ -function funcname ...] ]\n";
- print " [ -nofunction funcname [ -nofunction funcname ...] ]\n";
- print " c source file(s) > outputfile\n";
- print " -v : verbose output, more warnings & other info listed\n";
- exit 1;
-}
-
# read arguments
if ($#ARGV == -1) {
usage();
}
+my $kernelversion;
+my $dohighlight = "";
+
my $verbose = 0;
my $output_mode = "man";
my $no_doc_sections = 0;
'November', 'December')[(localtime)[4]] .
" " . ((localtime)[5]+1900);
-# Essentially these are globals
+# Essentially these are globals.
# They probably want to be tidied up, made more localised or something.
# CAVEAT EMPTOR! Some of the others I localised may not want to be, which
# could cause "use of undefined value" or other bugs.
}
}
+# continue execution near EOF;
+
+sub usage {
+ print "Usage: $0 [ -v ] [ -docbook | -html | -text | -man | -list ]\n";
+ print " [ -no-doc-sections ]\n";
+ print " [ -function funcname [ -function funcname ...] ]\n";
+ print " [ -nofunction funcname [ -nofunction funcname ...] ]\n";
+ print " c source file(s) > outputfile\n";
+ print " -v : verbose output, more warnings & other info listed\n";
+ exit 1;
+}
+
# get kernel version from env
sub get_kernel_version() {
my $version = 'unknown kernel version';
}
return $version;
}
-my $kernelversion = get_kernel_version();
-
-# generate a sequence of code that will splice in highlighting information
-# using the s// operator.
-my $dohighlight = "";
-foreach my $pattern (keys %highlights) {
-# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
- $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
-}
##
# dumps section contents to arrays/hashes intended for that purpose.
});
}
-sub process_file($);
-
-# Read the file that maps relative names to absolute names for
-# separate source and object directories and for shadow trees.
-if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
- my ($relname, $absname);
- while(<SOURCE_MAP>) {
- chop();
- ($relname, $absname) = (split())[0..1];
- $relname =~ s:^/+::;
- $source_map{$relname} = $absname;
- }
- close(SOURCE_MAP);
-}
-
-foreach (@ARGV) {
- chomp;
- process_file($_);
-}
-if ($verbose && $errors) {
- print STDERR "$errors errors\n";
-}
-if ($verbose && $warnings) {
- print STDERR "$warnings warnings\n";
-}
-
-exit($errors);
-
sub reset_state {
$function = "";
%constants = ();
}
}
}
+
+
+$kernelversion = get_kernel_version();
+
+# generate a sequence of code that will splice in highlighting information
+# using the s// operator.
+foreach my $pattern (keys %highlights) {
+# print STDERR "scanning pattern:$pattern, highlight:($highlights{$pattern})\n";
+ $dohighlight .= "\$contents =~ s:$pattern:$highlights{$pattern}:gs;\n";
+}
+
+# Read the file that maps relative names to absolute names for
+# separate source and object directories and for shadow trees.
+if (open(SOURCE_MAP, "<.tmp_filelist.txt")) {
+ my ($relname, $absname);
+ while(<SOURCE_MAP>) {
+ chop();
+ ($relname, $absname) = (split())[0..1];
+ $relname =~ s:^/+::;
+ $source_map{$relname} = $absname;
+ }
+ close(SOURCE_MAP);
+}
+
+foreach (@ARGV) {
+ chomp;
+ process_file($_);
+}
+if ($verbose && $errors) {
+ print STDERR "$errors errors\n";
+}
+if ($verbose && $warnings) {
+ print STDERR "$warnings warnings\n";
+}
+
+exit($errors);
result = security_filter_rule_init(entry->lsm[lsm_rule].type,
Audit_equal, args,
&entry->lsm[lsm_rule].rule);
+ if (!entry->lsm[lsm_rule].rule)
+ return -EINVAL;
return result;
}
int i, n;
for (i = 0; i < num_mixer_volumes; i++) {
- if (strcmp(name, mixer_vols[i].name) == 0) {
+ if (strncmp(name, mixer_vols[i].name, 32) == 0) {
if (present)
mixer_vols[i].num = i;
return mixer_vols[i].levels;
}
n = num_mixer_volumes++;
- strcpy(mixer_vols[n].name, name);
+ strncpy(mixer_vols[n].name, name, 32);
if (present)
mixer_vols[n].num = n;
SND_PCI_QUIRK(0x1028, 0x01cc, "Dell D820", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1028, 0x01de, "Dell Precision 390", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1028, 0x01f6, "Dell Latitude 131L", POS_FIX_LPIB),
+ SND_PCI_QUIRK(0x1028, 0x0470, "Dell Inspiron 1120", POS_FIX_LPIB),
SND_PCI_QUIRK(0x103c, 0x306d, "HP dv3", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1043, 0x813d, "ASUS P5AD2", POS_FIX_LPIB),
SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB),
};
struct max98088_priv {
- u8 reg_cache[M98088_REG_CNT];
enum max98088_type devtype;
void *control_data;
struct max98088_pdata *pdata;
static void max98088_sync_cache(struct snd_soc_codec *codec)
{
- struct max98088_priv *max98088 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int i;
if (!codec->cache_sync)
/* write back cached values if they're writeable and
* different from the hardware default.
*/
- for (i = 1; i < ARRAY_SIZE(max98088->reg_cache); i++) {
+ for (i = 1; i < codec->driver->reg_cache_size; i++) {
if (!max98088_access[i].writable)
continue;
- if (max98088->reg_cache[i] == max98088_reg[i])
+ if (reg_cache[i] == max98088_reg[i])
continue;
- snd_soc_write(codec, i, max98088->reg_cache[i]);
+ snd_soc_write(codec, i, reg_cache[i]);
}
codec->cache_sync = 0;
int ret = 0;
codec->cache_sync = 1;
- memcpy(codec->reg_cache, max98088_reg, sizeof(max98088_reg));
ret = snd_soc_codec_set_cache_io(codec, 8, 8, SND_SOC_I2C);
if (ret != 0) {
/* codec private data */
struct wm8523_priv {
enum snd_soc_control_type control_type;
- u16 reg_cache[WM8523_REGISTER_COUNT];
struct regulator_bulk_data supplies[WM8523_NUM_SUPPLIES];
unsigned int sysclk;
unsigned int rate_constraint_list[WM8523_NUM_RATES];
enum snd_soc_bias_level level)
{
struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int ret, i;
switch (level) {
/* Sync back default/cached values */
for (i = WM8523_AIF_CTRL1;
i < WM8523_MAX_REGISTER; i++)
- snd_soc_write(codec, i, wm8523->reg_cache[i]);
+ snd_soc_write(codec, i, reg_cache[i]);
msleep(100);
static int wm8523_probe(struct snd_soc_codec *codec)
{
struct wm8523_priv *wm8523 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int ret, i;
codec->hw_write = (hw_write_t)i2c_master_send;
}
/* Change some default settings - latch VU and enable ZC */
- wm8523->reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU;
- wm8523->reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC;
+ reg_cache[WM8523_DAC_GAINR] |= WM8523_DACR_VU;
+ reg_cache[WM8523_DAC_CTRL3] |= WM8523_ZC;
wm8523_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
/* codec private data */
struct wm8741_priv {
enum snd_soc_control_type control_type;
- u16 reg_cache[WM8741_REGISTER_COUNT];
struct regulator_bulk_data supplies[WM8741_NUM_SUPPLIES];
unsigned int sysclk;
struct snd_pcm_hw_constraint_list *sysclk_constraints;
static int wm8741_probe(struct snd_soc_codec *codec)
{
struct wm8741_priv *wm8741 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int ret = 0;
ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8741->control_type);
}
/* Change some default settings - latch VU */
- wm8741->reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL;
- wm8741->reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM;
- wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL;
- wm8741->reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERM;
+ reg_cache[WM8741_DACLLSB_ATTENUATION] |= WM8741_UPDATELL;
+ reg_cache[WM8741_DACLMSB_ATTENUATION] |= WM8741_UPDATELM;
+ reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERL;
+ reg_cache[WM8741_DACRLSB_ATTENUATION] |= WM8741_UPDATERM;
snd_soc_add_controls(codec, wm8741_snd_controls,
ARRAY_SIZE(wm8741_snd_controls));
* are using 2 wire for device control, so we cache them instead.
*/
static const u16 wm8753_reg[] = {
- 0x0008, 0x0000, 0x000a, 0x000a,
- 0x0033, 0x0000, 0x0007, 0x00ff,
- 0x00ff, 0x000f, 0x000f, 0x007b,
- 0x0000, 0x0032, 0x0000, 0x00c3,
- 0x00c3, 0x00c0, 0x0000, 0x0000,
+ 0x0000, 0x0008, 0x0000, 0x000a,
+ 0x000a, 0x0033, 0x0000, 0x0007,
+ 0x00ff, 0x00ff, 0x000f, 0x000f,
+ 0x007b, 0x0000, 0x0032, 0x0000,
+ 0x00c3, 0x00c3, 0x00c0, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000,
- 0x0000, 0x0000, 0x0000, 0x0055,
- 0x0005, 0x0050, 0x0055, 0x0050,
- 0x0055, 0x0050, 0x0055, 0x0079,
- 0x0079, 0x0079, 0x0079, 0x0079,
0x0000, 0x0000, 0x0000, 0x0000,
- 0x0097, 0x0097, 0x0000, 0x0004,
- 0x0000, 0x0083, 0x0024, 0x01ba,
- 0x0000, 0x0083, 0x0024, 0x01ba,
- 0x0000, 0x0000, 0x0000
+ 0x0055, 0x0005, 0x0050, 0x0055,
+ 0x0050, 0x0055, 0x0050, 0x0055,
+ 0x0079, 0x0079, 0x0079, 0x0079,
+ 0x0079, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0097, 0x0097, 0x0000,
+ 0x0004, 0x0000, 0x0083, 0x0024,
+ 0x01ba, 0x0000, 0x0083, 0x0024,
+ 0x01ba, 0x0000, 0x0000, 0x0000
};
/* codec private data */
enum snd_soc_control_type control_type;
unsigned int sysclk;
unsigned int pcmclk;
- u16 reg_cache[ARRAY_SIZE(wm8753_reg)];
int dai_func;
};
-/*
- * read wm8753 register cache
- */
-static inline unsigned int wm8753_read_reg_cache(struct snd_soc_codec *codec,
- unsigned int reg)
-{
- u16 *cache = codec->reg_cache;
- if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1))
- return -1;
- return cache[reg - 1];
-}
-
-/*
- * write wm8753 register cache
- */
-static inline void wm8753_write_reg_cache(struct snd_soc_codec *codec,
- unsigned int reg, unsigned int value)
-{
- u16 *cache = codec->reg_cache;
- if (reg < 1 || reg >= (ARRAY_SIZE(wm8753_reg) + 1))
- return;
- cache[reg - 1] = value;
-}
-
-/*
- * write to the WM8753 register space
- */
-static int wm8753_write(struct snd_soc_codec *codec, unsigned int reg,
- unsigned int value)
-{
- u8 data[2];
-
- /* data is
- * D15..D9 WM8753 register offset
- * D8...D0 register data
- */
- data[0] = (reg << 1) | ((value >> 8) & 0x0001);
- data[1] = value & 0x00ff;
-
- wm8753_write_reg_cache(codec, reg, value);
- if (codec->hw_write(codec->control_data, data, 2) == 2)
- return 0;
- else
- return -EIO;
-}
-
-#define wm8753_reset(c) wm8753_write(c, WM8753_RESET, 0)
+#define wm8753_reset(c) snd_soc_write(c, WM8753_RESET, 0)
/*
* WM8753 Controls
struct snd_ctl_elem_value *ucontrol)
{
struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
- int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL);
+ int mode = snd_soc_read(codec, WM8753_IOCTL);
ucontrol->value.integer.value[0] = (mode & 0xc) >> 2;
return 0;
struct snd_ctl_elem_value *ucontrol)
{
struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
- int mode = wm8753_read_reg_cache(codec, WM8753_IOCTL);
+ int mode = snd_soc_read(codec, WM8753_IOCTL);
struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
if (((mode & 0xc) >> 2) == ucontrol->value.integer.value[0])
if (pll_id == WM8753_PLL1) {
offset = 0;
enable = 0x10;
- reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xffef;
+ reg = snd_soc_read(codec, WM8753_CLOCK) & 0xffef;
} else {
offset = 4;
enable = 0x8;
- reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfff7;
+ reg = snd_soc_read(codec, WM8753_CLOCK) & 0xfff7;
}
if (!freq_in || !freq_out) {
/* disable PLL */
- wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0026);
- wm8753_write(codec, WM8753_CLOCK, reg);
+ snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0026);
+ snd_soc_write(codec, WM8753_CLOCK, reg);
return 0;
} else {
u16 value = 0;
/* set up N and K PLL divisor ratios */
/* bits 8:5 = PLL_N, bits 3:0 = PLL_K[21:18] */
value = (pll_div.n << 5) + ((pll_div.k & 0x3c0000) >> 18);
- wm8753_write(codec, WM8753_PLL1CTL2 + offset, value);
+ snd_soc_write(codec, WM8753_PLL1CTL2 + offset, value);
/* bits 8:0 = PLL_K[17:9] */
value = (pll_div.k & 0x03fe00) >> 9;
- wm8753_write(codec, WM8753_PLL1CTL3 + offset, value);
+ snd_soc_write(codec, WM8753_PLL1CTL3 + offset, value);
/* bits 8:0 = PLL_K[8:0] */
value = pll_div.k & 0x0001ff;
- wm8753_write(codec, WM8753_PLL1CTL4 + offset, value);
+ snd_soc_write(codec, WM8753_PLL1CTL4 + offset, value);
/* set PLL as input and enable */
- wm8753_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 |
+ snd_soc_write(codec, WM8753_PLL1CTL1 + offset, 0x0027 |
(pll_div.div2 << 3));
- wm8753_write(codec, WM8753_CLOCK, reg | enable);
+ snd_soc_write(codec, WM8753_CLOCK, reg | enable);
}
return 0;
}
unsigned int fmt)
{
struct snd_soc_codec *codec = codec_dai->codec;
- u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01ec;
+ u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01ec;
/* interface format */
switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
return -EINVAL;
}
- wm8753_write(codec, WM8753_PCM, voice);
+ snd_soc_write(codec, WM8753_PCM, voice);
return 0;
}
struct snd_soc_pcm_runtime *rtd = substream->private_data;
struct snd_soc_codec *codec = rtd->codec;
struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
- u16 voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x01f3;
- u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x017f;
+ u16 voice = snd_soc_read(codec, WM8753_PCM) & 0x01f3;
+ u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x017f;
/* bit size */
switch (params_format(params)) {
/* sample rate */
if (params_rate(params) * 384 == wm8753->pcmclk)
srate |= 0x80;
- wm8753_write(codec, WM8753_SRATE1, srate);
+ snd_soc_write(codec, WM8753_SRATE1, srate);
- wm8753_write(codec, WM8753_PCM, voice);
+ snd_soc_write(codec, WM8753_PCM, voice);
return 0;
}
struct snd_soc_codec *codec = codec_dai->codec;
u16 voice, ioctl;
- voice = wm8753_read_reg_cache(codec, WM8753_PCM) & 0x011f;
- ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x015d;
+ voice = snd_soc_read(codec, WM8753_PCM) & 0x011f;
+ ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x015d;
/* set master/slave audio interface */
switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
return -EINVAL;
}
- wm8753_write(codec, WM8753_PCM, voice);
- wm8753_write(codec, WM8753_IOCTL, ioctl);
+ snd_soc_write(codec, WM8753_PCM, voice);
+ snd_soc_write(codec, WM8753_IOCTL, ioctl);
return 0;
}
switch (div_id) {
case WM8753_PCMDIV:
- reg = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0x003f;
- wm8753_write(codec, WM8753_CLOCK, reg | div);
+ reg = snd_soc_read(codec, WM8753_CLOCK) & 0x003f;
+ snd_soc_write(codec, WM8753_CLOCK, reg | div);
break;
case WM8753_BCLKDIV:
- reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x01c7;
- wm8753_write(codec, WM8753_SRATE2, reg | div);
+ reg = snd_soc_read(codec, WM8753_SRATE2) & 0x01c7;
+ snd_soc_write(codec, WM8753_SRATE2, reg | div);
break;
case WM8753_VXCLKDIV:
- reg = wm8753_read_reg_cache(codec, WM8753_SRATE2) & 0x003f;
- wm8753_write(codec, WM8753_SRATE2, reg | div);
+ reg = snd_soc_read(codec, WM8753_SRATE2) & 0x003f;
+ snd_soc_write(codec, WM8753_SRATE2, reg | div);
break;
default:
return -EINVAL;
unsigned int fmt)
{
struct snd_soc_codec *codec = codec_dai->codec;
- u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01e0;
+ u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01e0;
/* interface format */
switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
return -EINVAL;
}
- wm8753_write(codec, WM8753_HIFI, hifi);
+ snd_soc_write(codec, WM8753_HIFI, hifi);
return 0;
}
struct snd_soc_codec *codec = codec_dai->codec;
u16 ioctl, hifi;
- hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x011f;
- ioctl = wm8753_read_reg_cache(codec, WM8753_IOCTL) & 0x00ae;
+ hifi = snd_soc_read(codec, WM8753_HIFI) & 0x011f;
+ ioctl = snd_soc_read(codec, WM8753_IOCTL) & 0x00ae;
/* set master/slave audio interface */
switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
return -EINVAL;
}
- wm8753_write(codec, WM8753_HIFI, hifi);
- wm8753_write(codec, WM8753_IOCTL, ioctl);
+ snd_soc_write(codec, WM8753_HIFI, hifi);
+ snd_soc_write(codec, WM8753_IOCTL, ioctl);
return 0;
}
struct snd_soc_pcm_runtime *rtd = substream->private_data;
struct snd_soc_codec *codec = rtd->codec;
struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
- u16 srate = wm8753_read_reg_cache(codec, WM8753_SRATE1) & 0x01c0;
- u16 hifi = wm8753_read_reg_cache(codec, WM8753_HIFI) & 0x01f3;
+ u16 srate = snd_soc_read(codec, WM8753_SRATE1) & 0x01c0;
+ u16 hifi = snd_soc_read(codec, WM8753_HIFI) & 0x01f3;
int coeff;
/* is digital filter coefficient valid ? */
printk(KERN_ERR "wm8753 invalid MCLK or rate\n");
return coeff;
}
- wm8753_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) |
+ snd_soc_write(codec, WM8753_SRATE1, srate | (coeff_div[coeff].sr << 1) |
coeff_div[coeff].usb);
/* bit size */
break;
}
- wm8753_write(codec, WM8753_HIFI, hifi);
+ snd_soc_write(codec, WM8753_HIFI, hifi);
return 0;
}
u16 clock;
/* set clk source as pcmclk */
- clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
- wm8753_write(codec, WM8753_CLOCK, clock);
+ clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+ snd_soc_write(codec, WM8753_CLOCK, clock);
if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0)
return -EINVAL;
u16 clock;
/* set clk source as pcmclk */
- clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
- wm8753_write(codec, WM8753_CLOCK, clock);
+ clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+ snd_soc_write(codec, WM8753_CLOCK, clock);
if (wm8753_vdac_adc_set_dai_fmt(codec_dai, fmt) < 0)
return -EINVAL;
u16 clock;
/* set clk source as mclk */
- clock = wm8753_read_reg_cache(codec, WM8753_CLOCK) & 0xfffb;
- wm8753_write(codec, WM8753_CLOCK, clock | 0x4);
+ clock = snd_soc_read(codec, WM8753_CLOCK) & 0xfffb;
+ snd_soc_write(codec, WM8753_CLOCK, clock | 0x4);
if (wm8753_hdac_set_dai_fmt(codec_dai, fmt) < 0)
return -EINVAL;
static int wm8753_mute(struct snd_soc_dai *dai, int mute)
{
struct snd_soc_codec *codec = dai->codec;
- u16 mute_reg = wm8753_read_reg_cache(codec, WM8753_DAC) & 0xfff7;
+ u16 mute_reg = snd_soc_read(codec, WM8753_DAC) & 0xfff7;
struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
/* the digital mute covers the HiFi and Voice DAC's on the WM8753.
* make sure we check if they are not both active when we mute */
if (mute && wm8753->dai_func == 1) {
if (!codec->active)
- wm8753_write(codec, WM8753_DAC, mute_reg | 0x8);
+ snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8);
} else {
if (mute)
- wm8753_write(codec, WM8753_DAC, mute_reg | 0x8);
+ snd_soc_write(codec, WM8753_DAC, mute_reg | 0x8);
else
- wm8753_write(codec, WM8753_DAC, mute_reg);
+ snd_soc_write(codec, WM8753_DAC, mute_reg);
}
return 0;
static int wm8753_set_bias_level(struct snd_soc_codec *codec,
enum snd_soc_bias_level level)
{
- u16 pwr_reg = wm8753_read_reg_cache(codec, WM8753_PWR1) & 0xfe3e;
+ u16 pwr_reg = snd_soc_read(codec, WM8753_PWR1) & 0xfe3e;
switch (level) {
case SND_SOC_BIAS_ON:
/* set vmid to 50k and unmute dac */
- wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x00c0);
+ snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x00c0);
break;
case SND_SOC_BIAS_PREPARE:
/* set vmid to 5k for quick power up */
- wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x01c1);
+ snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x01c1);
break;
case SND_SOC_BIAS_STANDBY:
/* mute dac and set vmid to 500k, enable VREF */
- wm8753_write(codec, WM8753_PWR1, pwr_reg | 0x0141);
+ snd_soc_write(codec, WM8753_PWR1, pwr_reg | 0x0141);
break;
case SND_SOC_BIAS_OFF:
- wm8753_write(codec, WM8753_PWR1, 0x0001);
+ snd_soc_write(codec, WM8753_PWR1, 0x0001);
break;
}
codec->bias_level = level;
else
dai->driver = &wm8753_all_dai[(wm8753->dai_func << 1) + 1];
}
- wm8753_write(codec, WM8753_IOCTL, wm8753->dai_func);
+ snd_soc_write(codec, WM8753_IOCTL, wm8753->dai_func);
}
static void wm8753_work(struct work_struct *work)
static int wm8753_resume(struct snd_soc_codec *codec)
{
+ u16 *reg_cache = codec->reg_cache;
int i;
- u8 data[2];
- u16 *cache = codec->reg_cache;
/* Sync reg_cache with the hardware */
- for (i = 0; i < ARRAY_SIZE(wm8753_reg); i++) {
- if (i + 1 == WM8753_RESET)
+ for (i = 1; i < ARRAY_SIZE(wm8753_reg); i++) {
+ if (i == WM8753_RESET)
continue;
/* No point in writing hardware default values back */
- if (cache[i] == wm8753_reg[i])
+ if (reg_cache[i] == wm8753_reg[i])
continue;
- data[0] = ((i + 1) << 1) | ((cache[i] >> 8) & 0x0001);
- data[1] = cache[i] & 0x00ff;
- codec->hw_write(codec->control_data, data, 2);
+ snd_soc_write(codec, i, reg_cache[i]);
}
wm8753_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
static int wm8753_probe(struct snd_soc_codec *codec)
{
struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec);
- int ret = 0, reg;
+ int ret;
INIT_DELAYED_WORK(&codec->delayed_work, wm8753_work);
msecs_to_jiffies(caps_charge));
/* set the update bits */
- reg = wm8753_read_reg_cache(codec, WM8753_LDAC);
- wm8753_write(codec, WM8753_LDAC, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_RDAC);
- wm8753_write(codec, WM8753_RDAC, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_LADC);
- wm8753_write(codec, WM8753_LADC, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_RADC);
- wm8753_write(codec, WM8753_RADC, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_LOUT1V);
- wm8753_write(codec, WM8753_LOUT1V, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_ROUT1V);
- wm8753_write(codec, WM8753_ROUT1V, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_LOUT2V);
- wm8753_write(codec, WM8753_LOUT2V, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_ROUT2V);
- wm8753_write(codec, WM8753_ROUT2V, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_LINVOL);
- wm8753_write(codec, WM8753_LINVOL, reg | 0x0100);
- reg = wm8753_read_reg_cache(codec, WM8753_RINVOL);
- wm8753_write(codec, WM8753_RINVOL, reg | 0x0100);
+ snd_soc_update_bits(codec, WM8753_LDAC, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_RDAC, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_LDAC, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_RDAC, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_LOUT1V, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_ROUT1V, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_LOUT2V, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_ROUT2V, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_LINVOL, 0x0100, 0x0100);
+ snd_soc_update_bits(codec, WM8753_RINVOL, 0x0100, 0x0100);
snd_soc_add_controls(codec, wm8753_snd_controls,
ARRAY_SIZE(wm8753_snd_controls));
/* codec private data */
struct wm8904_priv {
- u16 reg_cache[WM8904_MAX_REGISTER + 1];
-
enum wm8904_type devtype;
void *control_data;
static void wm8904_sync_cache(struct snd_soc_codec *codec)
{
- struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int i;
if (!codec->cache_sync)
/* Sync back cached values if they're different from the
* hardware default.
*/
- for (i = 1; i < ARRAY_SIZE(wm8904->reg_cache); i++) {
+ for (i = 1; i < codec->driver->reg_cache_size; i++) {
if (!wm8904_access[i].writable)
continue;
- if (wm8904->reg_cache[i] == wm8904_reg[i])
+ if (reg_cache[i] == wm8904_reg[i])
continue;
- snd_soc_write(codec, i, wm8904->reg_cache[i]);
+ snd_soc_write(codec, i, reg_cache[i]);
}
codec->cache_sync = 0;
{
struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec);
struct wm8904_pdata *pdata = wm8904->pdata;
+ u16 *reg_cache = codec->reg_cache;
int ret, i;
codec->cache_sync = 1;
}
/* Change some default settings - latch VU and enable ZC */
- wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU;
- wm8904->reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU;
- wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU;
- wm8904->reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU;
- wm8904->reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU |
+ reg_cache[WM8904_ADC_DIGITAL_VOLUME_LEFT] |= WM8904_ADC_VU;
+ reg_cache[WM8904_ADC_DIGITAL_VOLUME_RIGHT] |= WM8904_ADC_VU;
+ reg_cache[WM8904_DAC_DIGITAL_VOLUME_LEFT] |= WM8904_DAC_VU;
+ reg_cache[WM8904_DAC_DIGITAL_VOLUME_RIGHT] |= WM8904_DAC_VU;
+ reg_cache[WM8904_ANALOGUE_OUT1_LEFT] |= WM8904_HPOUT_VU |
WM8904_HPOUTLZC;
- wm8904->reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU |
+ reg_cache[WM8904_ANALOGUE_OUT1_RIGHT] |= WM8904_HPOUT_VU |
WM8904_HPOUTRZC;
- wm8904->reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU |
+ reg_cache[WM8904_ANALOGUE_OUT2_LEFT] |= WM8904_LINEOUT_VU |
WM8904_LINEOUTLZC;
- wm8904->reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU |
+ reg_cache[WM8904_ANALOGUE_OUT2_RIGHT] |= WM8904_LINEOUT_VU |
WM8904_LINEOUTRZC;
- wm8904->reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE;
+ reg_cache[WM8904_CLOCK_RATES_0] &= ~WM8904_SR_MODE;
/* Apply configuration from the platform data. */
if (wm8904->pdata) {
if (!pdata->gpio_cfg[i])
continue;
- wm8904->reg_cache[WM8904_GPIO_CONTROL_1 + i]
+ reg_cache[WM8904_GPIO_CONTROL_1 + i]
= pdata->gpio_cfg[i] & 0xffff;
}
/* Zero is the default value for these anyway */
for (i = 0; i < WM8904_MIC_REGS; i++)
- wm8904->reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i]
+ reg_cache[WM8904_MIC_BIAS_CONTROL_0 + i]
= pdata->mic_cfg[i];
}
/* Set Class W by default - this will be managed by the Class
* G widget at runtime where bypass paths are available.
*/
- wm8904->reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR;
+ reg_cache[WM8904_CLASS_W_0] |= WM8904_CP_DYN_PWR;
/* Use normal bias source */
- wm8904->reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL;
+ reg_cache[WM8904_BIAS_CONTROL_0] &= ~WM8904_POBCTRL;
wm8904_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
i2c_set_clientdata(i2c, wm8940);
wm8940->control_data = i2c;
+ wm8940->control_type = SND_SOC_I2C;
ret = snd_soc_register_codec(&i2c->dev,
&soc_codec_dev_wm8940, &wm8940_dai, 1);
struct wm8955_priv {
enum snd_soc_control_type control_type;
- u16 reg_cache[WM8955_MAX_REGISTER + 1];
-
unsigned int mclk_rate;
int deemph;
enum snd_soc_bias_level level)
{
struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int ret, i;
switch (level) {
/* Sync back cached values if they're
* different from the hardware default.
*/
- for (i = 0; i < ARRAY_SIZE(wm8955->reg_cache); i++) {
+ for (i = 0; i < codec->driver->reg_cache_size; i++) {
if (i == WM8955_RESET)
continue;
- if (wm8955->reg_cache[i] == wm8955_reg[i])
+ if (reg_cache[i] == wm8955_reg[i])
continue;
- snd_soc_write(codec, i, wm8955->reg_cache[i]);
+ snd_soc_write(codec, i, reg_cache[i]);
}
/* Enable VREF and VMID */
{
struct wm8955_priv *wm8955 = snd_soc_codec_get_drvdata(codec);
struct wm8955_pdata *pdata = dev_get_platdata(codec->dev);
+ u16 *reg_cache = codec->reg_cache;
int ret, i;
ret = snd_soc_codec_set_cache_io(codec, 7, 9, wm8955->control_type);
}
/* Change some default settings - latch VU and enable ZC */
- wm8955->reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU;
- wm8955->reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU;
- wm8955->reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC;
- wm8955->reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC;
- wm8955->reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC;
- wm8955->reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC;
- wm8955->reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC;
+ reg_cache[WM8955_LEFT_DAC_VOLUME] |= WM8955_LDVU;
+ reg_cache[WM8955_RIGHT_DAC_VOLUME] |= WM8955_RDVU;
+ reg_cache[WM8955_LOUT1_VOLUME] |= WM8955_LO1VU | WM8955_LO1ZC;
+ reg_cache[WM8955_ROUT1_VOLUME] |= WM8955_RO1VU | WM8955_RO1ZC;
+ reg_cache[WM8955_LOUT2_VOLUME] |= WM8955_LO2VU | WM8955_LO2ZC;
+ reg_cache[WM8955_ROUT2_VOLUME] |= WM8955_RO2VU | WM8955_RO2ZC;
+ reg_cache[WM8955_MONOOUT_VOLUME] |= WM8955_MOZC;
/* Also enable adaptive bass boost by default */
- wm8955->reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB;
+ reg_cache[WM8955_BASS_CONTROL] |= WM8955_BB;
/* Set platform data values */
if (pdata) {
if (pdata->out2_speaker)
- wm8955->reg_cache[WM8955_ADDITIONAL_CONTROL_2]
+ reg_cache[WM8955_ADDITIONAL_CONTROL_2]
|= WM8955_ROUT2INV;
if (pdata->monoin_diff)
- wm8955->reg_cache[WM8955_MONO_OUT_MIX_1]
+ reg_cache[WM8955_MONO_OUT_MIX_1]
|= WM8955_DMEN;
}
return -ENOMEM;
i2c_set_clientdata(i2c, wm8955);
+ wm8955->control_type = SND_SOC_I2C;
ret = snd_soc_register_codec(&i2c->dev,
&soc_codec_dev_wm8955, &wm8955_dai, 1);
return -ENOMEM;
i2c_set_clientdata(i2c, wm8960);
+ wm8960->control_type = SND_SOC_I2C;
wm8960->control_data = i2c;
ret = snd_soc_register_codec(&i2c->dev,
struct wm8962_priv {
struct snd_soc_codec *codec;
- u16 reg_cache[WM8962_MAX_REGISTER + 1];
-
int sysclk;
int sysclk_rate;
struct snd_ctl_elem_value *ucontrol)
{
struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
- struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
- u16 *reg_cache = wm8962->reg_cache;
+ u16 *reg_cache = codec->reg_cache;
int ret;
/* Apply the update (if any) */
struct snd_ctl_elem_value *ucontrol)
{
struct snd_soc_codec *codec = snd_kcontrol_chip(kcontrol);
- struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
- u16 *reg_cache = wm8962->reg_cache;
+ u16 *reg_cache = codec->reg_cache;
int ret;
/* Apply the update (if any) */
struct snd_kcontrol *kcontrol, int event)
{
struct snd_soc_codec *codec = w->codec;
- struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
- u16 *reg_cache = wm8962->reg_cache;
+ u16 *reg_cache = codec->reg_cache;
int reg;
switch (w->shift) {
static void wm8962_sync_cache(struct snd_soc_codec *codec)
{
- struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int i;
if (!codec->cache_sync)
/* Sync back cached values if they're different from the
* hardware default.
*/
- for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) {
+ for (i = 1; i < codec->driver->reg_cache_size; i++) {
if (i == WM8962_SOFTWARE_RESET)
continue;
- if (wm8962->reg_cache[i] == wm8962_reg[i])
+ if (reg_cache[i] == wm8962_reg[i])
continue;
- snd_soc_write(codec, i, wm8962->reg_cache[i]);
+ snd_soc_write(codec, i, reg_cache[i]);
}
codec->cache_sync = 0;
#ifdef CONFIG_PM
static int wm8962_resume(struct snd_soc_codec *codec)
{
- struct wm8962_priv *wm8962 = snd_soc_codec_get_drvdata(codec);
u16 *reg_cache = codec->reg_cache;
int i;
/* Restore the registers */
- for (i = 1; i < ARRAY_SIZE(wm8962->reg_cache); i++) {
+ for (i = 1; i < codec->driver->reg_cache_size; i++) {
switch (i) {
case WM8962_SOFTWARE_RESET:
continue;
struct wm8962_pdata *pdata = dev_get_platdata(codec->dev);
struct i2c_client *i2c = container_of(codec->dev, struct i2c_client,
dev);
+ u16 *reg_cache = codec->reg_cache;
int i, trigger, irq_pol;
wm8962->codec = codec;
/* Put the speakers into mono mode? */
if (pdata->spk_mono)
- wm8962->reg_cache[WM8962_CLASS_D_CONTROL_2]
+ reg_cache[WM8962_CLASS_D_CONTROL_2]
|= WM8962_SPK_MONO;
/* Micbias setup, detection enable and detection
}
/* Latch volume update bits */
- wm8962->reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU;
- wm8962->reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU;
- wm8962->reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU;
- wm8962->reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU;
- wm8962->reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU;
- wm8962->reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU;
- wm8962->reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU;
- wm8962->reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU;
- wm8962->reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU;
- wm8962->reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU;
+ reg_cache[WM8962_LEFT_INPUT_VOLUME] |= WM8962_IN_VU;
+ reg_cache[WM8962_RIGHT_INPUT_VOLUME] |= WM8962_IN_VU;
+ reg_cache[WM8962_LEFT_ADC_VOLUME] |= WM8962_ADC_VU;
+ reg_cache[WM8962_RIGHT_ADC_VOLUME] |= WM8962_ADC_VU;
+ reg_cache[WM8962_LEFT_DAC_VOLUME] |= WM8962_DAC_VU;
+ reg_cache[WM8962_RIGHT_DAC_VOLUME] |= WM8962_DAC_VU;
+ reg_cache[WM8962_SPKOUTL_VOLUME] |= WM8962_SPKOUT_VU;
+ reg_cache[WM8962_SPKOUTR_VOLUME] |= WM8962_SPKOUT_VU;
+ reg_cache[WM8962_HPOUTL_VOLUME] |= WM8962_HPOUT_VU;
+ reg_cache[WM8962_HPOUTR_VOLUME] |= WM8962_HPOUT_VU;
wm8962_add_widgets(codec);
if (wm8971 == NULL)
return -ENOMEM;
+ wm8971->control_type = SND_SOC_I2C;
i2c_set_clientdata(i2c, wm8971);
ret = snd_soc_register_codec(&i2c->dev,
return -ENOMEM;
i2c_set_clientdata(i2c, wm9081);
+ wm9081->control_type = SND_SOC_I2C;
wm9081->control_data = i2c;
ret = snd_soc_register_codec(&i2c->dev,
/* This struct is used to save the context */
struct wm9090_priv {
struct mutex mutex;
- u16 reg_cache[WM9090_MAX_REGISTER + 1];
struct wm9090_platform_data pdata;
void *control_data;
};
static int wm9090_probe(struct snd_soc_codec *codec)
{
struct wm9090_priv *wm9090 = snd_soc_codec_get_drvdata(codec);
+ u16 *reg_cache = codec->reg_cache;
int ret;
codec->control_data = wm9090->control_data;
/* Configure some defaults; they will be written out when we
* bring the bias up.
*/
- wm9090->reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU
+ reg_cache[WM9090_IN1_LINE_INPUT_A_VOLUME] |= WM9090_IN1_VU
| WM9090_IN1A_ZC;
- wm9090->reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU
+ reg_cache[WM9090_IN1_LINE_INPUT_B_VOLUME] |= WM9090_IN1_VU
| WM9090_IN1B_ZC;
- wm9090->reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU
+ reg_cache[WM9090_IN2_LINE_INPUT_A_VOLUME] |= WM9090_IN2_VU
| WM9090_IN2A_ZC;
- wm9090->reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU
+ reg_cache[WM9090_IN2_LINE_INPUT_B_VOLUME] |= WM9090_IN2_VU
| WM9090_IN2B_ZC;
- wm9090->reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |=
+ reg_cache[WM9090_SPEAKER_VOLUME_LEFT] |=
WM9090_SPKOUT_VU | WM9090_SPKOUTL_ZC;
- wm9090->reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |=
+ reg_cache[WM9090_LEFT_OUTPUT_VOLUME] |=
WM9090_HPOUT1_VU | WM9090_HPOUT1L_ZC;
- wm9090->reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |=
+ reg_cache[WM9090_RIGHT_OUTPUT_VOLUME] |=
WM9090_HPOUT1_VU | WM9090_HPOUT1R_ZC;
- wm9090->reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA;
+ reg_cache[WM9090_CLOCKING_1] |= WM9090_TOCLK_ENA;
wm9090_set_bias_level(codec, SND_SOC_BIAS_STANDBY);
--input=::
Input file name. (default: perf.data)
+-d::
+--dsos=<dso[,dso...]>::
+ Only consider symbols in these dsos.
+-s::
+--symbol=<symbol>::
+ Symbol to annotate.
+
+-f::
+--force::
+ Don't complain, do it.
+
+-v::
+--verbose::
+ Be more verbose. (Show symbol address, etc)
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-k::
+--vmlinux=<file>::
+ vmlinux pathname.
+
+-m::
+--modules::
+ Load module symbols. WARNING: use only with -k and LIVE kernel.
+
+-l::
+--print-line::
+ Print matching source lines (may be slow).
+
+-P::
+--full-paths::
+ Don't shorten the displayed pathnames.
+
--stdio:: Use the stdio interface.
--tui:: Use the TUI interface Use of --tui requires a tty, if one is not
present, as when piping to other commands, the stdio interface is
used. This interfaces starts by centering on the line with more
- samples, TAB/UNTAB cycles thru the lines with more samples.
+ samples, TAB/UNTAB cycles through the lines with more samples.
SEE ALSO
--------
OPTIONS
-------
+-H::
+--with-hits::
+ Show only DSOs with hits.
-i::
--input=::
Input file name. (default: perf.data)
OPTIONS
-------
+-M::
+--displacement::
+ Show position displacement relative to baseline.
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-m::
+--modules::
+ Load module symbols. WARNING: use only with -k and LIVE kernel
+
-d::
--dsos=::
Only consider symbols in these dsos. CSV that understands
--field-separator=::
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
-v::
Be verbose, for instance, show the raw counts in addition to the
diff.
+-f::
+--force::
+ Don't complain, do it.
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
SEE ALSO
--------
linkperf:perf-record[1]
a performance counter profile of guest os in realtime
of an arbitrary workload.
- 'perf kvm record <command>' to record the performance couinter profile
+ 'perf kvm record <command>' to record the performance counter profile
of an arbitrary workload and save it into a perf data file. If both
--host and --guest are input, the perf data file name is perf.data.kvm.
If there is no --host but --guest, the file name is perf.data.guest.
OPTIONS
-------
+-i::
+--input=::
+ Input file name.
+-o::
+--output::
+ Output file name.
--host=::
Collect host side performance profile.
--guest=::
'perf lock report' reports statistical data.
+OPTIONS
+-------
+
+-i::
+--input=<file>::
+ Input file name.
+
+-v::
+--verbose::
+ Be more verbose (show symbol address, etc).
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
SEE ALSO
--------
linkperf:perf[1]
LINE SYNTAX
-----------
-Line range is descripted by following syntax.
+Line range is described by following syntax.
- "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
+ "FUNC[:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
FUNC specifies the function name of showing lines. 'RLN' is the start line
number from function entry line, and 'RLN2' is the end line number. As same as
be passed as follows: '\mem:addr[:[r][w][x]]'.
If you want to profile read-write accesses in 0x1000, just set
'mem:0x1000:rw'.
+
+--filter=<filter>::
+ Event filter.
+
-a::
- System-wide collection.
+--all-cpus::
+ System-wide collection from all CPUs.
-l::
Scale counter values.
-p::
--pid=::
- Record events on existing pid.
+ Record events on existing process ID.
+
+-t::
+--tid=::
+ Record events on existing thread ID.
-r::
--realtime=::
--data::
Sample addresses.
+-T::
+--timestamp::
+ Sample timestamps. Use it with 'perf report -D' to see the timestamps,
+ for instance.
+
-n::
--no-samples::
Don't sample.
-C::
--cpu::
-Collect samples only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
In per-thread mode with inheritance mode on (default), samples are captured only when
the thread executes on the designated CPUs. Default is to monitor all CPUs.
-i::
--input=::
Input file name. (default: perf.data)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-d::
--dsos=::
Only consider symbols in these dsos. CSV that understands
-n::
--show-nr-samples::
Show the number of samples for each symbol
+
+--showcpuutilization::
+ Show sample percentage for different cpu modes.
+
-T::
--threads::
Show per-thread event counters
Only consider these symbols. CSV that understands
file://filename entries.
+-U::
+--hide-unresolved::
+ Only display entries resolved to a symbol.
+
-s::
--sort=::
Sort by key(s): pid, comm, dso, symbol, parent.
+-p::
+--parent=<regex>::
+ regex filter to identify parent, see: '--sort parent'
+
+-x::
+--exclude-other::
+ Only display entries with parent-match.
+
-w::
---field-width=::
+--column-widths=<width[,width...]>::
Force each column width to the provided list, for large terminal
readability.
--field-separator=::
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
-g [type,min]::
--call-graph::
- Display callchains using type and min percent threshold.
+ Display call chains using type and min percent threshold.
type can be either:
- - flat: single column, linear exposure of callchains.
+ - flat: single column, linear exposure of call chains.
- graph: use a graph tree, displaying absolute overhead rates.
- fractal: like graph, but displays relative rates. Each branch of
the tree is considered as a new profiled object. +
Default: fractal,0.5.
+--pretty=<key>::
+ Pretty printing style. key: normal, raw
+
--stdio:: Use the stdio interface.
--tui:: Use the TUI interface, that is integrated with annotate and allows
requires a tty, if one is not present, as when piping to other
commands, the stdio interface is used.
+-k::
+--vmlinux=<file>::
+ vmlinux pathname
+
+--kallsyms=<file>::
+ kallsyms pathname
+
+-m::
+--modules::
+ Load module symbols. WARNING: This should only be used with -k and
+ a LIVE kernel.
+
+-f::
+--force::
+ Don't complain, do it.
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
SEE ALSO
--------
linkperf:perf-stat[1]
SYNOPSIS
--------
[verse]
-'perf sched' {record|latency|replay|trace}
+'perf sched' {record|latency|map|replay|trace}
DESCRIPTION
-----------
-There are four variants of perf sched:
+There are five variants of perf sched:
'perf sched record <command>' to record the scheduling events
of an arbitrary workload.
of the workload as it occurred when it was recorded - and can repeat
it a number of times, measuring its performance.)
+ 'perf sched map' to print a textual context-switching outline of
+ workload captured via perf sched record. Columns stand for
+ individual CPUs, and the two-letter shortcuts stand for tasks that
+ are running on a CPU. A '*' denotes the CPU that had the event, and
+ a dot signals an idle CPU.
+
OPTIONS
-------
+-i::
+--input=<file>::
+ Input file name. (default: perf.data)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-D::
--dump-raw-trace=::
Display verbose dump of the sched data.
--- /dev/null
+perf-script-perl(1)
+==================
+
+NAME
+----
+perf-script-perl - Process trace data with a Perl script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Perl]:script[.pl] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Perl interpreter. It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Perl script, if any.
+
+STARTER SCRIPTS
+---------------
+
+You can avoid reading the rest of this document by running 'perf script
+-g perl' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/perl for typical examples showing how to
+do basic things like aggregate event data, print results, etc. Also,
+the check-perf-script.pl script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace. If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_handled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakep event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+ field:unsigned short common_type;
+ field:unsigned char common_flags;
+ field:unsigned char common_preempt_count;
+ field:int common_pid;
+ field:int common_lock_depth;
+
+ field:char comm[TASK_COMM_LEN];
+ field:pid_t pid;
+ field:int prio;
+ field:int success;
+ field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+sub sched::sched_wakeup
+{
+ my ($event_name, $context, $common_cpu, $common_secs,
+ $common_nsecs, $common_pid, $common_comm,
+ $comm, $pid, $prio, $success, $target_cpu) = @_;
+}
+----
+
+The handler function takes the form subsystem::event_name.
+
+The $common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ $event_name the name of the event as text
+ $context an opaque 'cookie' used in calls back into perf
+ $common_cpu the cpu the event occurred on
+ $common_secs the secs portion of the event timestamp
+ $common_nsecs the nsecs portion of the event timestamp
+ $common_pid the pid of the current task
+ $common_comm the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script. The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Perl script should start by setting up a Perl module
+search path and 'use'ing a few support modules (see module
+descriptions below):
+
+----
+ use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/perf-script-Util/lib";
+ use lib "./perf-script-Util/lib";
+ use Perf::Trace::Core;
+ use Perf::Trace::Context;
+ use Perf::Trace::Util;
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+ sub trace_begin
+ {
+ }
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+sub trace_end
+{
+}
+----
+
+*trace_unhandled*, if defined, is called after for any event that
+ doesn't have a handler explicitly defined for it. The standard set
+ of common arguments are passed into it:
+
+----
+sub trace_unhandled
+{
+ my ($event_name, $context, $common_cpu, $common_secs,
+ $common_nsecs, $common_pid, $common_comm) = @_;
+}
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Perl modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various Perf::Trace::* Perl modules. To use the functions and
+variables from the given module, add the corresponding 'use
+Perf::Trace::XXX' line to your perf script script.
+
+Perf::Trace::Core Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields. These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+ flag_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the flag field $field_name of event $event_name
+ symbol_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the symbolic field $field_name of event $event_name
+
+Perf::Trace::Context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+Perf::Trace::Context defines a set of functions that can be used to
+access this data in the context of the current event. Each of these
+functions expects a $context variable, which is the same as the
+$context variable passed into every event handler as the second
+argument.
+
+ common_pc($context) - returns common_preempt count for the current event
+ common_flags($context) - returns common_flags for the current event
+ common_lock_depth($context) - returns common_lock_depth for the current event
+
+Perf::Trace::Util Module
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+ nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
+ nsecs_secs($nsecs) - returns whole secs portion given nsecs
+ nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
+ nsecs_str($nsecs) - returns printable string in the form secs.nsecs
+ avg($total, $n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
--- /dev/null
+perf-script-python(1)
+====================
+
+NAME
+----
+perf-script-python - Process trace data with a Python script
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [-s [Python]:script[.py] ]
+
+DESCRIPTION
+-----------
+
+This perf script option is used to process perf script data using perf's
+built-in Python interpreter. It reads and processes the input file and
+displays the results of the trace analysis implemented in the given
+Python script, if any.
+
+A QUICK EXAMPLE
+---------------
+
+This section shows the process, start to finish, of creating a working
+Python script that aggregates and extracts useful information from a
+raw perf script stream. You can avoid reading the rest of this
+document if an example is enough for you; the rest of the document
+provides more details on each step and lists the library functions
+available to script writers.
+
+This example actually details the steps that were used to create the
+'syscall-counts' script you see when you list the available perf script
+scripts via 'perf script -l'. As such, this script also shows how to
+integrate your script into the list of general-purpose 'perf script'
+scripts listed by that command.
+
+The syscall-counts script is a simple script, but demonstrates all the
+basic ideas necessary to create a useful script. Here's an example
+of its output (syscall names are not yet supported, they will appear
+as numbers):
+
+----
+syscall events:
+
+event count
+---------------------------------------- -----------
+sys_write 455067
+sys_getdents 4072
+sys_close 3037
+sys_swapoff 1769
+sys_read 923
+sys_sched_setparam 826
+sys_open 331
+sys_newfstat 326
+sys_mmap 217
+sys_munmap 216
+sys_futex 141
+sys_select 102
+sys_poll 84
+sys_setitimer 12
+sys_writev 8
+15 8
+sys_lseek 7
+sys_rt_sigprocmask 6
+sys_wait4 3
+sys_ioctl 3
+sys_set_robust_list 1
+sys_exit 1
+56 1
+sys_access 1
+----
+
+Basically our task is to keep a per-syscall tally that gets updated
+every time a system call occurs in the system. Our script will do
+that, but first we need to record the data that will be processed by
+that script. Theoretically, there are a couple of ways we could do
+that:
+
+- we could enable every event under the tracing/events/syscalls
+ directory, but this is over 600 syscalls, well beyond the number
+ allowable by perf. These individual syscall events will however be
+ useful if we want to later use the guidance we get from the
+ general-purpose scripts to drill down and get more detail about
+ individual syscalls of interest.
+
+- we can enable the sys_enter and/or sys_exit syscalls found under
+ tracing/events/raw_syscalls. These are called for all syscalls; the
+ 'id' field can be used to distinguish between individual syscall
+ numbers.
+
+For this script, we only need to know that a syscall was entered; we
+don't care how it exited, so we'll use 'perf record' to record only
+the sys_enter events:
+
+----
+# perf record -a -e raw_syscalls:sys_enter
+
+^C[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
+----
+
+The options basically say to collect data for every syscall event
+system-wide and multiplex the per-cpu output into a single stream.
+That single stream will be recorded in a file in the current directory
+called perf.data.
+
+Once we have a perf.data file containing our data, we can use the -g
+'perf script' option to generate a Python script that will contain a
+callback handler for each event type found in the perf.data trace
+stream (for more details, see the STARTER SCRIPTS section).
+
+----
+# perf script -g python
+generated Python script: perf-script.py
+
+The output file created also in the current directory is named
+perf-script.py. Here's the file in its entirety:
+
+# perf script event handlers, generated by perf script -g python
+# Licensed under the terms of the GNU GPL License version 2
+
+# The common_* event handler fields are the most useful fields common to
+# all events. They don't necessarily correspond to the 'common_*' fields
+# in the format files. Those fields not available as handler params can
+# be retrieved using Python functions of the form common_*(context).
+# See the perf-script-python Documentation for the list of available functions.
+
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_begin():
+ print "in trace_begin"
+
+def trace_end():
+ print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+ common_secs, common_nsecs, common_pid, common_comm,
+ id, args):
+ print_header(event_name, common_cpu, common_secs, common_nsecs,
+ common_pid, common_comm)
+
+ print "id=%d, args=%s\n" % \
+ (id, args),
+
+def trace_unhandled(event_name, context, common_cpu, common_secs, common_nsecs,
+ common_pid, common_comm):
+ print_header(event_name, common_cpu, common_secs, common_nsecs,
+ common_pid, common_comm)
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+ print "%-20s %5u %05u.%09u %8u %-20s " % \
+ (event_name, cpu, secs, nsecs, pid, comm),
+----
+
+At the top is a comment block followed by some import statements and a
+path append which every perf script script should include.
+
+Following that are a couple generated functions, trace_begin() and
+trace_end(), which are called at the beginning and the end of the
+script respectively (for more details, see the SCRIPT_LAYOUT section
+below).
+
+Following those are the 'event handler' functions generated one for
+every event in the 'perf record' output. The handler functions take
+the form subsystem__event_name, and contain named parameters, one for
+each field in the event; in this case, there's only one event,
+raw_syscalls__sys_enter(). (see the EVENT HANDLERS section below for
+more info on event handlers).
+
+The final couple of functions are, like the begin and end functions,
+generated for every script. The first, trace_unhandled(), is called
+every time the script finds an event in the perf.data file that
+doesn't correspond to any event handler in the script. This could
+mean either that the record step recorded event types that it wasn't
+really interested in, or the script was run against a trace file that
+doesn't correspond to the script.
+
+The script generated by -g option simply prints a line for each
+event found in the trace stream i.e. it basically just dumps the event
+and its parameter values to stdout. The print_header() function is
+simply a utility function used for that purpose. Let's rename the
+script and run it to see the default output:
+
+----
+# mv perf-script.py syscall-counts.py
+# perf script -s syscall-counts.py
+
+raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args=
+raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args=
+raw_syscalls__sys_enter 1 00840.847620860 7506 perf id=1, args=
+raw_syscalls__sys_enter 1 00840.847710478 6533 npviewer.bin id=78, args=
+raw_syscalls__sys_enter 1 00840.847719204 6533 npviewer.bin id=142, args=
+raw_syscalls__sys_enter 1 00840.847755445 6533 npviewer.bin id=3, args=
+raw_syscalls__sys_enter 1 00840.847775601 6533 npviewer.bin id=3, args=
+raw_syscalls__sys_enter 1 00840.847781820 6533 npviewer.bin id=3, args=
+.
+.
+.
+----
+
+Of course, for this script, we're not interested in printing every
+trace event, but rather aggregating it in a useful way. So we'll get
+rid of everything to do with printing as well as the trace_begin() and
+trace_unhandled() functions, which we won't be using. That leaves us
+with this minimalistic skeleton:
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def trace_end():
+ print "in trace_end"
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+ common_secs, common_nsecs, common_pid, common_comm,
+ id, args):
+----
+
+In trace_end(), we'll simply print the results, but first we need to
+generate some results to print. To do that we need to have our
+sys_enter() handler do the necessary tallying until all events have
+been counted. A hash table indexed by syscall id is a good way to
+store that information; every time the sys_enter() handler is called,
+we simply increment a count associated with that hash entry indexed by
+that syscall id:
+
+----
+ syscalls = autodict()
+
+ try:
+ syscalls[id] += 1
+ except TypeError:
+ syscalls[id] = 1
+----
+
+The syscalls 'autodict' object is a special kind of Python dictionary
+(implemented in Core.py) that implements Perl's 'autovivifying' hashes
+in Python i.e. with autovivifying hashes, you can assign nested hash
+values without having to go to the trouble of creating intermediate
+levels if they don't exist e.g syscalls[comm][pid][id] = 1 will create
+the intermediate hash levels and finally assign the value 1 to the
+hash entry for 'id' (because the value being assigned isn't a hash
+object itself, the initial value is assigned in the TypeError
+exception. Well, there may be a better way to do this in Python but
+that's what works for now).
+
+Putting that code into the raw_syscalls__sys_enter() handler, we
+effectively end up with a single-level dictionary keyed on syscall id
+and having the counts we've tallied as values.
+
+The print_syscall_totals() function iterates over the entries in the
+dictionary and displays a line for each entry containing the syscall
+name (the dictonary keys contain the syscall ids, which are passed to
+the Util function syscall_name(), which translates the raw syscall
+numbers to the corresponding syscall name strings). The output is
+displayed after all the events in the trace have been processed, by
+calling the print_syscall_totals() function from the trace_end()
+handler called at the end of script processing.
+
+The final script producing the output shown above is shown in its
+entirety below (syscall_name() helper is not yet available, you can
+only deal with id's for now):
+
+----
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+syscalls = autodict()
+
+def trace_end():
+ print_syscall_totals()
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+ common_secs, common_nsecs, common_pid, common_comm,
+ id, args):
+ try:
+ syscalls[id] += 1
+ except TypeError:
+ syscalls[id] = 1
+
+def print_syscall_totals():
+ if for_comm is not None:
+ print "\nsyscall events for %s:\n\n" % (for_comm),
+ else:
+ print "\nsyscall events:\n\n",
+
+ print "%-40s %10s\n" % ("event", "count"),
+ print "%-40s %10s\n" % ("----------------------------------------", \
+ "-----------"),
+
+ for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+ reverse = True):
+ print "%-40s %10d\n" % (syscall_name(id), val),
+----
+
+The script can be run just as before:
+
+ # perf script -s syscall-counts.py
+
+So those are the essential steps in writing and running a script. The
+process can be generalized to any tracepoint or set of tracepoints
+you're interested in - basically find the tracepoint(s) you're
+interested in by looking at the list of available events shown by
+'perf list' and/or look in /sys/kernel/debug/tracing events for
+detailed event and field info, record the corresponding trace data
+using 'perf record', passing it the list of interesting events,
+generate a skeleton script using 'perf script -g python' and modify the
+code to aggregate and display it for your particular needs.
+
+After you've done that you may end up with a general-purpose script
+that you want to keep around and have available for future use. By
+writing a couple of very simple shell scripts and putting them in the
+right place, you can have your script listed alongside the other
+scripts listed by the 'perf script -l' command e.g.:
+
+----
+root@tropicana:~# perf script -l
+List of available trace scripts:
+ workqueue-stats workqueue stats (ins/exe/create/destroy)
+ wakeup-latency system-wide min/max/avg wakeup latency
+ rw-by-file <comm> r/w activity for a program, by file
+ rw-by-pid system-wide r/w activity
+----
+
+A nice side effect of doing this is that you also then capture the
+probably lengthy 'perf record' command needed to record the events for
+the script.
+
+To have the script appear as a 'built-in' script, you write two simple
+scripts, one for recording and one for 'reporting'.
+
+The 'record' script is a shell script with the same base name as your
+script, but with -record appended. The shell script should be put
+into the perf/scripts/python/bin directory in the kernel source tree.
+In that script, you write the 'perf record' command-line needed for
+your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
+
+#!/bin/bash
+perf record -a -e raw_syscalls:sys_enter
+----
+
+The 'report' script is also a shell script with the same base name as
+your script, but with -report appended. It should also be located in
+the perf/scripts/python/bin directory. In that script, you write the
+'perf script -s' command-line needed for running your script:
+
+----
+# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
+
+#!/bin/bash
+# description: system-wide syscall counts
+perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
+----
+
+Note that the location of the Python script given in the shell script
+is in the libexec/perf-core/scripts/python directory - this is where
+the script will be copied by 'make install' when you install perf.
+For the installation to install your script there, your script needs
+to be located in the perf/scripts/python directory in the kernel
+source tree:
+
+----
+# ls -al kernel-source/tools/perf/scripts/python
+
+root@tropicana:/home/trz/src/tip# ls -al tools/perf/scripts/python
+total 32
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
+drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
+drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
+-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
+drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 perf-script-Util
+-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
+----
+
+Once you've done that (don't forget to do a new 'make install',
+otherwise your script won't show up at run-time), 'perf script -l'
+should show a new entry for your script:
+
+----
+root@tropicana:~# perf script -l
+List of available trace scripts:
+ workqueue-stats workqueue stats (ins/exe/create/destroy)
+ wakeup-latency system-wide min/max/avg wakeup latency
+ rw-by-file <comm> r/w activity for a program, by file
+ rw-by-pid system-wide r/w activity
+ syscall-counts system-wide syscall counts
+----
+
+You can now perform the record step via 'perf script record':
+
+ # perf script record syscall-counts
+
+and display the output using 'perf script report':
+
+ # perf script report syscall-counts
+
+STARTER SCRIPTS
+---------------
+
+You can quickly get started writing a script for a particular set of
+trace data by generating a skeleton script using 'perf script -g
+python' in the same directory as an existing perf.data trace file.
+That will generate a starter script containing a handler for each of
+the event types in the trace file; it simply prints every available
+field for each event in the trace file.
+
+You can also look at the existing scripts in
+~/libexec/perf-core/scripts/python for typical examples showing how to
+do basic things like aggregate event data, print results, etc. Also,
+the check-perf-script.py script, while not interesting for its results,
+attempts to exercise all of the main scripting features.
+
+EVENT HANDLERS
+--------------
+
+When perf script is invoked using a trace script, a user-defined
+'handler function' is called for each event in the trace. If there's
+no handler function defined for a given event type, the event is
+ignored (or passed to a 'trace_handled' function, see below) and the
+next event is processed.
+
+Most of the event's field values are passed as arguments to the
+handler function; some of the less common ones aren't - those are
+available as calls back into the perf executable (see below).
+
+As an example, the following perf record command can be used to record
+all sched_wakeup events in the system:
+
+ # perf record -a -e sched:sched_wakeup
+
+Traces meant to be processed using a script should be recorded with
+the above option: -a to enable system-wide collection.
+
+The format file for the sched_wakep event defines the following fields
+(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
+
+----
+ format:
+ field:unsigned short common_type;
+ field:unsigned char common_flags;
+ field:unsigned char common_preempt_count;
+ field:int common_pid;
+ field:int common_lock_depth;
+
+ field:char comm[TASK_COMM_LEN];
+ field:pid_t pid;
+ field:int prio;
+ field:int success;
+ field:int target_cpu;
+----
+
+The handler function for this event would be defined as:
+
+----
+def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
+ common_nsecs, common_pid, common_comm,
+ comm, pid, prio, success, target_cpu):
+ pass
+----
+
+The handler function takes the form subsystem__event_name.
+
+The common_* arguments in the handler's argument list are the set of
+arguments passed to all event handlers; some of the fields correspond
+to the common_* fields in the format file, but some are synthesized,
+and some of the common_* fields aren't common enough to to be passed
+to every event as arguments but are available as library functions.
+
+Here's a brief description of each of the invariant event args:
+
+ event_name the name of the event as text
+ context an opaque 'cookie' used in calls back into perf
+ common_cpu the cpu the event occurred on
+ common_secs the secs portion of the event timestamp
+ common_nsecs the nsecs portion of the event timestamp
+ common_pid the pid of the current task
+ common_comm the name of the current process
+
+All of the remaining fields in the event's format file have
+counterparts as handler function arguments of the same name, as can be
+seen in the example above.
+
+The above provides the basics needed to directly access every field of
+every event in a trace, which covers 90% of what you need to know to
+write a useful trace script. The sections below cover the rest.
+
+SCRIPT LAYOUT
+-------------
+
+Every perf script Python script should start by setting up a Python
+module search path and 'import'ing a few support modules (see module
+descriptions below):
+
+----
+ import os
+ import sys
+
+ sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+ '/scripts/python/perf-script-Util/lib/Perf/Trace')
+
+ from perf_trace_context import *
+ from Core import *
+----
+
+The rest of the script can contain handler functions and support
+functions in any order.
+
+Aside from the event handler functions discussed above, every script
+can implement a set of optional functions:
+
+*trace_begin*, if defined, is called before any event is processed and
+gives scripts a chance to do setup tasks:
+
+----
+def trace_begin:
+ pass
+----
+
+*trace_end*, if defined, is called after all events have been
+ processed and gives scripts a chance to do end-of-script tasks, such
+ as display results:
+
+----
+def trace_end:
+ pass
+----
+
+*trace_unhandled*, if defined, is called after for any event that
+ doesn't have a handler explicitly defined for it. The standard set
+ of common arguments are passed into it:
+
+----
+def trace_unhandled(event_name, context, common_cpu, common_secs,
+ common_nsecs, common_pid, common_comm):
+ pass
+----
+
+The remaining sections provide descriptions of each of the available
+built-in perf script Python modules and their associated functions.
+
+AVAILABLE MODULES AND FUNCTIONS
+-------------------------------
+
+The following sections describe the functions and variables available
+via the various perf script Python modules. To use the functions and
+variables from the given module, add the corresponding 'from XXXX
+import' line to your perf script script.
+
+Core.py Module
+~~~~~~~~~~~~~~
+
+These functions provide some essential functions to user scripts.
+
+The *flag_str* and *symbol_str* functions provide human-readable
+strings for flag and symbolic fields. These correspond to the strings
+and values parsed from the 'print fmt' fields of the event format
+files:
+
+ flag_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the flag field field_name of event event_name
+ symbol_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the symbolic field field_name of event event_name
+
+The *autodict* function returns a special kind of Python
+dictionary that implements Perl's 'autovivifying' hashes in Python
+i.e. with autovivifying hashes, you can assign nested hash values
+without having to go to the trouble of creating intermediate levels if
+they don't exist.
+
+ autodict() - returns an autovivifying dictionary instance
+
+
+perf_trace_context Module
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the 'common' fields in the event format file aren't all that
+common, but need to be made accessible to user scripts nonetheless.
+
+perf_trace_context defines a set of functions that can be used to
+access this data in the context of the current event. Each of these
+functions expects a context variable, which is the same as the
+context variable passed into every event handler as the second
+argument.
+
+ common_pc(context) - returns common_preempt count for the current event
+ common_flags(context) - returns common_flags for the current event
+ common_lock_depth(context) - returns common_lock_depth for the current event
+
+Util.py Module
+~~~~~~~~~~~~~~
+
+Various utility functions for use with perf script:
+
+ nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
+ nsecs_secs(nsecs) - returns whole secs portion given nsecs
+ nsecs_nsecs(nsecs) - returns nsecs remainder given nsecs
+ nsecs_str(nsecs) - returns printable string in the form secs.nsecs
+ avg(total, n) - returns average given a sum and a total number of values
+
+SEE ALSO
+--------
+linkperf:perf-script[1]
--- /dev/null
+perf-script(1)
+=============
+
+NAME
+----
+perf-script - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [<options>]
+'perf script' [<options>] record <script> [<record-options>] <command>
+'perf script' [<options>] report <script> [script-args]
+'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
+'perf script' [<options>] <top-script> [script-args]
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+There are several variants of perf script:
+
+ 'perf script' to see a detailed trace of the workload that was
+ recorded.
+
+ You can also run a set of pre-canned scripts that aggregate and
+ summarize the raw trace data in various ways (the list of scripts is
+ available via 'perf script -l'). The following variants allow you to
+ record and run those scripts:
+
+ 'perf script record <script> <command>' to record the events required
+ for 'perf script report'. <script> is the name displayed in the
+ output of 'perf script --list' i.e. the actual script name minus any
+ language extension. If <command> is not specified, the events are
+ recorded using the -a (system-wide) 'perf record' option.
+
+ 'perf script report <script> [args]' to run and display the results
+ of <script>. <script> is the name displayed in the output of 'perf
+ trace --list' i.e. the actual script name minus any language
+ extension. The perf.data output from a previous run of 'perf script
+ record <script>' is used and should be present for this command to
+ succeed. [args] refers to the (mainly optional) args expected by
+ the script.
+
+ 'perf script <script> <required-script-args> <command>' to both
+ record the events required for <script> and to run the <script>
+ using 'live-mode' i.e. without writing anything to disk. <script>
+ is the name displayed in the output of 'perf script --list' i.e. the
+ actual script name minus any language extension. If <command> is
+ not specified, the events are recorded using the -a (system-wide)
+ 'perf record' option. If <script> has any required args, they
+ should be specified before <command>. This mode doesn't allow for
+ optional script args to be specified; if optional script args are
+ desired, they can be specified using separate 'perf script record'
+ and 'perf script report' commands, with the stdout of the record step
+ piped to the stdin of the report script, using the '-o -' and '-i -'
+ options of the corresponding commands.
+
+ 'perf script <top-script>' to both record the events required for
+ <top-script> and to run the <top-script> using 'live-mode'
+ i.e. without writing anything to disk. <top-script> is the name
+ displayed in the output of 'perf script --list' i.e. the actual
+ script name minus any language extension; a <top-script> is defined
+ as any script name ending with the string 'top'.
+
+ [<record-options>] can be passed to the record steps of 'perf script
+ record' and 'live-mode' variants; this isn't possible however for
+ <top-script> 'live-mode' or 'perf script report' variants.
+
+ See the 'SEE ALSO' section for links to language-specific
+ information on how to write and run your own trace scripts.
+
+OPTIONS
+-------
+<command>...::
+ Any command you can specify in a shell.
+
+-D::
+--dump-raw-script=::
+ Display verbose dump of the trace data.
+
+-L::
+--Latency=::
+ Show latency attributes (irqs/preemption disabled, etc).
+
+-l::
+--list=::
+ Display a list of available trace scripts.
+
+-s ['lang']::
+--script=::
+ Process trace data with the given script ([lang]:script[.ext]).
+ If the string 'lang' is specified in place of a script name, a
+ list of supported languages will be displayed instead.
+
+-g::
+--gen-script=::
+ Generate perf-script.[ext] starter script for given language,
+ using current perf.data.
+
+-a::
+ Force system-wide collection. Scripts run without a <command>
+ normally use -a by default, while scripts run with a <command>
+ normally don't - this option allows the latter to be run in
+ system-wide mode.
+
+-i::
+--input=::
+ Input file name.
+
+-d::
+--debug-mode::
+ Do various checks like samples ordering and lost events.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-script-perl[1],
+linkperf:perf-script-python[1]
SYNOPSIS
--------
[verse]
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command>
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
DESCRIPTION
-----------
child tasks do not inherit counters
-p::
--pid=<pid>::
- stat events on existing pid
+ stat events on existing process id
+
+-t::
+--tid=<tid>::
+ stat events on existing thread id
+
-a::
- system-wide collection
+--all-cpus::
+ system-wide collection from all CPUs
-c::
- scale counter values
+--scale::
+ scale/normalize counter values
+
+-r::
+--repeat=<n>::
+ repeat command and print average + stddev (max: 100)
-B::
+--big-num::
print large numbers with thousands' separators according to locale
-C::
--cpu=::
-Count only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Count only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
In per-thread mode, this option is ignored. The -a option is still necessary
to activate system-wide monitoring. Default is to count on all CPUs.
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
+This option is only valid in system-wide mode.
+
+-n::
+--null::
+ null run - don't start any counters
+
+-v::
+--verbose::
+ be more verbose (show counter open errors, etc)
+
+-x SEP::
+--field-separator SEP::
+print counts using a CSV-style output to make it easy to import directly into
+spreadsheets. Columns are separated by the string specified in SEP.
+
EXAMPLES
--------
DESCRIPTION
-----------
-This command does assorted sanity tests, initially thru linked routines but
+This command does assorted sanity tests, initially through linked routines but
also will look for a directory with more tests in the form of scripts.
OPTIONS
--process::
Select the processes to display, by name or PID
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
SEE ALSO
--------
DESCRIPTION
-----------
-This command generates and displays a performance counter profile in realtime.
+This command generates and displays a performance counter profile in real time.
OPTIONS
-C <cpu-list>::
--cpu=<cpu>::
-Monitor only on the list of cpus provided. Multiple CPUs can be provided as a
-comma-sperated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
Default is to monitor all CPUS.
-d <seconds>::
--count-filter=<count>::
Only display functions with more events than this.
+-g::
+--group::
+ Put the counters into a counter group.
+
-F <freq>::
--freq=<freq>::
Profile at this frequency.
-p <pid>::
--pid=<pid>::
- Profile events on existing pid.
+ Profile events on existing Process ID.
+
+-t <tid>::
+--tid=<tid>::
+ Profile events on existing thread ID.
-r <priority>::
--realtime=<priority>::
--sym-annotate=<symbol>::
Annotate this symbol.
+-K::
+--hide_kernel_symbols::
+ Hide kernel symbols.
+
+-U::
+--hide_user_symbols::
+ Hide user symbols.
+
+-D::
+--dump-symtab::
+ Dump the symbol table used for profiling.
+
-v::
--verbose::
Be more verbose (show counter open errors, etc).
+++ /dev/null
-perf-trace-perl(1)
-==================
-
-NAME
-----
-perf-trace-perl - Process trace data with a Perl script
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [-s [Perl]:script[.pl] ]
-
-DESCRIPTION
------------
-
-This perf trace option is used to process perf trace data using perf's
-built-in Perl interpreter. It reads and processes the input file and
-displays the results of the trace analysis implemented in the given
-Perl script, if any.
-
-STARTER SCRIPTS
----------------
-
-You can avoid reading the rest of this document by running 'perf trace
--g perl' in the same directory as an existing perf.data trace file.
-That will generate a starter script containing a handler for each of
-the event types in the trace file; it simply prints every available
-field for each event in the trace file.
-
-You can also look at the existing scripts in
-~/libexec/perf-core/scripts/perl for typical examples showing how to
-do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.pl script, while not interesting for its results,
-attempts to exercise all of the main scripting features.
-
-EVENT HANDLERS
---------------
-
-When perf trace is invoked using a trace script, a user-defined
-'handler function' is called for each event in the trace. If there's
-no handler function defined for a given event type, the event is
-ignored (or passed to a 'trace_handled' function, see below) and the
-next event is processed.
-
-Most of the event's field values are passed as arguments to the
-handler function; some of the less common ones aren't - those are
-available as calls back into the perf executable (see below).
-
-As an example, the following perf record command can be used to record
-all sched_wakeup events in the system:
-
- # perf record -a -e sched:sched_wakeup
-
-Traces meant to be processed using a script should be recorded with
-the above option: -a to enable system-wide collection.
-
-The format file for the sched_wakep event defines the following fields
-(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
-
-----
- format:
- field:unsigned short common_type;
- field:unsigned char common_flags;
- field:unsigned char common_preempt_count;
- field:int common_pid;
- field:int common_lock_depth;
-
- field:char comm[TASK_COMM_LEN];
- field:pid_t pid;
- field:int prio;
- field:int success;
- field:int target_cpu;
-----
-
-The handler function for this event would be defined as:
-
-----
-sub sched::sched_wakeup
-{
- my ($event_name, $context, $common_cpu, $common_secs,
- $common_nsecs, $common_pid, $common_comm,
- $comm, $pid, $prio, $success, $target_cpu) = @_;
-}
-----
-
-The handler function takes the form subsystem::event_name.
-
-The $common_* arguments in the handler's argument list are the set of
-arguments passed to all event handlers; some of the fields correspond
-to the common_* fields in the format file, but some are synthesized,
-and some of the common_* fields aren't common enough to to be passed
-to every event as arguments but are available as library functions.
-
-Here's a brief description of each of the invariant event args:
-
- $event_name the name of the event as text
- $context an opaque 'cookie' used in calls back into perf
- $common_cpu the cpu the event occurred on
- $common_secs the secs portion of the event timestamp
- $common_nsecs the nsecs portion of the event timestamp
- $common_pid the pid of the current task
- $common_comm the name of the current process
-
-All of the remaining fields in the event's format file have
-counterparts as handler function arguments of the same name, as can be
-seen in the example above.
-
-The above provides the basics needed to directly access every field of
-every event in a trace, which covers 90% of what you need to know to
-write a useful trace script. The sections below cover the rest.
-
-SCRIPT LAYOUT
--------------
-
-Every perf trace Perl script should start by setting up a Perl module
-search path and 'use'ing a few support modules (see module
-descriptions below):
-
-----
- use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
- use lib "./Perf-Trace-Util/lib";
- use Perf::Trace::Core;
- use Perf::Trace::Context;
- use Perf::Trace::Util;
-----
-
-The rest of the script can contain handler functions and support
-functions in any order.
-
-Aside from the event handler functions discussed above, every script
-can implement a set of optional functions:
-
-*trace_begin*, if defined, is called before any event is processed and
-gives scripts a chance to do setup tasks:
-
-----
- sub trace_begin
- {
- }
-----
-
-*trace_end*, if defined, is called after all events have been
- processed and gives scripts a chance to do end-of-script tasks, such
- as display results:
-
-----
-sub trace_end
-{
-}
-----
-
-*trace_unhandled*, if defined, is called after for any event that
- doesn't have a handler explicitly defined for it. The standard set
- of common arguments are passed into it:
-
-----
-sub trace_unhandled
-{
- my ($event_name, $context, $common_cpu, $common_secs,
- $common_nsecs, $common_pid, $common_comm) = @_;
-}
-----
-
-The remaining sections provide descriptions of each of the available
-built-in perf trace Perl modules and their associated functions.
-
-AVAILABLE MODULES AND FUNCTIONS
--------------------------------
-
-The following sections describe the functions and variables available
-via the various Perf::Trace::* Perl modules. To use the functions and
-variables from the given module, add the corresponding 'use
-Perf::Trace::XXX' line to your perf trace script.
-
-Perf::Trace::Core Module
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-These functions provide some essential functions to user scripts.
-
-The *flag_str* and *symbol_str* functions provide human-readable
-strings for flag and symbolic fields. These correspond to the strings
-and values parsed from the 'print fmt' fields of the event format
-files:
-
- flag_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the flag field $field_name of event $event_name
- symbol_str($event_name, $field_name, $field_value) - returns the string represention corresponding to $field_value for the symbolic field $field_name of event $event_name
-
-Perf::Trace::Context Module
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some of the 'common' fields in the event format file aren't all that
-common, but need to be made accessible to user scripts nonetheless.
-
-Perf::Trace::Context defines a set of functions that can be used to
-access this data in the context of the current event. Each of these
-functions expects a $context variable, which is the same as the
-$context variable passed into every event handler as the second
-argument.
-
- common_pc($context) - returns common_preempt count for the current event
- common_flags($context) - returns common_flags for the current event
- common_lock_depth($context) - returns common_lock_depth for the current event
-
-Perf::Trace::Util Module
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-Various utility functions for use with perf trace:
-
- nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
- nsecs_secs($nsecs) - returns whole secs portion given nsecs
- nsecs_nsecs($nsecs) - returns nsecs remainder given nsecs
- nsecs_str($nsecs) - returns printable string in the form secs.nsecs
- avg($total, $n) - returns average given a sum and a total number of values
-
-SEE ALSO
---------
-linkperf:perf-trace[1]
+++ /dev/null
-perf-trace-python(1)
-====================
-
-NAME
-----
-perf-trace-python - Process trace data with a Python script
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [-s [Python]:script[.py] ]
-
-DESCRIPTION
------------
-
-This perf trace option is used to process perf trace data using perf's
-built-in Python interpreter. It reads and processes the input file and
-displays the results of the trace analysis implemented in the given
-Python script, if any.
-
-A QUICK EXAMPLE
----------------
-
-This section shows the process, start to finish, of creating a working
-Python script that aggregates and extracts useful information from a
-raw perf trace stream. You can avoid reading the rest of this
-document if an example is enough for you; the rest of the document
-provides more details on each step and lists the library functions
-available to script writers.
-
-This example actually details the steps that were used to create the
-'syscall-counts' script you see when you list the available perf trace
-scripts via 'perf trace -l'. As such, this script also shows how to
-integrate your script into the list of general-purpose 'perf trace'
-scripts listed by that command.
-
-The syscall-counts script is a simple script, but demonstrates all the
-basic ideas necessary to create a useful script. Here's an example
-of its output (syscall names are not yet supported, they will appear
-as numbers):
-
-----
-syscall events:
-
-event count
----------------------------------------- -----------
-sys_write 455067
-sys_getdents 4072
-sys_close 3037
-sys_swapoff 1769
-sys_read 923
-sys_sched_setparam 826
-sys_open 331
-sys_newfstat 326
-sys_mmap 217
-sys_munmap 216
-sys_futex 141
-sys_select 102
-sys_poll 84
-sys_setitimer 12
-sys_writev 8
-15 8
-sys_lseek 7
-sys_rt_sigprocmask 6
-sys_wait4 3
-sys_ioctl 3
-sys_set_robust_list 1
-sys_exit 1
-56 1
-sys_access 1
-----
-
-Basically our task is to keep a per-syscall tally that gets updated
-every time a system call occurs in the system. Our script will do
-that, but first we need to record the data that will be processed by
-that script. Theoretically, there are a couple of ways we could do
-that:
-
-- we could enable every event under the tracing/events/syscalls
- directory, but this is over 600 syscalls, well beyond the number
- allowable by perf. These individual syscall events will however be
- useful if we want to later use the guidance we get from the
- general-purpose scripts to drill down and get more detail about
- individual syscalls of interest.
-
-- we can enable the sys_enter and/or sys_exit syscalls found under
- tracing/events/raw_syscalls. These are called for all syscalls; the
- 'id' field can be used to distinguish between individual syscall
- numbers.
-
-For this script, we only need to know that a syscall was entered; we
-don't care how it exited, so we'll use 'perf record' to record only
-the sys_enter events:
-
-----
-# perf record -a -e raw_syscalls:sys_enter
-
-^C[ perf record: Woken up 1 times to write data ]
-[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
-----
-
-The options basically say to collect data for every syscall event
-system-wide and multiplex the per-cpu output into a single stream.
-That single stream will be recorded in a file in the current directory
-called perf.data.
-
-Once we have a perf.data file containing our data, we can use the -g
-'perf trace' option to generate a Python script that will contain a
-callback handler for each event type found in the perf.data trace
-stream (for more details, see the STARTER SCRIPTS section).
-
-----
-# perf trace -g python
-generated Python script: perf-trace.py
-
-The output file created also in the current directory is named
-perf-trace.py. Here's the file in its entirety:
-
-# perf trace event handlers, generated by perf trace -g python
-# Licensed under the terms of the GNU GPL License version 2
-
-# The common_* event handler fields are the most useful fields common to
-# all events. They don't necessarily correspond to the 'common_*' fields
-# in the format files. Those fields not available as handler params can
-# be retrieved using Python functions of the form common_*(context).
-# See the perf-trace-python Documentation for the list of available functions.
-
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-
-def trace_begin():
- print "in trace_begin"
-
-def trace_end():
- print "in trace_end"
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
- common_secs, common_nsecs, common_pid, common_comm,
- id, args):
- print_header(event_name, common_cpu, common_secs, common_nsecs,
- common_pid, common_comm)
-
- print "id=%d, args=%s\n" % \
- (id, args),
-
-def trace_unhandled(event_name, context, common_cpu, common_secs, common_nsecs,
- common_pid, common_comm):
- print_header(event_name, common_cpu, common_secs, common_nsecs,
- common_pid, common_comm)
-
-def print_header(event_name, cpu, secs, nsecs, pid, comm):
- print "%-20s %5u %05u.%09u %8u %-20s " % \
- (event_name, cpu, secs, nsecs, pid, comm),
-----
-
-At the top is a comment block followed by some import statements and a
-path append which every perf trace script should include.
-
-Following that are a couple generated functions, trace_begin() and
-trace_end(), which are called at the beginning and the end of the
-script respectively (for more details, see the SCRIPT_LAYOUT section
-below).
-
-Following those are the 'event handler' functions generated one for
-every event in the 'perf record' output. The handler functions take
-the form subsystem__event_name, and contain named parameters, one for
-each field in the event; in this case, there's only one event,
-raw_syscalls__sys_enter(). (see the EVENT HANDLERS section below for
-more info on event handlers).
-
-The final couple of functions are, like the begin and end functions,
-generated for every script. The first, trace_unhandled(), is called
-every time the script finds an event in the perf.data file that
-doesn't correspond to any event handler in the script. This could
-mean either that the record step recorded event types that it wasn't
-really interested in, or the script was run against a trace file that
-doesn't correspond to the script.
-
-The script generated by -g option simply prints a line for each
-event found in the trace stream i.e. it basically just dumps the event
-and its parameter values to stdout. The print_header() function is
-simply a utility function used for that purpose. Let's rename the
-script and run it to see the default output:
-
-----
-# mv perf-trace.py syscall-counts.py
-# perf trace -s syscall-counts.py
-
-raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args=
-raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args=
-raw_syscalls__sys_enter 1 00840.847620860 7506 perf id=1, args=
-raw_syscalls__sys_enter 1 00840.847710478 6533 npviewer.bin id=78, args=
-raw_syscalls__sys_enter 1 00840.847719204 6533 npviewer.bin id=142, args=
-raw_syscalls__sys_enter 1 00840.847755445 6533 npviewer.bin id=3, args=
-raw_syscalls__sys_enter 1 00840.847775601 6533 npviewer.bin id=3, args=
-raw_syscalls__sys_enter 1 00840.847781820 6533 npviewer.bin id=3, args=
-.
-.
-.
-----
-
-Of course, for this script, we're not interested in printing every
-trace event, but rather aggregating it in a useful way. So we'll get
-rid of everything to do with printing as well as the trace_begin() and
-trace_unhandled() functions, which we won't be using. That leaves us
-with this minimalistic skeleton:
-
-----
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-
-def trace_end():
- print "in trace_end"
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
- common_secs, common_nsecs, common_pid, common_comm,
- id, args):
-----
-
-In trace_end(), we'll simply print the results, but first we need to
-generate some results to print. To do that we need to have our
-sys_enter() handler do the necessary tallying until all events have
-been counted. A hash table indexed by syscall id is a good way to
-store that information; every time the sys_enter() handler is called,
-we simply increment a count associated with that hash entry indexed by
-that syscall id:
-
-----
- syscalls = autodict()
-
- try:
- syscalls[id] += 1
- except TypeError:
- syscalls[id] = 1
-----
-
-The syscalls 'autodict' object is a special kind of Python dictionary
-(implemented in Core.py) that implements Perl's 'autovivifying' hashes
-in Python i.e. with autovivifying hashes, you can assign nested hash
-values without having to go to the trouble of creating intermediate
-levels if they don't exist e.g syscalls[comm][pid][id] = 1 will create
-the intermediate hash levels and finally assign the value 1 to the
-hash entry for 'id' (because the value being assigned isn't a hash
-object itself, the initial value is assigned in the TypeError
-exception. Well, there may be a better way to do this in Python but
-that's what works for now).
-
-Putting that code into the raw_syscalls__sys_enter() handler, we
-effectively end up with a single-level dictionary keyed on syscall id
-and having the counts we've tallied as values.
-
-The print_syscall_totals() function iterates over the entries in the
-dictionary and displays a line for each entry containing the syscall
-name (the dictonary keys contain the syscall ids, which are passed to
-the Util function syscall_name(), which translates the raw syscall
-numbers to the corresponding syscall name strings). The output is
-displayed after all the events in the trace have been processed, by
-calling the print_syscall_totals() function from the trace_end()
-handler called at the end of script processing.
-
-The final script producing the output shown above is shown in its
-entirety below (syscall_name() helper is not yet available, you can
-only deal with id's for now):
-
-----
-import os
-import sys
-
-sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
-from perf_trace_context import *
-from Core import *
-from Util import *
-
-syscalls = autodict()
-
-def trace_end():
- print_syscall_totals()
-
-def raw_syscalls__sys_enter(event_name, context, common_cpu,
- common_secs, common_nsecs, common_pid, common_comm,
- id, args):
- try:
- syscalls[id] += 1
- except TypeError:
- syscalls[id] = 1
-
-def print_syscall_totals():
- if for_comm is not None:
- print "\nsyscall events for %s:\n\n" % (for_comm),
- else:
- print "\nsyscall events:\n\n",
-
- print "%-40s %10s\n" % ("event", "count"),
- print "%-40s %10s\n" % ("----------------------------------------", \
- "-----------"),
-
- for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
- reverse = True):
- print "%-40s %10d\n" % (syscall_name(id), val),
-----
-
-The script can be run just as before:
-
- # perf trace -s syscall-counts.py
-
-So those are the essential steps in writing and running a script. The
-process can be generalized to any tracepoint or set of tracepoints
-you're interested in - basically find the tracepoint(s) you're
-interested in by looking at the list of available events shown by
-'perf list' and/or look in /sys/kernel/debug/tracing events for
-detailed event and field info, record the corresponding trace data
-using 'perf record', passing it the list of interesting events,
-generate a skeleton script using 'perf trace -g python' and modify the
-code to aggregate and display it for your particular needs.
-
-After you've done that you may end up with a general-purpose script
-that you want to keep around and have available for future use. By
-writing a couple of very simple shell scripts and putting them in the
-right place, you can have your script listed alongside the other
-scripts listed by the 'perf trace -l' command e.g.:
-
-----
-root@tropicana:~# perf trace -l
-List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
- wakeup-latency system-wide min/max/avg wakeup latency
- rw-by-file <comm> r/w activity for a program, by file
- rw-by-pid system-wide r/w activity
-----
-
-A nice side effect of doing this is that you also then capture the
-probably lengthy 'perf record' command needed to record the events for
-the script.
-
-To have the script appear as a 'built-in' script, you write two simple
-scripts, one for recording and one for 'reporting'.
-
-The 'record' script is a shell script with the same base name as your
-script, but with -record appended. The shell script should be put
-into the perf/scripts/python/bin directory in the kernel source tree.
-In that script, you write the 'perf record' command-line needed for
-your script:
-
-----
-# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
-
-#!/bin/bash
-perf record -a -e raw_syscalls:sys_enter
-----
-
-The 'report' script is also a shell script with the same base name as
-your script, but with -report appended. It should also be located in
-the perf/scripts/python/bin directory. In that script, you write the
-'perf trace -s' command-line needed for running your script:
-
-----
-# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
-
-#!/bin/bash
-# description: system-wide syscall counts
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py
-----
-
-Note that the location of the Python script given in the shell script
-is in the libexec/perf-core/scripts/python directory - this is where
-the script will be copied by 'make install' when you install perf.
-For the installation to install your script there, your script needs
-to be located in the perf/scripts/python directory in the kernel
-source tree:
-
-----
-# ls -al kernel-source/tools/perf/scripts/python
-
-root@tropicana:/home/trz/src/tip# ls -al tools/perf/scripts/python
-total 32
-drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
-drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
-drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
--rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py
-drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
--rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
-----
-
-Once you've done that (don't forget to do a new 'make install',
-otherwise your script won't show up at run-time), 'perf trace -l'
-should show a new entry for your script:
-
-----
-root@tropicana:~# perf trace -l
-List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
- wakeup-latency system-wide min/max/avg wakeup latency
- rw-by-file <comm> r/w activity for a program, by file
- rw-by-pid system-wide r/w activity
- syscall-counts system-wide syscall counts
-----
-
-You can now perform the record step via 'perf trace record':
-
- # perf trace record syscall-counts
-
-and display the output using 'perf trace report':
-
- # perf trace report syscall-counts
-
-STARTER SCRIPTS
----------------
-
-You can quickly get started writing a script for a particular set of
-trace data by generating a skeleton script using 'perf trace -g
-python' in the same directory as an existing perf.data trace file.
-That will generate a starter script containing a handler for each of
-the event types in the trace file; it simply prints every available
-field for each event in the trace file.
-
-You can also look at the existing scripts in
-~/libexec/perf-core/scripts/python for typical examples showing how to
-do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.py script, while not interesting for its results,
-attempts to exercise all of the main scripting features.
-
-EVENT HANDLERS
---------------
-
-When perf trace is invoked using a trace script, a user-defined
-'handler function' is called for each event in the trace. If there's
-no handler function defined for a given event type, the event is
-ignored (or passed to a 'trace_handled' function, see below) and the
-next event is processed.
-
-Most of the event's field values are passed as arguments to the
-handler function; some of the less common ones aren't - those are
-available as calls back into the perf executable (see below).
-
-As an example, the following perf record command can be used to record
-all sched_wakeup events in the system:
-
- # perf record -a -e sched:sched_wakeup
-
-Traces meant to be processed using a script should be recorded with
-the above option: -a to enable system-wide collection.
-
-The format file for the sched_wakep event defines the following fields
-(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
-
-----
- format:
- field:unsigned short common_type;
- field:unsigned char common_flags;
- field:unsigned char common_preempt_count;
- field:int common_pid;
- field:int common_lock_depth;
-
- field:char comm[TASK_COMM_LEN];
- field:pid_t pid;
- field:int prio;
- field:int success;
- field:int target_cpu;
-----
-
-The handler function for this event would be defined as:
-
-----
-def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
- common_nsecs, common_pid, common_comm,
- comm, pid, prio, success, target_cpu):
- pass
-----
-
-The handler function takes the form subsystem__event_name.
-
-The common_* arguments in the handler's argument list are the set of
-arguments passed to all event handlers; some of the fields correspond
-to the common_* fields in the format file, but some are synthesized,
-and some of the common_* fields aren't common enough to to be passed
-to every event as arguments but are available as library functions.
-
-Here's a brief description of each of the invariant event args:
-
- event_name the name of the event as text
- context an opaque 'cookie' used in calls back into perf
- common_cpu the cpu the event occurred on
- common_secs the secs portion of the event timestamp
- common_nsecs the nsecs portion of the event timestamp
- common_pid the pid of the current task
- common_comm the name of the current process
-
-All of the remaining fields in the event's format file have
-counterparts as handler function arguments of the same name, as can be
-seen in the example above.
-
-The above provides the basics needed to directly access every field of
-every event in a trace, which covers 90% of what you need to know to
-write a useful trace script. The sections below cover the rest.
-
-SCRIPT LAYOUT
--------------
-
-Every perf trace Python script should start by setting up a Python
-module search path and 'import'ing a few support modules (see module
-descriptions below):
-
-----
- import os
- import sys
-
- sys.path.append(os.environ['PERF_EXEC_PATH'] + \
- '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
-
- from perf_trace_context import *
- from Core import *
-----
-
-The rest of the script can contain handler functions and support
-functions in any order.
-
-Aside from the event handler functions discussed above, every script
-can implement a set of optional functions:
-
-*trace_begin*, if defined, is called before any event is processed and
-gives scripts a chance to do setup tasks:
-
-----
-def trace_begin:
- pass
-----
-
-*trace_end*, if defined, is called after all events have been
- processed and gives scripts a chance to do end-of-script tasks, such
- as display results:
-
-----
-def trace_end:
- pass
-----
-
-*trace_unhandled*, if defined, is called after for any event that
- doesn't have a handler explicitly defined for it. The standard set
- of common arguments are passed into it:
-
-----
-def trace_unhandled(event_name, context, common_cpu, common_secs,
- common_nsecs, common_pid, common_comm):
- pass
-----
-
-The remaining sections provide descriptions of each of the available
-built-in perf trace Python modules and their associated functions.
-
-AVAILABLE MODULES AND FUNCTIONS
--------------------------------
-
-The following sections describe the functions and variables available
-via the various perf trace Python modules. To use the functions and
-variables from the given module, add the corresponding 'from XXXX
-import' line to your perf trace script.
-
-Core.py Module
-~~~~~~~~~~~~~~
-
-These functions provide some essential functions to user scripts.
-
-The *flag_str* and *symbol_str* functions provide human-readable
-strings for flag and symbolic fields. These correspond to the strings
-and values parsed from the 'print fmt' fields of the event format
-files:
-
- flag_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the flag field field_name of event event_name
- symbol_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the symbolic field field_name of event event_name
-
-The *autodict* function returns a special kind of Python
-dictionary that implements Perl's 'autovivifying' hashes in Python
-i.e. with autovivifying hashes, you can assign nested hash values
-without having to go to the trouble of creating intermediate levels if
-they don't exist.
-
- autodict() - returns an autovivifying dictionary instance
-
-
-perf_trace_context Module
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some of the 'common' fields in the event format file aren't all that
-common, but need to be made accessible to user scripts nonetheless.
-
-perf_trace_context defines a set of functions that can be used to
-access this data in the context of the current event. Each of these
-functions expects a context variable, which is the same as the
-context variable passed into every event handler as the second
-argument.
-
- common_pc(context) - returns common_preempt count for the current event
- common_flags(context) - returns common_flags for the current event
- common_lock_depth(context) - returns common_lock_depth for the current event
-
-Util.py Module
-~~~~~~~~~~~~~~
-
-Various utility functions for use with perf trace:
-
- nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
- nsecs_secs(nsecs) - returns whole secs portion given nsecs
- nsecs_nsecs(nsecs) - returns nsecs remainder given nsecs
- nsecs_str(nsecs) - returns printable string in the form secs.nsecs
- avg(total, n) - returns average given a sum and a total number of values
-
-SEE ALSO
---------
-linkperf:perf-trace[1]
+++ /dev/null
-perf-trace(1)
-=============
-
-NAME
-----
-perf-trace - Read perf.data (created by perf record) and display trace output
-
-SYNOPSIS
---------
-[verse]
-'perf trace' [<options>]
-'perf trace' [<options>] record <script> [<record-options>] <command>
-'perf trace' [<options>] report <script> [script-args]
-'perf trace' [<options>] <script> <required-script-args> [<record-options>] <command>
-'perf trace' [<options>] <top-script> [script-args]
-
-DESCRIPTION
------------
-This command reads the input file and displays the trace recorded.
-
-There are several variants of perf trace:
-
- 'perf trace' to see a detailed trace of the workload that was
- recorded.
-
- You can also run a set of pre-canned scripts that aggregate and
- summarize the raw trace data in various ways (the list of scripts is
- available via 'perf trace -l'). The following variants allow you to
- record and run those scripts:
-
- 'perf trace record <script> <command>' to record the events required
- for 'perf trace report'. <script> is the name displayed in the
- output of 'perf trace --list' i.e. the actual script name minus any
- language extension. If <command> is not specified, the events are
- recorded using the -a (system-wide) 'perf record' option.
-
- 'perf trace report <script> [args]' to run and display the results
- of <script>. <script> is the name displayed in the output of 'perf
- trace --list' i.e. the actual script name minus any language
- extension. The perf.data output from a previous run of 'perf trace
- record <script>' is used and should be present for this command to
- succeed. [args] refers to the (mainly optional) args expected by
- the script.
-
- 'perf trace <script> <required-script-args> <command>' to both
- record the events required for <script> and to run the <script>
- using 'live-mode' i.e. without writing anything to disk. <script>
- is the name displayed in the output of 'perf trace --list' i.e. the
- actual script name minus any language extension. If <command> is
- not specified, the events are recorded using the -a (system-wide)
- 'perf record' option. If <script> has any required args, they
- should be specified before <command>. This mode doesn't allow for
- optional script args to be specified; if optional script args are
- desired, they can be specified using separate 'perf trace record'
- and 'perf trace report' commands, with the stdout of the record step
- piped to the stdin of the report script, using the '-o -' and '-i -'
- options of the corresponding commands.
-
- 'perf trace <top-script>' to both record the events required for
- <top-script> and to run the <top-script> using 'live-mode'
- i.e. without writing anything to disk. <top-script> is the name
- displayed in the output of 'perf trace --list' i.e. the actual
- script name minus any language extension; a <top-script> is defined
- as any script name ending with the string 'top'.
-
- [<record-options>] can be passed to the record steps of 'perf trace
- record' and 'live-mode' variants; this isn't possible however for
- <top-script> 'live-mode' or 'perf trace report' variants.
-
- See the 'SEE ALSO' section for links to language-specific
- information on how to write and run your own trace scripts.
-
-OPTIONS
--------
-<command>...::
- Any command you can specify in a shell.
-
--D::
---dump-raw-trace=::
- Display verbose dump of the trace data.
-
--L::
---Latency=::
- Show latency attributes (irqs/preemption disabled, etc).
-
--l::
---list=::
- Display a list of available trace scripts.
-
--s ['lang']::
---script=::
- Process trace data with the given script ([lang]:script[.ext]).
- If the string 'lang' is specified in place of a script name, a
- list of supported languages will be displayed instead.
-
--g::
---gen-script=::
- Generate perf-trace.[ext] starter script for given language,
- using current perf.data.
-
--a::
- Force system-wide collection. Scripts run without a <command>
- normally use -a by default, while scripts run with a <command>
- normally don't - this option allows the latter to be run in
- system-wide mode.
-
-
-SEE ALSO
---------
-linkperf:perf-record[1], linkperf:perf-trace-perl[1],
-linkperf:perf-trace-python[1]
lib/rbtree.c
include/linux/swab.h
arch/*/include/asm/unistd*.h
+arch/*/lib/memcpy*.S
include/linux/poison.h
include/linux/magic.h
include/linux/hw_breakpoint.h
ARCH := x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH := x86_64
ARCH := x86
+ ARCH_CFLAGS := -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif
# CFLAGS and LDFLAGS are for the users to override from the command line.
LIB_H += util/include/linux/rbtree.h
LIB_H += util/include/linux/string.h
LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
LIB_H += util/include/asm/asm-offsets.h
LIB_H += util/include/asm/bug.h
LIB_H += util/include/asm/byteorder.h
LIB_H += util/include/asm/system.h
LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
LIB_H += perf.h
LIB_H += util/cache.h
LIB_H += util/callchain.h
LIB_H += util/debug.h
LIB_H += util/debugfs.h
LIB_H += util/event.h
+LIB_H += util/evsel.h
LIB_H += util/exec_cmd.h
LIB_H += util/types.h
LIB_H += util/levenshtein.h
LIB_H += util/parse-events.h
LIB_H += util/quote.h
LIB_H += util/util.h
+LIB_H += util/xyarray.h
LIB_H += util/header.h
LIB_H += util/help.h
LIB_H += util/session.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)
LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
LIB_OBJS += $(OUTPUT)util/debugfs.o
LIB_OBJS += $(OUTPUT)util/environment.o
LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evsel.o
LIB_OBJS += $(OUTPUT)util/exec_cmd.o
LIB_OBJS += $(OUTPUT)util/help.o
LIB_OBJS += $(OUTPUT)util/levenshtein.o
LIB_OBJS += $(OUTPUT)util/hist.o
LIB_OBJS += $(OUTPUT)util/probe-event.o
LIB_OBJS += $(OUTPUT)util/util.o
+LIB_OBJS += $(OUTPUT)util/xyarray.o
LIB_OBJS += $(OUTPUT)util/cpumap.o
BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
BUILTIN_OBJS += $(OUTPUT)builtin-top.o
-BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+BUILTIN_OBJS += $(OUTPUT)builtin-script.o
BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
-include config.mak
ifndef NO_DWARF
-FLAGS_DWARF=$(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
+FLAGS_DWARF=$(ALL_CFLAGS) -ldw -lelf $(ALL_LDFLAGS) $(EXTLIBS)
ifneq ($(call try-cc,$(SOURCE_DWARF),$(FLAGS_DWARF)),y)
msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
NO_DWARF := 1
ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
else
- BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
+ BASIC_CFLAGS += -DDWARF_SUPPORT
EXTLIBS += -lelf -ldw
LIB_OBJS += $(OUTPUT)util/probe-finder.o
endif # PERF_HAVE_DWARF_REGS
SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
-LIBS = $(PERFLIBS) $(EXTLIBS)
+LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive $(EXTLIBS)
BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
$(COMPAT_CFLAGS)
LIB_OBJS += $(COMPAT_OBJS)
ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)
export TAR INSTALL DESTDIR SHELL_PATH
--- /dev/null
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
--- /dev/null
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
--- /dev/null
+
+#include "../../../arch/x86/lib/memcpy_64.S"
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"
#include <stdio.h>
#include <stdlib.h>
static const char *length_str = "1MB";
static const char *routine = "default";
-static bool use_clock = false;
+static bool use_clock;
static int clock_fd;
+static bool only_prefault;
+static bool no_prefault;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};
struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
(double)ts->tv_usec / (double)1000000;
}
+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];
- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);
- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);
- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
return 1;
}
- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
-
- routines[i].fn(dst, src, length);
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);
- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
return hist_entry__inc_addr_samples(he, al->addr);
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
struct addr_location al;
- struct sample_data data;
- if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+ if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
pr_warning("problem processing %d event, skipping it.\n",
event->header.type);
return -1;
.mmap = event__process_mmap,
.comm = event__process_comm,
.fork = event__process_task,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
};
static int __cmd_annotate(void)
int ret;
struct perf_session *session;
- session = perf_session__new(input_name, O_RDONLY, force, false);
+ session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
if (session == NULL)
return -ENOMEM;
{
struct perf_session *session;
- session = perf_session__new(input_name, O_RDONLY, force, false);
+ session = perf_session__new(input_name, O_RDONLY, force, false,
+ &build_id__mark_dso_hit_ops);
if (session == NULL)
return -1;
return -ENOMEM;
}
-static int diff__process_sample_event(event_t *event, struct perf_session *session)
+static int diff__process_sample_event(event_t *event,
+ struct sample_data *sample,
+ struct perf_session *session)
{
struct addr_location al;
- struct sample_data data = { .period = 1, };
- if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+ if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
pr_warning("problem processing %d event, skipping it.\n",
event->header.type);
return -1;
if (al.filtered || al.sym == NULL)
return 0;
- if (hists__add_entry(&session->hists, &al, data.period)) {
+ if (hists__add_entry(&session->hists, &al, sample->period)) {
pr_warning("problem incrementing symbol period, skipping event\n");
return -1;
}
- session->hists.stats.total_period += data.period;
+ session->hists.stats.total_period += sample->period;
return 0;
}
.exit = event__process_task,
.fork = event__process_task,
.lost = event__process_lost,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
};
static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
int ret, i;
struct perf_session *session[2];
- session[0] = perf_session__new(input_old, O_RDONLY, force, false);
- session[1] = perf_session__new(input_new, O_RDONLY, force, false);
+ session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops);
+ session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops);
if (session[0] == NULL || session[1] == NULL)
return -ENOMEM;
static const struct option options[] = {
OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('m', "displacement", &show_displacement,
+ OPT_BOOLEAN('M', "displacement", &show_displacement,
"Show position displacement relative to baseline"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
"separator for columns, no spaces will be added between "
"columns '.' is reserved."),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
OPT_END()
};
static char const *input_name = "-";
static bool inject_build_ids;
-static int event__repipe(event_t *event __used,
- struct perf_session *session __used)
+static int event__repipe_synth(event_t *event,
+ struct perf_session *session __used)
{
uint32_t size;
void *buf = event;
return 0;
}
-static int event__repipe_mmap(event_t *self, struct perf_session *session)
+static int event__repipe(event_t *event, struct sample_data *sample __used,
+ struct perf_session *session)
+{
+ return event__repipe_synth(event, session);
+}
+
+static int event__repipe_mmap(event_t *self, struct sample_data *sample,
+ struct perf_session *session)
{
int err;
- err = event__process_mmap(self, session);
- event__repipe(self, session);
+ err = event__process_mmap(self, sample, session);
+ event__repipe(self, sample, session);
return err;
}
-static int event__repipe_task(event_t *self, struct perf_session *session)
+static int event__repipe_task(event_t *self, struct sample_data *sample,
+ struct perf_session *session)
{
int err;
- err = event__process_task(self, session);
- event__repipe(self, session);
+ err = event__process_task(self, sample, session);
+ event__repipe(self, sample, session);
return err;
}
{
int err;
- event__repipe(self, session);
+ event__repipe_synth(self, session);
err = event__process_tracing_data(self, session);
return err;
return 0;
}
-static int event__inject_buildid(event_t *event, struct perf_session *session)
+static int event__inject_buildid(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
struct addr_location al;
struct thread *thread;
}
repipe:
- event__repipe(event, session);
+ event__repipe(event, sample, session);
return 0;
}
.read = event__repipe,
.throttle = event__repipe,
.unthrottle = event__repipe,
- .attr = event__repipe,
- .event_type = event__repipe,
- .tracing_data = event__repipe,
- .build_id = event__repipe,
+ .attr = event__repipe_synth,
+ .event_type = event__repipe_synth,
+ .tracing_data = event__repipe_synth,
+ .build_id = event__repipe_synth,
};
extern volatile int session_done;
inject_ops.tracing_data = event__repipe_tracing_data;
}
- session = perf_session__new(input_name, O_RDONLY, false, true);
+ session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops);
if (session == NULL)
return -ENOMEM;
}
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
- struct sample_data data;
- struct thread *thread;
+ struct thread *thread = perf_session__findnew(session, event->ip.pid);
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = 1;
-
- event__parse_sample(event, session->sample_type, &data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
-
- thread = perf_session__findnew(session, event->ip.pid);
if (thread == NULL) {
pr_debug("problem processing %d event, skipping it.\n",
event->header.type);
dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
- process_raw_event(event, data.raw_data, data.cpu,
- data.time, thread);
+ process_raw_event(event, sample->raw_data, sample->cpu,
+ sample->time, thread);
return 0;
}
static int __cmd_kmem(void)
{
int err = -EINVAL;
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+ struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+ 0, false, &event_ops);
if (session == NULL)
return -ENOMEM;
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
die("Unknown type of information\n");
}
-static int process_sample_event(event_t *self, struct perf_session *s)
+static int process_sample_event(event_t *self, struct sample_data *sample,
+ struct perf_session *s)
{
- struct sample_data data;
- struct thread *thread;
+ struct thread *thread = perf_session__findnew(s, sample->tid);
- bzero(&data, sizeof(data));
- event__parse_sample(self, s->sample_type, &data);
-
- thread = perf_session__findnew(s, data.tid);
if (thread == NULL) {
pr_debug("problem processing %d event, skipping it.\n",
self->header.type);
return -1;
}
- process_raw_event(data.raw_data, data.cpu, data.time, thread);
+ process_raw_event(sample->raw_data, sample->cpu, sample->time, thread);
return 0;
}
static int read_events(void)
{
- session = perf_session__new(input_name, O_RDONLY, 0, false);
+ session = perf_session__new(input_name, O_RDONLY, 0, false, &eops);
if (!session)
die("Initializing perf session failed\n");
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
usage_with_options(report_usage, report_options);
}
__cmd_report();
- } else if (!strcmp(argv[0], "trace")) {
- /* Aliased to 'perf trace' */
- return cmd_trace(argc, argv, prefix);
+ } else if (!strcmp(argv[0], "script")) {
+ /* Aliased to 'perf script' */
+ return cmd_script(argc, argv, prefix);
} else if (!strcmp(argv[0], "info")) {
if (argc) {
argc = parse_options(argc, argv,
#include "util/header.h"
#include "util/event.h"
+#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/symbol.h"
#include <sched.h>
#include <sys/mman.h>
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+
enum write_mode_t {
WRITE_FORCE,
WRITE_APPEND
};
-static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
-
static u64 user_interval = ULLONG_MAX;
static u64 default_interval = 0;
+static u64 sample_type;
-static int nr_cpus = 0;
+static struct cpu_map *cpus;
static unsigned int page_size;
static unsigned int mmap_pages = 128;
static unsigned int user_freq = UINT_MAX;
static int group = 0;
static int realtime_prio = 0;
static bool raw_samples = false;
+static bool sample_id_all_avail = true;
static bool system_wide = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
-static pid_t *all_tids = NULL;
-static int thread_num = 0;
+static struct thread_map *threads;
static pid_t child_pid = -1;
static bool no_inherit = false;
static enum write_mode_t write_mode = WRITE_FORCE;
static bool inherit_stat = false;
static bool no_samples = false;
static bool sample_address = false;
+static bool sample_time = false;
static bool no_buildid = false;
+static bool no_buildid_cache = false;
static long samples = 0;
static u64 bytes_written = 0;
static const char *cpu_list;
struct mmap_data {
- int counter;
void *base;
unsigned int mask;
unsigned int prev;
}
static int process_synthesized_event(event_t *event,
+ struct sample_data *sample __used,
struct perf_session *self __used)
{
write_output(event, event->header.size);
return h_attr;
}
-static void create_counter(int counter, int cpu)
+static void create_counter(struct perf_evsel *evsel, int cpu)
{
- char *filter = filters[counter];
- struct perf_event_attr *attr = attrs + counter;
+ char *filter = evsel->filter;
+ struct perf_event_attr *attr = &evsel->attr;
struct perf_header_attr *h_attr;
- int track = !counter; /* only the first counter needs these */
+ int track = !evsel->idx; /* only the first counter needs these */
int thread_index;
int ret;
struct {
u64 time_running;
u64 id;
} read_data;
+ /*
+ * Check if parse_single_tracepoint_event has already asked for
+ * PERF_SAMPLE_TIME.
+ *
+ * XXX this is kludgy but short term fix for problems introduced by
+ * eac23d1c that broke 'perf script' by having different sample_types
+ * when using multiple tracepoint events when we use a perf binary
+ * that tries to use sample_id_all on an older kernel.
+ *
+ * We need to move counter creation to perf_session, support
+ * different sample_types, etc.
+ */
+ bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING |
if (system_wide)
attr->sample_type |= PERF_SAMPLE_CPU;
+ if (sample_id_all_avail &&
+ (sample_time || system_wide || !no_inherit || cpu_list))
+ attr->sample_type |= PERF_SAMPLE_TIME;
+
if (raw_samples) {
attr->sample_type |= PERF_SAMPLE_TIME;
attr->sample_type |= PERF_SAMPLE_RAW;
attr->disabled = 1;
attr->enable_on_exec = 1;
}
+retry_sample_id:
+ attr->sample_id_all = sample_id_all_avail ? 1 : 0;
- for (thread_index = 0; thread_index < thread_num; thread_index++) {
+ for (thread_index = 0; thread_index < threads->nr; thread_index++) {
try_again:
- fd[nr_cpu][counter][thread_index] = sys_perf_event_open(attr,
- all_tids[thread_index], cpu, group_fd, 0);
+ FD(evsel, nr_cpu, thread_index) = sys_perf_event_open(attr, threads->map[thread_index], cpu, group_fd, 0);
- if (fd[nr_cpu][counter][thread_index] < 0) {
+ if (FD(evsel, nr_cpu, thread_index) < 0) {
int err = errno;
if (err == EPERM || err == EACCES)
else if (err == ENODEV && cpu_list) {
die("No such device - did you specify"
" an out-of-range profile CPU?\n");
+ } else if (err == EINVAL && sample_id_all_avail) {
+ /*
+ * Old kernel, no attr->sample_id_type_all field
+ */
+ sample_id_all_avail = false;
+ if (!sample_time && !raw_samples && !time_needed)
+ attr->sample_type &= ~PERF_SAMPLE_TIME;
+
+ goto retry_sample_id;
}
/*
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
- fd[nr_cpu][counter][thread_index], strerror(err));
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
+ FD(evsel, nr_cpu, thread_index), strerror(err));
#if defined(__i386__) || defined(__x86_64__)
if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
exit(-1);
}
- h_attr = get_header_attr(attr, counter);
+ h_attr = get_header_attr(attr, evsel->idx);
if (h_attr == NULL)
die("nomem\n");
}
}
- if (read(fd[nr_cpu][counter][thread_index], &read_data, sizeof(read_data)) == -1) {
+ if (read(FD(evsel, nr_cpu, thread_index), &read_data, sizeof(read_data)) == -1) {
perror("Unable to read perf file descriptor");
exit(-1);
}
exit(-1);
}
- assert(fd[nr_cpu][counter][thread_index] >= 0);
- fcntl(fd[nr_cpu][counter][thread_index], F_SETFL, O_NONBLOCK);
+ assert(FD(evsel, nr_cpu, thread_index) >= 0);
+ fcntl(FD(evsel, nr_cpu, thread_index), F_SETFL, O_NONBLOCK);
/*
* First counter acts as the group leader:
*/
if (group && group_fd == -1)
- group_fd = fd[nr_cpu][counter][thread_index];
-
- if (counter || thread_index) {
- ret = ioctl(fd[nr_cpu][counter][thread_index],
- PERF_EVENT_IOC_SET_OUTPUT,
- fd[nr_cpu][0][0]);
+ group_fd = FD(evsel, nr_cpu, thread_index);
+
+ if (evsel->idx || thread_index) {
+ struct perf_evsel *first;
+ first = list_entry(evsel_list.next, struct perf_evsel, node);
+ ret = ioctl(FD(evsel, nr_cpu, thread_index),
+ PERF_EVENT_IOC_SET_OUTPUT,
+ FD(first, nr_cpu, 0));
if (ret) {
error("failed to set output: %d (%s)\n", errno,
strerror(errno));
exit(-1);
}
} else {
- mmap_array[nr_cpu].counter = counter;
mmap_array[nr_cpu].prev = 0;
mmap_array[nr_cpu].mask = mmap_pages*page_size - 1;
mmap_array[nr_cpu].base = mmap(NULL, (mmap_pages+1)*page_size,
- PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter][thread_index], 0);
+ PROT_READ | PROT_WRITE, MAP_SHARED, FD(evsel, nr_cpu, thread_index), 0);
if (mmap_array[nr_cpu].base == MAP_FAILED) {
error("failed to mmap with %d (%s)\n", errno, strerror(errno));
exit(-1);
}
- event_array[nr_poll].fd = fd[nr_cpu][counter][thread_index];
+ event_array[nr_poll].fd = FD(evsel, nr_cpu, thread_index);
event_array[nr_poll].events = POLLIN;
nr_poll++;
}
if (filter != NULL) {
- ret = ioctl(fd[nr_cpu][counter][thread_index],
- PERF_EVENT_IOC_SET_FILTER, filter);
+ ret = ioctl(FD(evsel, nr_cpu, thread_index),
+ PERF_EVENT_IOC_SET_FILTER, filter);
if (ret) {
error("failed to set filter with %d (%s)\n", errno,
strerror(errno));
}
}
}
+
+ if (!sample_type)
+ sample_type = attr->sample_type;
}
static void open_counters(int cpu)
{
- int counter;
+ struct perf_evsel *pos;
group_fd = -1;
- for (counter = 0; counter < nr_counters; counter++)
- create_counter(counter, cpu);
+
+ list_for_each_entry(pos, &evsel_list, node)
+ create_counter(pos, cpu);
nr_cpu++;
}
if (!pipe_output) {
session->header.data_size += bytes_written;
- process_buildids();
+ if (!no_buildid)
+ process_buildids();
perf_header__write(&session->header, output, true);
perf_session__delete(session);
symbol__exit();
static int __cmd_record(int argc, const char **argv)
{
- int i, counter;
+ int i;
struct stat st;
int flags;
int err;
}
session = perf_session__new(output_name, O_WRONLY,
- write_mode == WRITE_FORCE, false);
+ write_mode == WRITE_FORCE, false, NULL);
if (session == NULL) {
pr_err("Not enough memory for reading perf file header\n");
return -1;
}
+ if (!no_buildid)
+ perf_header__set_feat(&session->header, HEADER_BUILD_ID);
+
if (!file_new) {
err = perf_header__read(session, output);
if (err < 0)
goto out_delete_session;
}
- if (have_tracepoints(attrs, nr_counters))
+ if (have_tracepoints(&evsel_list))
perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
/*
}
if (!system_wide && target_tid == -1 && target_pid == -1)
- all_tids[0] = child_pid;
+ threads->map[0] = child_pid;
close(child_ready_pipe[1]);
close(go_pipe[0]);
close(child_ready_pipe[0]);
}
- nr_cpus = read_cpu_map(cpu_list);
- if (nr_cpus < 1) {
- perror("failed to collect number of CPUs");
- return -1;
- }
-
if (!system_wide && no_inherit && !cpu_list) {
open_counters(-1);
} else {
- for (i = 0; i < nr_cpus; i++)
- open_counters(cpumap[i]);
+ for (i = 0; i < cpus->nr; i++)
+ open_counters(cpus->map[i]);
}
+ perf_session__set_sample_type(session, sample_type);
+
if (pipe_output) {
err = perf_header__write_pipe(output);
if (err < 0)
post_processing_offset = lseek(output, 0, SEEK_CUR);
+ perf_session__set_sample_id_all(session, sample_id_all_avail);
+
if (pipe_output) {
err = event__synthesize_attrs(&session->header,
process_synthesized_event,
return err;
}
- if (have_tracepoints(attrs, nr_counters)) {
+ if (have_tracepoints(&evsel_list)) {
/*
* FIXME err <= 0 here actually means that
* there were no tracepoints so its not really
* return this more properly and also
* propagate errors that now are calling die()
*/
- err = event__synthesize_tracing_data(output, attrs,
- nr_counters,
+ err = event__synthesize_tracing_data(output, &evsel_list,
process_synthesized_event,
session);
if (err <= 0) {
if (done) {
for (i = 0; i < nr_cpu; i++) {
- for (counter = 0;
- counter < nr_counters;
- counter++) {
+ struct perf_evsel *pos;
+
+ list_for_each_entry(pos, &evsel_list, node) {
for (thread = 0;
- thread < thread_num;
+ thread < threads->nr;
thread++)
- ioctl(fd[i][counter][thread],
+ ioctl(FD(pos, i, thread),
PERF_EVENT_IOC_DISABLE);
}
}
"per thread counts"),
OPT_BOOLEAN('d', "data", &sample_address,
"Sample addresses"),
+ OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"),
OPT_BOOLEAN('n', "no-samples", &no_samples,
"don't sample"),
- OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid,
+ OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache,
"do not update the buildid cache"),
+ OPT_BOOLEAN('B', "no-buildid", &no_buildid,
+ "do not collect buildids in perf.data"),
OPT_END()
};
int cmd_record(int argc, const char **argv, const char *prefix __used)
{
- int i, j, err = -ENOMEM;
+ int err = -ENOMEM;
+ struct perf_evsel *pos;
argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
}
symbol__init();
- if (no_buildid)
+
+ if (no_buildid_cache || no_buildid)
disable_buildid_cache();
- if (!nr_counters) {
- nr_counters = 1;
- attrs[0].type = PERF_TYPE_HARDWARE;
- attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
+ if (list_empty(&evsel_list) && perf_evsel_list__create_default() < 0) {
+ pr_err("Not enough memory for event selector list\n");
+ goto out_symbol_exit;
}
- if (target_pid != -1) {
+ if (target_pid != -1)
target_tid = target_pid;
- thread_num = find_all_tid(target_pid, &all_tids);
- if (thread_num <= 0) {
- fprintf(stderr, "Can't find all threads of pid %d\n",
- target_pid);
- usage_with_options(record_usage, record_options);
- }
- } else {
- all_tids=malloc(sizeof(pid_t));
- if (!all_tids)
- goto out_symbol_exit;
- all_tids[0] = target_tid;
- thread_num = 1;
+ threads = thread_map__new(target_pid, target_tid);
+ if (threads == NULL) {
+ pr_err("Problems finding threads of monitor\n");
+ usage_with_options(record_usage, record_options);
}
- for (i = 0; i < MAX_NR_CPUS; i++) {
- for (j = 0; j < MAX_COUNTERS; j++) {
- fd[i][j] = malloc(sizeof(int)*thread_num);
- if (!fd[i][j])
- goto out_free_fd;
- }
+ cpus = cpu_map__new(cpu_list);
+ if (cpus == NULL) {
+ perror("failed to parse CPUs map");
+ return -1;
}
- event_array = malloc(
- sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+
+ list_for_each_entry(pos, &evsel_list, node) {
+ if (perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+ goto out_free_fd;
+ }
+ event_array = malloc((sizeof(struct pollfd) * MAX_NR_CPUS *
+ MAX_COUNTERS * threads->nr));
if (!event_array)
goto out_free_fd;
out_free_event_array:
free(event_array);
out_free_fd:
- for (i = 0; i < MAX_NR_CPUS; i++) {
- for (j = 0; j < MAX_COUNTERS; j++)
- free(fd[i][j]);
- }
- free(all_tids);
- all_tids = NULL;
+ thread_map__delete(threads);
+ threads = NULL;
out_symbol_exit:
symbol__exit();
return err;
return 0;
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
- struct sample_data data = { .period = 1, };
struct addr_location al;
struct perf_event_attr *attr;
- if (event__preprocess_sample(event, session, &al, &data, NULL) < 0) {
+ if (event__preprocess_sample(event, session, &al, sample, NULL) < 0) {
fprintf(stderr, "problem processing %d event, skipping it.\n",
event->header.type);
return -1;
if (al.filtered || (hide_unresolved && al.sym == NULL))
return 0;
- if (perf_session__add_hist_entry(session, &al, &data)) {
+ if (perf_session__add_hist_entry(session, &al, sample)) {
pr_debug("problem incrementing symbol period, skipping event\n");
return -1;
}
- attr = perf_header__find_attr(data.id, &session->header);
+ attr = perf_header__find_attr(sample->id, &session->header);
- if (add_event_total(session, &data, attr)) {
+ if (add_event_total(session, sample, attr)) {
pr_debug("problem adding event period\n");
return -1;
}
return 0;
}
-static int process_read_event(event_t *event, struct perf_session *session __used)
+static int process_read_event(event_t *event, struct sample_data *sample __used,
+ struct perf_session *session __used)
{
struct perf_event_attr *attr;
.event_type = event__process_event_type,
.tracing_data = event__process_tracing_data,
.build_id = event__process_build_id,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
};
extern volatile int session_done;
signal(SIGINT, sig_handler);
- session = perf_session__new(input_name, O_RDONLY, force, false);
+ session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops);
if (session == NULL)
return -ENOMEM;
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
+ OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
+ "file", "kallsyms pathname"),
OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
"columns '.' is reserved."),
OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
"Only display entries resolved to a symbol"),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
OPT_END()
};
process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
- struct sample_data data;
struct thread *thread;
if (!(session->sample_type & PERF_SAMPLE_RAW))
return 0;
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = -1;
-
- event__parse_sample(event, session->sample_type, &data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
-
- thread = perf_session__findnew(session, data.pid);
+ thread = perf_session__findnew(session, sample->pid);
if (thread == NULL) {
pr_debug("problem processing %d event, skipping it.\n",
event->header.type);
dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
- if (profile_cpu != -1 && profile_cpu != (int)data.cpu)
+ if (profile_cpu != -1 && profile_cpu != (int)sample->cpu)
return 0;
- process_raw_event(event, session, data.raw_data, data.cpu, data.time, thread);
+ process_raw_event(event, session, sample->raw_data, sample->cpu,
+ sample->time, thread);
return 0;
}
static int read_events(void)
{
int err = -EINVAL;
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+ struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+ 0, false, &event_ops);
if (session == NULL)
return -ENOMEM;
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (rec_argv)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
usage_with_options(sched_usage, sched_options);
/*
- * Aliased to 'perf trace' for now:
+ * Aliased to 'perf script' for now:
*/
- if (!strcmp(argv[0], "trace"))
- return cmd_trace(argc, argv, prefix);
+ if (!strcmp(argv[0], "script"))
+ return cmd_script(argc, argv, prefix);
symbol__init();
if (!strncmp(argv[0], "rec", 3)) {
--- /dev/null
+#include "builtin.h"
+
+#include "perf.h"
+#include "util/cache.h"
+#include "util/debug.h"
+#include "util/exec_cmd.h"
+#include "util/header.h"
+#include "util/parse-options.h"
+#include "util/session.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/trace-event.h"
+#include "util/parse-options.h"
+#include "util/util.h"
+
+static char const *script_name;
+static char const *generate_script_lang;
+static bool debug_mode;
+static u64 last_timestamp;
+static u64 nr_unordered;
+extern const struct option record_options[];
+
+static int default_start_script(const char *script __unused,
+ int argc __unused,
+ const char **argv __unused)
+{
+ return 0;
+}
+
+static int default_stop_script(void)
+{
+ return 0;
+}
+
+static int default_generate_script(const char *outfile __unused)
+{
+ return 0;
+}
+
+static struct scripting_ops default_scripting_ops = {
+ .start_script = default_start_script,
+ .stop_script = default_stop_script,
+ .process_event = print_event,
+ .generate_script = default_generate_script,
+};
+
+static struct scripting_ops *scripting_ops;
+
+static void setup_scripting(void)
+{
+ setup_perl_scripting();
+ setup_python_scripting();
+
+ scripting_ops = &default_scripting_ops;
+}
+
+static int cleanup_scripting(void)
+{
+ pr_debug("\nperf script stopped\n");
+
+ return scripting_ops->stop_script();
+}
+
+static char const *input_name = "perf.data";
+
+static int process_sample_event(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
+{
+ struct thread *thread = perf_session__findnew(session, event->ip.pid);
+
+ if (thread == NULL) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (session->sample_type & PERF_SAMPLE_RAW) {
+ if (debug_mode) {
+ if (sample->time < last_timestamp) {
+ pr_err("Samples misordered, previous: %llu "
+ "this: %llu\n", last_timestamp,
+ sample->time);
+ nr_unordered++;
+ }
+ last_timestamp = sample->time;
+ return 0;
+ }
+ /*
+ * FIXME: better resolve from pid from the struct trace_entry
+ * field, although it should be the same than this perf
+ * event pid
+ */
+ scripting_ops->process_event(sample->cpu, sample->raw_data,
+ sample->raw_size,
+ sample->time, thread->comm);
+ }
+
+ session->hists.stats.total_period += sample->period;
+ return 0;
+}
+
+static struct perf_event_ops event_ops = {
+ .sample = process_sample_event,
+ .comm = event__process_comm,
+ .attr = event__process_attr,
+ .event_type = event__process_event_type,
+ .tracing_data = event__process_tracing_data,
+ .build_id = event__process_build_id,
+ .ordering_requires_timestamps = true,
+ .ordered_samples = true,
+};
+
+extern volatile int session_done;
+
+static void sig_handler(int sig __unused)
+{
+ session_done = 1;
+}
+
+static int __cmd_script(struct perf_session *session)
+{
+ int ret;
+
+ signal(SIGINT, sig_handler);
+
+ ret = perf_session__process_events(session, &event_ops);
+
+ if (debug_mode)
+ pr_err("Misordered timestamps: %llu\n", nr_unordered);
+
+ return ret;
+}
+
+struct script_spec {
+ struct list_head node;
+ struct scripting_ops *ops;
+ char spec[0];
+};
+
+static LIST_HEAD(script_specs);
+
+static struct script_spec *script_spec__new(const char *spec,
+ struct scripting_ops *ops)
+{
+ struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
+
+ if (s != NULL) {
+ strcpy(s->spec, spec);
+ s->ops = ops;
+ }
+
+ return s;
+}
+
+static void script_spec__delete(struct script_spec *s)
+{
+ free(s->spec);
+ free(s);
+}
+
+static void script_spec__add(struct script_spec *s)
+{
+ list_add_tail(&s->node, &script_specs);
+}
+
+static struct script_spec *script_spec__find(const char *spec)
+{
+ struct script_spec *s;
+
+ list_for_each_entry(s, &script_specs, node)
+ if (strcasecmp(s->spec, spec) == 0)
+ return s;
+ return NULL;
+}
+
+static struct script_spec *script_spec__findnew(const char *spec,
+ struct scripting_ops *ops)
+{
+ struct script_spec *s = script_spec__find(spec);
+
+ if (s)
+ return s;
+
+ s = script_spec__new(spec, ops);
+ if (!s)
+ goto out_delete_spec;
+
+ script_spec__add(s);
+
+ return s;
+
+out_delete_spec:
+ script_spec__delete(s);
+
+ return NULL;
+}
+
+int script_spec_register(const char *spec, struct scripting_ops *ops)
+{
+ struct script_spec *s;
+
+ s = script_spec__find(spec);
+ if (s)
+ return -1;
+
+ s = script_spec__findnew(spec, ops);
+ if (!s)
+ return -1;
+
+ return 0;
+}
+
+static struct scripting_ops *script_spec__lookup(const char *spec)
+{
+ struct script_spec *s = script_spec__find(spec);
+ if (!s)
+ return NULL;
+
+ return s->ops;
+}
+
+static void list_available_languages(void)
+{
+ struct script_spec *s;
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Scripting language extensions (used in "
+ "perf script -s [spec:]script.[spec]):\n\n");
+
+ list_for_each_entry(s, &script_specs, node)
+ fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
+
+ fprintf(stderr, "\n");
+}
+
+static int parse_scriptname(const struct option *opt __used,
+ const char *str, int unset __used)
+{
+ char spec[PATH_MAX];
+ const char *script, *ext;
+ int len;
+
+ if (strcmp(str, "lang") == 0) {
+ list_available_languages();
+ exit(0);
+ }
+
+ script = strchr(str, ':');
+ if (script) {
+ len = script - str;
+ if (len >= PATH_MAX) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+ strncpy(spec, str, len);
+ spec[len] = '\0';
+ scripting_ops = script_spec__lookup(spec);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+ script++;
+ } else {
+ script = str;
+ ext = strrchr(script, '.');
+ if (!ext) {
+ fprintf(stderr, "invalid script extension");
+ return -1;
+ }
+ scripting_ops = script_spec__lookup(++ext);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid script extension");
+ return -1;
+ }
+ }
+
+ script_name = strdup(script);
+
+ return 0;
+}
+
+/* Helper function for filesystems that return a dent->d_type DT_UNKNOWN */
+static int is_directory(const char *base_path, const struct dirent *dent)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ sprintf(path, "%s/%s", base_path, dent->d_name);
+ if (stat(path, &st))
+ return 0;
+
+ return S_ISDIR(st.st_mode);
+}
+
+#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
+ while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
+ lang_next) \
+ if ((lang_dirent.d_type == DT_DIR || \
+ (lang_dirent.d_type == DT_UNKNOWN && \
+ is_directory(scripts_path, &lang_dirent))) && \
+ (strcmp(lang_dirent.d_name, ".")) && \
+ (strcmp(lang_dirent.d_name, "..")))
+
+#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
+ while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
+ script_next) \
+ if (script_dirent.d_type != DT_DIR && \
+ (script_dirent.d_type != DT_UNKNOWN || \
+ !is_directory(lang_path, &script_dirent)))
+
+
+#define RECORD_SUFFIX "-record"
+#define REPORT_SUFFIX "-report"
+
+struct script_desc {
+ struct list_head node;
+ char *name;
+ char *half_liner;
+ char *args;
+};
+
+static LIST_HEAD(script_descs);
+
+static struct script_desc *script_desc__new(const char *name)
+{
+ struct script_desc *s = zalloc(sizeof(*s));
+
+ if (s != NULL && name)
+ s->name = strdup(name);
+
+ return s;
+}
+
+static void script_desc__delete(struct script_desc *s)
+{
+ free(s->name);
+ free(s->half_liner);
+ free(s->args);
+ free(s);
+}
+
+static void script_desc__add(struct script_desc *s)
+{
+ list_add_tail(&s->node, &script_descs);
+}
+
+static struct script_desc *script_desc__find(const char *name)
+{
+ struct script_desc *s;
+
+ list_for_each_entry(s, &script_descs, node)
+ if (strcasecmp(s->name, name) == 0)
+ return s;
+ return NULL;
+}
+
+static struct script_desc *script_desc__findnew(const char *name)
+{
+ struct script_desc *s = script_desc__find(name);
+
+ if (s)
+ return s;
+
+ s = script_desc__new(name);
+ if (!s)
+ goto out_delete_desc;
+
+ script_desc__add(s);
+
+ return s;
+
+out_delete_desc:
+ script_desc__delete(s);
+
+ return NULL;
+}
+
+static const char *ends_with(const char *str, const char *suffix)
+{
+ size_t suffix_len = strlen(suffix);
+ const char *p = str;
+
+ if (strlen(str) > suffix_len) {
+ p = str + strlen(str) - suffix_len;
+ if (!strncmp(p, suffix, suffix_len))
+ return p;
+ }
+
+ return NULL;
+}
+
+static char *ltrim(char *str)
+{
+ int len = strlen(str);
+
+ while (len && isspace(*str)) {
+ len--;
+ str++;
+ }
+
+ return str;
+}
+
+static int read_script_info(struct script_desc *desc, const char *filename)
+{
+ char line[BUFSIZ], *p;
+ FILE *fp;
+
+ fp = fopen(filename, "r");
+ if (!fp)
+ return -1;
+
+ while (fgets(line, sizeof(line), fp)) {
+ p = ltrim(line);
+ if (strlen(p) == 0)
+ continue;
+ if (*p != '#')
+ continue;
+ p++;
+ if (strlen(p) && *p == '!')
+ continue;
+
+ p = ltrim(p);
+ if (strlen(p) && p[strlen(p) - 1] == '\n')
+ p[strlen(p) - 1] = '\0';
+
+ if (!strncmp(p, "description:", strlen("description:"))) {
+ p += strlen("description:");
+ desc->half_liner = strdup(ltrim(p));
+ continue;
+ }
+
+ if (!strncmp(p, "args:", strlen("args:"))) {
+ p += strlen("args:");
+ desc->args = strdup(ltrim(p));
+ continue;
+ }
+ }
+
+ fclose(fp);
+
+ return 0;
+}
+
+static int list_available_scripts(const struct option *opt __used,
+ const char *s __used, int unset __used)
+{
+ struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ char scripts_path[MAXPATHLEN];
+ DIR *scripts_dir, *lang_dir;
+ char script_path[MAXPATHLEN];
+ char lang_path[MAXPATHLEN];
+ struct script_desc *desc;
+ char first_half[BUFSIZ];
+ char *script_root;
+ char *str;
+
+ snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+ scripts_dir = opendir(scripts_path);
+ if (!scripts_dir)
+ return -1;
+
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+ lang_dirent.d_name);
+ lang_dir = opendir(lang_path);
+ if (!lang_dir)
+ continue;
+
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ script_root = strdup(script_dirent.d_name);
+ str = (char *)ends_with(script_root, REPORT_SUFFIX);
+ if (str) {
+ *str = '\0';
+ desc = script_desc__findnew(script_root);
+ snprintf(script_path, MAXPATHLEN, "%s/%s",
+ lang_path, script_dirent.d_name);
+ read_script_info(desc, script_path);
+ }
+ free(script_root);
+ }
+ }
+
+ fprintf(stdout, "List of available trace scripts:\n");
+ list_for_each_entry(desc, &script_descs, node) {
+ sprintf(first_half, "%s %s", desc->name,
+ desc->args ? desc->args : "");
+ fprintf(stdout, " %-36s %s\n", first_half,
+ desc->half_liner ? desc->half_liner : "");
+ }
+
+ exit(0);
+}
+
+static char *get_script_path(const char *script_root, const char *suffix)
+{
+ struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ char scripts_path[MAXPATHLEN];
+ char script_path[MAXPATHLEN];
+ DIR *scripts_dir, *lang_dir;
+ char lang_path[MAXPATHLEN];
+ char *str, *__script_root;
+ char *path = NULL;
+
+ snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+ scripts_dir = opendir(scripts_path);
+ if (!scripts_dir)
+ return NULL;
+
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+ lang_dirent.d_name);
+ lang_dir = opendir(lang_path);
+ if (!lang_dir)
+ continue;
+
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ __script_root = strdup(script_dirent.d_name);
+ str = (char *)ends_with(__script_root, suffix);
+ if (str) {
+ *str = '\0';
+ if (strcmp(__script_root, script_root))
+ continue;
+ snprintf(script_path, MAXPATHLEN, "%s/%s",
+ lang_path, script_dirent.d_name);
+ path = strdup(script_path);
+ free(__script_root);
+ break;
+ }
+ free(__script_root);
+ }
+ }
+
+ return path;
+}
+
+static bool is_top_script(const char *script_path)
+{
+ return ends_with(script_path, "top") == NULL ? false : true;
+}
+
+static int has_required_arg(char *script_path)
+{
+ struct script_desc *desc;
+ int n_args = 0;
+ char *p;
+
+ desc = script_desc__new(NULL);
+
+ if (read_script_info(desc, script_path))
+ goto out;
+
+ if (!desc->args)
+ goto out;
+
+ for (p = desc->args; *p; p++)
+ if (*p == '<')
+ n_args++;
+out:
+ script_desc__delete(desc);
+
+ return n_args;
+}
+
+static const char * const script_usage[] = {
+ "perf script [<options>]",
+ "perf script [<options>] record <script> [<record-options>] <command>",
+ "perf script [<options>] report <script> [script-args]",
+ "perf script [<options>] <script> [<record-options>] <command>",
+ "perf script [<options>] <top-script> [script-args]",
+ NULL
+};
+
+static const struct option options[] = {
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('L', "Latency", &latency_format,
+ "show latency attributes (irqs/preemption disabled, etc)"),
+ OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
+ list_available_scripts),
+ OPT_CALLBACK('s', "script", NULL, "name",
+ "script file name (lang:script name, script name, or *)",
+ parse_scriptname),
+ OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
+ "generate perf-script.xx script in specified language"),
+ OPT_STRING('i', "input", &input_name, "file",
+ "input file name"),
+ OPT_BOOLEAN('d', "debug-mode", &debug_mode,
+ "do various checks like samples ordering and lost events"),
+
+ OPT_END()
+};
+
+static bool have_cmd(int argc, const char **argv)
+{
+ char **__argv = malloc(sizeof(const char *) * argc);
+
+ if (!__argv)
+ die("malloc");
+ memcpy(__argv, argv, sizeof(const char *) * argc);
+ argc = parse_options(argc, (const char **)__argv, record_options,
+ NULL, PARSE_OPT_STOP_AT_NON_OPTION);
+ free(__argv);
+
+ return argc != 0;
+}
+
+int cmd_script(int argc, const char **argv, const char *prefix __used)
+{
+ char *rec_script_path = NULL;
+ char *rep_script_path = NULL;
+ struct perf_session *session;
+ char *script_path = NULL;
+ const char **__argv;
+ bool system_wide;
+ int i, j, err;
+
+ setup_scripting();
+
+ argc = parse_options(argc, argv, options, script_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+
+ if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
+ rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
+ if (!rec_script_path)
+ return cmd_record(argc, argv, NULL);
+ }
+
+ if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
+ rep_script_path = get_script_path(argv[1], REPORT_SUFFIX);
+ if (!rep_script_path) {
+ fprintf(stderr,
+ "Please specify a valid report script"
+ "(see 'perf script -l' for listing)\n");
+ return -1;
+ }
+ }
+
+ /* make sure PERF_EXEC_PATH is set for scripts */
+ perf_set_argv_exec_path(perf_exec_path());
+
+ if (argc && !script_name && !rec_script_path && !rep_script_path) {
+ int live_pipe[2];
+ int rep_args;
+ pid_t pid;
+
+ rec_script_path = get_script_path(argv[0], RECORD_SUFFIX);
+ rep_script_path = get_script_path(argv[0], REPORT_SUFFIX);
+
+ if (!rec_script_path && !rep_script_path) {
+ fprintf(stderr, " Couldn't find script %s\n\n See perf"
+ " script -l for available scripts.\n", argv[0]);
+ usage_with_options(script_usage, options);
+ }
+
+ if (is_top_script(argv[0])) {
+ rep_args = argc - 1;
+ } else {
+ int rec_args;
+
+ rep_args = has_required_arg(rep_script_path);
+ rec_args = (argc - 1) - rep_args;
+ if (rec_args < 0) {
+ fprintf(stderr, " %s script requires options."
+ "\n\n See perf script -l for available "
+ "scripts and options.\n", argv[0]);
+ usage_with_options(script_usage, options);
+ }
+ }
+
+ if (pipe(live_pipe) < 0) {
+ perror("failed to create pipe");
+ exit(-1);
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ perror("failed to fork");
+ exit(-1);
+ }
+
+ if (!pid) {
+ system_wide = true;
+ j = 0;
+
+ dup2(live_pipe[1], 1);
+ close(live_pipe[0]);
+
+ if (!is_top_script(argv[0]))
+ system_wide = !have_cmd(argc - rep_args,
+ &argv[rep_args]);
+
+ __argv = malloc((argc + 6) * sizeof(const char *));
+ if (!__argv)
+ die("malloc");
+
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = rec_script_path;
+ if (system_wide)
+ __argv[j++] = "-a";
+ __argv[j++] = "-q";
+ __argv[j++] = "-o";
+ __argv[j++] = "-";
+ for (i = rep_args + 1; i < argc; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ dup2(live_pipe[0], 0);
+ close(live_pipe[1]);
+
+ __argv = malloc((argc + 4) * sizeof(const char *));
+ if (!__argv)
+ die("malloc");
+ j = 0;
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = rep_script_path;
+ for (i = 1; i < rep_args + 1; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = "-i";
+ __argv[j++] = "-";
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ if (rec_script_path)
+ script_path = rec_script_path;
+ if (rep_script_path)
+ script_path = rep_script_path;
+
+ if (script_path) {
+ system_wide = false;
+ j = 0;
+
+ if (rec_script_path)
+ system_wide = !have_cmd(argc - 1, &argv[1]);
+
+ __argv = malloc((argc + 2) * sizeof(const char *));
+ if (!__argv)
+ die("malloc");
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = script_path;
+ if (system_wide)
+ __argv[j++] = "-a";
+ for (i = 2; i < argc; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ if (symbol__init() < 0)
+ return -1;
+ if (!script_name)
+ setup_pager();
+
+ session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops);
+ if (session == NULL)
+ return -ENOMEM;
+
+ if (strcmp(input_name, "-") &&
+ !perf_session__has_traces(session, "record -R"))
+ return -EINVAL;
+
+ if (generate_script_lang) {
+ struct stat perf_stat;
+
+ int input = open(input_name, O_RDONLY);
+ if (input < 0) {
+ perror("failed to open file");
+ exit(-1);
+ }
+
+ err = fstat(input, &perf_stat);
+ if (err < 0) {
+ perror("failed to stat file");
+ exit(-1);
+ }
+
+ if (!perf_stat.st_size) {
+ fprintf(stderr, "zero-sized file, nothing to do!\n");
+ exit(0);
+ }
+
+ scripting_ops = script_spec__lookup(generate_script_lang);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+
+ err = scripting_ops->generate_script("perf-script");
+ goto out;
+ }
+
+ if (script_name) {
+ err = scripting_ops->start_script(script_name, argc, argv);
+ if (err)
+ goto out;
+ pr_debug("perf script started with script %s\n\n", script_name);
+ }
+
+ err = __cmd_script(session);
+
+ perf_session__delete(session);
+ cleanup_scripting();
+out:
+ return err;
+}
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/event.h"
+#include "util/evsel.h"
#include "util/debug.h"
#include "util/header.h"
#include "util/cpumap.h"
#include <math.h>
#include <locale.h>
+#define DEFAULT_SEPARATOR " "
+
static struct perf_event_attr default_attrs[] = {
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
};
static bool system_wide = false;
-static int nr_cpus = 0;
+static struct cpu_map *cpus;
static int run_idx = 0;
static int run_count = 1;
static bool no_inherit = false;
static bool scale = true;
+static bool no_aggr = false;
static pid_t target_pid = -1;
static pid_t target_tid = -1;
-static pid_t *all_tids = NULL;
-static int thread_num = 0;
+static struct thread_map *threads;
static pid_t child_pid = -1;
static bool null_run = false;
-static bool big_num = false;
+static bool big_num = true;
+static int big_num_opt = -1;
static const char *cpu_list;
-
-
-static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int event_scaled[MAX_COUNTERS];
+static const char *csv_sep = NULL;
+static bool csv_output = false;
static volatile int done = 0;
double n, mean, M2;
};
+struct perf_stat {
+ struct stats res_stats[3];
+};
+
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
+{
+ evsel->priv = zalloc(sizeof(struct perf_stat));
+ return evsel->priv == NULL ? -ENOMEM : 0;
+}
+
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+{
+ free(evsel->priv);
+ evsel->priv = NULL;
+}
+
static void update_stats(struct stats *stats, u64 val)
{
double delta;
return sqrt(variance_mean);
}
-struct stats event_res_stats[MAX_COUNTERS][3];
-struct stats runtime_nsecs_stats;
+struct stats runtime_nsecs_stats[MAX_NR_CPUS];
+struct stats runtime_cycles_stats[MAX_NR_CPUS];
+struct stats runtime_branches_stats[MAX_NR_CPUS];
struct stats walltime_nsecs_stats;
-struct stats runtime_cycles_stats;
-struct stats runtime_branches_stats;
-#define MATCH_EVENT(t, c, counter) \
- (attrs[counter].type == PERF_TYPE_##t && \
- attrs[counter].config == PERF_COUNT_##c)
-
-#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
-
-static int create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(struct perf_evsel *evsel)
{
- struct perf_event_attr *attr = attrs + counter;
- int thread;
- int ncreated = 0;
+ struct perf_event_attr *attr = &evsel->attr;
if (scale)
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;
- if (system_wide) {
- int cpu;
-
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- fd[cpu][counter][0] = sys_perf_event_open(attr,
- -1, cpumap[cpu], -1, 0);
- if (fd[cpu][counter][0] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
- fd[cpu][counter][0], strerror(errno));
- else
- ++ncreated;
- }
- } else {
- attr->inherit = !no_inherit;
- if (target_pid == -1 && target_tid == -1) {
- attr->disabled = 1;
- attr->enable_on_exec = 1;
- }
- for (thread = 0; thread < thread_num; thread++) {
- fd[0][counter][thread] = sys_perf_event_open(attr,
- all_tids[thread], -1, -1, 0);
- if (fd[0][counter][thread] < 0)
- pr_debug(ERR_PERF_OPEN, counter,
- fd[0][counter][thread],
- strerror(errno));
- else
- ++ncreated;
- }
+ if (system_wide)
+ return perf_evsel__open_per_cpu(evsel, cpus);
+
+ attr->inherit = !no_inherit;
+ if (target_pid == -1 && target_tid == -1) {
+ attr->disabled = 1;
+ attr->enable_on_exec = 1;
}
- return ncreated;
+ return perf_evsel__open_per_thread(evsel, threads);
}
/*
* Does the counter have nsecs as a unit?
*/
-static inline int nsec_counter(int counter)
+static inline int nsec_counter(struct perf_evsel *evsel)
{
- if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
- MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
+ perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
return 1;
return 0;
/*
* Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
*/
-static void read_counter(int counter)
+static int read_counter_aggr(struct perf_evsel *counter)
{
- u64 count[3], single_count[3];
- int cpu;
- size_t res, nv;
- int scaled;
- int i, thread;
-
- count[0] = count[1] = count[2] = 0;
-
- nv = scale ? 3 : 1;
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- for (thread = 0; thread < thread_num; thread++) {
- if (fd[cpu][counter][thread] < 0)
- continue;
-
- res = read(fd[cpu][counter][thread],
- single_count, nv * sizeof(u64));
- assert(res == nv * sizeof(u64));
-
- close(fd[cpu][counter][thread]);
- fd[cpu][counter][thread] = -1;
-
- count[0] += single_count[0];
- if (scale) {
- count[1] += single_count[1];
- count[2] += single_count[2];
- }
- }
- }
-
- scaled = 0;
- if (scale) {
- if (count[2] == 0) {
- event_scaled[counter] = -1;
- count[0] = 0;
- return;
- }
+ struct perf_stat *ps = counter->priv;
+ u64 *count = counter->counts->aggr.values;
+ int i;
- if (count[2] < count[1]) {
- event_scaled[counter] = 1;
- count[0] = (unsigned long long)
- ((double)count[0] * count[1] / count[2] + 0.5);
- }
- }
+ if (__perf_evsel__read(counter, cpus->nr, threads->nr, scale) < 0)
+ return -1;
for (i = 0; i < 3; i++)
- update_stats(&event_res_stats[counter][i], count[i]);
+ update_stats(&ps->res_stats[i], count[i]);
if (verbose) {
fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
/*
* Save the full runtime - to allow normalization during printout:
*/
- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
- update_stats(&runtime_nsecs_stats, count[0]);
- if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
- update_stats(&runtime_cycles_stats, count[0]);
- if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
- update_stats(&runtime_branches_stats, count[0]);
+ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+ update_stats(&runtime_nsecs_stats[0], count[0]);
+ if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+ update_stats(&runtime_cycles_stats[0], count[0]);
+ if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+ update_stats(&runtime_branches_stats[0], count[0]);
+
+ return 0;
+}
+
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static int read_counter(struct perf_evsel *counter)
+{
+ u64 *count;
+ int cpu;
+
+ for (cpu = 0; cpu < cpus->nr; cpu++) {
+ if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
+ return -1;
+
+ count = counter->counts->cpu[cpu].values;
+
+ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+ update_stats(&runtime_nsecs_stats[cpu], count[0]);
+ if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+ update_stats(&runtime_cycles_stats[cpu], count[0]);
+ if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+ update_stats(&runtime_branches_stats[cpu], count[0]);
+ }
+
+ return 0;
}
static int run_perf_stat(int argc __used, const char **argv)
{
unsigned long long t0, t1;
+ struct perf_evsel *counter;
int status = 0;
- int counter, ncreated = 0;
int child_ready_pipe[2], go_pipe[2];
const bool forks = (argc > 0);
char buf;
- if (!system_wide)
- nr_cpus = 1;
-
if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
perror("failed to create pipes");
exit(1);
}
if (target_tid == -1 && target_pid == -1 && !system_wide)
- all_tids[0] = child_pid;
+ threads->map[0] = child_pid;
/*
* Wait for the child to be ready to exec.
close(child_ready_pipe[0]);
}
- for (counter = 0; counter < nr_counters; counter++)
- ncreated += create_perf_stat_counter(counter);
-
- if (ncreated == 0) {
- pr_err("No permission to collect %sstats.\n"
- "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
- system_wide ? "system-wide " : "");
- if (child_pid != -1)
- kill(child_pid, SIGTERM);
- return -1;
+ list_for_each_entry(counter, &evsel_list, node) {
+ if (create_perf_stat_counter(counter) < 0) {
+ if (errno == -EPERM || errno == -EACCES) {
+ error("You may not have permission to collect %sstats.\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid or running as root.",
+ system_wide ? "system-wide " : "");
+ } else {
+ error("open_counter returned with %d (%s). "
+ "/bin/dmesg may provide additional information.\n",
+ errno, strerror(errno));
+ }
+ if (child_pid != -1)
+ kill(child_pid, SIGTERM);
+ die("Not all events could be opened.\n");
+ return -1;
+ }
}
/*
update_stats(&walltime_nsecs_stats, t1 - t0);
- for (counter = 0; counter < nr_counters; counter++)
- read_counter(counter);
+ if (no_aggr) {
+ list_for_each_entry(counter, &evsel_list, node) {
+ read_counter(counter);
+ perf_evsel__close_fd(counter, cpus->nr, 1);
+ }
+ } else {
+ list_for_each_entry(counter, &evsel_list, node) {
+ read_counter_aggr(counter);
+ perf_evsel__close_fd(counter, cpus->nr, threads->nr);
+ }
+ }
return WEXITSTATUS(status);
}
-static void print_noise(int counter, double avg)
+static void print_noise(struct perf_evsel *evsel, double avg)
{
+ struct perf_stat *ps;
+
if (run_count == 1)
return;
+ ps = evsel->priv;
fprintf(stderr, " ( +- %7.3f%% )",
- 100 * stddev_stats(&event_res_stats[counter][0]) / avg);
+ 100 * stddev_stats(&ps->res_stats[0]) / avg);
}
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
{
double msecs = avg / 1e6;
+ char cpustr[16] = { '\0', };
+ const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-24s";
- fprintf(stderr, " %18.6f %-24s", msecs, event_name(counter));
+ if (no_aggr)
+ sprintf(cpustr, "CPU%*d%s",
+ csv_output ? 0 : -4,
+ cpus->map[cpu], csv_sep);
+
+ fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
+
+ if (csv_output)
+ return;
- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
+ if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
fprintf(stderr, " # %10.3f CPUs ",
avg / avg_stats(&walltime_nsecs_stats));
- }
}
-static void abs_printout(int counter, double avg)
+static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
{
double total, ratio = 0.0;
+ char cpustr[16] = { '\0', };
+ const char *fmt;
+
+ if (csv_output)
+ fmt = "%s%.0f%s%s";
+ else if (big_num)
+ fmt = "%s%'18.0f%s%-24s";
+ else
+ fmt = "%s%18.0f%s%-24s";
- if (big_num)
- fprintf(stderr, " %'18.0f %-24s", avg, event_name(counter));
+ if (no_aggr)
+ sprintf(cpustr, "CPU%*d%s",
+ csv_output ? 0 : -4,
+ cpus->map[cpu], csv_sep);
else
- fprintf(stderr, " %18.0f %-24s", avg, event_name(counter));
+ cpu = 0;
+
+ fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
- if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
- total = avg_stats(&runtime_cycles_stats);
+ if (csv_output)
+ return;
+
+ if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
if (total)
ratio = avg / total;
fprintf(stderr, " # %10.3f IPC ", ratio);
- } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
- runtime_branches_stats.n != 0) {
- total = avg_stats(&runtime_branches_stats);
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
+ runtime_branches_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_branches_stats[cpu]);
if (total)
ratio = avg * 100 / total;
fprintf(stderr, " # %10.3f %% ", ratio);
- } else if (runtime_nsecs_stats.n != 0) {
- total = avg_stats(&runtime_nsecs_stats);
+ } else if (runtime_nsecs_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_nsecs_stats[cpu]);
if (total)
ratio = 1000.0 * avg / total;
/*
* Print out the results of a single counter:
+ * aggregated counts in system-wide mode
*/
-static void print_counter(int counter)
+static void print_counter_aggr(struct perf_evsel *counter)
{
- double avg = avg_stats(&event_res_stats[counter][0]);
- int scaled = event_scaled[counter];
+ struct perf_stat *ps = counter->priv;
+ double avg = avg_stats(&ps->res_stats[0]);
+ int scaled = counter->counts->scaled;
if (scaled == -1) {
- fprintf(stderr, " %18s %-24s\n",
- "<not counted>", event_name(counter));
+ fprintf(stderr, "%*s%s%-24s\n",
+ csv_output ? 0 : 18,
+ "<not counted>", csv_sep, event_name(counter));
return;
}
if (nsec_counter(counter))
- nsec_printout(counter, avg);
+ nsec_printout(-1, counter, avg);
else
- abs_printout(counter, avg);
+ abs_printout(-1, counter, avg);
+
+ if (csv_output) {
+ fputc('\n', stderr);
+ return;
+ }
print_noise(counter, avg);
if (scaled) {
double avg_enabled, avg_running;
- avg_enabled = avg_stats(&event_res_stats[counter][1]);
- avg_running = avg_stats(&event_res_stats[counter][2]);
+ avg_enabled = avg_stats(&ps->res_stats[1]);
+ avg_running = avg_stats(&ps->res_stats[2]);
fprintf(stderr, " (scaled from %.2f%%)",
100 * avg_running / avg_enabled);
fprintf(stderr, "\n");
}
+/*
+ * Print out the results of a single counter:
+ * does not use aggregated count in system-wide
+ */
+static void print_counter(struct perf_evsel *counter)
+{
+ u64 ena, run, val;
+ int cpu;
+
+ for (cpu = 0; cpu < cpus->nr; cpu++) {
+ val = counter->counts->cpu[cpu].val;
+ ena = counter->counts->cpu[cpu].ena;
+ run = counter->counts->cpu[cpu].run;
+ if (run == 0 || ena == 0) {
+ fprintf(stderr, "CPU%*d%s%*s%s%-24s",
+ csv_output ? 0 : -4,
+ cpus->map[cpu], csv_sep,
+ csv_output ? 0 : 18,
+ "<not counted>", csv_sep,
+ event_name(counter));
+
+ fprintf(stderr, "\n");
+ continue;
+ }
+
+ if (nsec_counter(counter))
+ nsec_printout(cpu, counter, val);
+ else
+ abs_printout(cpu, counter, val);
+
+ if (!csv_output) {
+ print_noise(counter, 1.0);
+
+ if (run != ena) {
+ fprintf(stderr, " (scaled from %.2f%%)",
+ 100.0 * run / ena);
+ }
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
static void print_stat(int argc, const char **argv)
{
- int i, counter;
+ struct perf_evsel *counter;
+ int i;
fflush(stdout);
- fprintf(stderr, "\n");
- fprintf(stderr, " Performance counter stats for ");
- if(target_pid == -1 && target_tid == -1) {
- fprintf(stderr, "\'%s", argv[0]);
- for (i = 1; i < argc; i++)
- fprintf(stderr, " %s", argv[i]);
- } else if (target_pid != -1)
- fprintf(stderr, "process id \'%d", target_pid);
- else
- fprintf(stderr, "thread id \'%d", target_tid);
-
- fprintf(stderr, "\'");
- if (run_count > 1)
- fprintf(stderr, " (%d runs)", run_count);
- fprintf(stderr, ":\n\n");
+ if (!csv_output) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, " Performance counter stats for ");
+ if(target_pid == -1 && target_tid == -1) {
+ fprintf(stderr, "\'%s", argv[0]);
+ for (i = 1; i < argc; i++)
+ fprintf(stderr, " %s", argv[i]);
+ } else if (target_pid != -1)
+ fprintf(stderr, "process id \'%d", target_pid);
+ else
+ fprintf(stderr, "thread id \'%d", target_tid);
+
+ fprintf(stderr, "\'");
+ if (run_count > 1)
+ fprintf(stderr, " (%d runs)", run_count);
+ fprintf(stderr, ":\n\n");
+ }
- for (counter = 0; counter < nr_counters; counter++)
- print_counter(counter);
+ if (no_aggr) {
+ list_for_each_entry(counter, &evsel_list, node)
+ print_counter(counter);
+ } else {
+ list_for_each_entry(counter, &evsel_list, node)
+ print_counter_aggr(counter);
+ }
- fprintf(stderr, "\n");
- fprintf(stderr, " %18.9f seconds time elapsed",
- avg_stats(&walltime_nsecs_stats)/1e9);
- if (run_count > 1) {
- fprintf(stderr, " ( +- %7.3f%% )",
+ if (!csv_output) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, " %18.9f seconds time elapsed",
+ avg_stats(&walltime_nsecs_stats)/1e9);
+ if (run_count > 1) {
+ fprintf(stderr, " ( +- %7.3f%% )",
100*stddev_stats(&walltime_nsecs_stats) /
avg_stats(&walltime_nsecs_stats));
+ }
+ fprintf(stderr, "\n\n");
}
- fprintf(stderr, "\n\n");
}
static volatile int signr = -1;
NULL
};
+static int stat__set_big_num(const struct option *opt __used,
+ const char *s __used, int unset)
+{
+ big_num_opt = unset ? 0 : 1;
+ return 0;
+}
+
static const struct option options[] = {
OPT_CALLBACK('e', "event", NULL, "event",
"event selector. use 'perf list' to list available events",
"repeat command and print average + stddev (max: 100)"),
OPT_BOOLEAN('n', "null", &null_run,
"null run - dont start any counters"),
- OPT_BOOLEAN('B', "big-num", &big_num,
- "print large numbers with thousands\' separators"),
+ OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
+ "print large numbers with thousands\' separators",
+ stat__set_big_num),
OPT_STRING('C', "cpu", &cpu_list, "cpu",
"list of cpus to monitor in system-wide"),
+ OPT_BOOLEAN('A', "no-aggr", &no_aggr,
+ "disable CPU count aggregation"),
+ OPT_STRING('x', "field-separator", &csv_sep, "separator",
+ "print counts with custom separator"),
OPT_END()
};
int cmd_stat(int argc, const char **argv, const char *prefix __used)
{
- int status;
- int i,j;
+ struct perf_evsel *pos;
+ int status = -ENOMEM;
setlocale(LC_ALL, "");
argc = parse_options(argc, argv, options, stat_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
+
+ if (csv_sep)
+ csv_output = true;
+ else
+ csv_sep = DEFAULT_SEPARATOR;
+
+ /*
+ * let the spreadsheet do the pretty-printing
+ */
+ if (csv_output) {
+ /* User explicitely passed -B? */
+ if (big_num_opt == 1) {
+ fprintf(stderr, "-B option not supported with -x\n");
+ usage_with_options(stat_usage, options);
+ } else /* Nope, so disable big number formatting */
+ big_num = false;
+ } else if (big_num_opt == 0) /* User passed --no-big-num */
+ big_num = false;
+
if (!argc && target_pid == -1 && target_tid == -1)
usage_with_options(stat_usage, options);
if (run_count <= 0)
usage_with_options(stat_usage, options);
+ /* no_aggr is for system-wide only */
+ if (no_aggr && !system_wide)
+ usage_with_options(stat_usage, options);
+
/* Set attrs and nr_counters if no event is selected and !null_run */
if (!null_run && !nr_counters) {
- memcpy(attrs, default_attrs, sizeof(default_attrs));
+ size_t c;
+
nr_counters = ARRAY_SIZE(default_attrs);
+
+ for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) {
+ pos = perf_evsel__new(default_attrs[c].type,
+ default_attrs[c].config,
+ nr_counters);
+ if (pos == NULL)
+ goto out;
+ list_add(&pos->node, &evsel_list);
+ }
}
- if (system_wide)
- nr_cpus = read_cpu_map(cpu_list);
- else
- nr_cpus = 1;
+ if (target_pid != -1)
+ target_tid = target_pid;
- if (nr_cpus < 1)
+ threads = thread_map__new(target_pid, target_tid);
+ if (threads == NULL) {
+ pr_err("Problems finding threads of monitor\n");
usage_with_options(stat_usage, options);
+ }
- if (target_pid != -1) {
- target_tid = target_pid;
- thread_num = find_all_tid(target_pid, &all_tids);
- if (thread_num <= 0) {
- fprintf(stderr, "Can't find all threads of pid %d\n",
- target_pid);
- usage_with_options(stat_usage, options);
- }
- } else {
- all_tids=malloc(sizeof(pid_t));
- if (!all_tids)
- return -ENOMEM;
+ if (system_wide)
+ cpus = cpu_map__new(cpu_list);
+ else
+ cpus = cpu_map__dummy_new();
- all_tids[0] = target_tid;
- thread_num = 1;
+ if (cpus == NULL) {
+ perror("failed to parse CPUs map");
+ usage_with_options(stat_usage, options);
+ return -1;
}
- for (i = 0; i < MAX_NR_CPUS; i++) {
- for (j = 0; j < MAX_COUNTERS; j++) {
- fd[i][j] = malloc(sizeof(int)*thread_num);
- if (!fd[i][j])
- return -ENOMEM;
- }
+ list_for_each_entry(pos, &evsel_list, node) {
+ if (perf_evsel__alloc_stat_priv(pos) < 0 ||
+ perf_evsel__alloc_counts(pos, cpus->nr) < 0 ||
+ perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+ goto out_free_fd;
}
/*
if (status != -1)
print_stat(argc, argv);
-
+out_free_fd:
+ list_for_each_entry(pos, &evsel_list, node)
+ perf_evsel__free_stat_priv(pos);
+out:
+ thread_map__delete(threads);
+ threads = NULL;
return status;
}
* end addresses too.
*/
for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
- struct symbol *pair;
+ struct symbol *pair, *first_pair;
+ bool backwards = true;
sym = rb_entry(nd, struct symbol, rb_node);
- pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL);
+
+ if (sym->start == sym->end)
+ continue;
+
+ first_pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL);
+ pair = first_pair;
if (pair && pair->start == sym->start) {
next_pair:
pr_debug("%#Lx: diff end addr for %s v: %#Lx k: %#Lx\n",
sym->start, sym->name, sym->end, pair->end);
} else {
- struct rb_node *nnd = rb_prev(&pair->rb_node);
-
+ struct rb_node *nnd;
+detour:
+ nnd = backwards ? rb_prev(&pair->rb_node) :
+ rb_next(&pair->rb_node);
if (nnd) {
struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
goto next_pair;
}
}
+
+ if (backwards) {
+ backwards = false;
+ pair = first_pair;
+ goto detour;
+ }
+
pr_debug("%#Lx: diff name v: %s k: %s\n",
sym->start, sym->name, pair->name);
}
return err;
}
+#include "util/evsel.h"
+#include <sys/types.h>
+
+static int trace_event__id(const char *event_name)
+{
+ char *filename;
+ int err = -1, fd;
+
+ if (asprintf(&filename,
+ "/sys/kernel/debug/tracing/events/syscalls/%s/id",
+ event_name) < 0)
+ return -1;
+
+ fd = open(filename, O_RDONLY);
+ if (fd >= 0) {
+ char id[16];
+ if (read(fd, id, sizeof(id)) > 0)
+ err = atoi(id);
+ close(fd);
+ }
+
+ free(filename);
+ return err;
+}
+
+static int test__open_syscall_event(void)
+{
+ int err = -1, fd;
+ struct thread_map *threads;
+ struct perf_evsel *evsel;
+ unsigned int nr_open_calls = 111, i;
+ int id = trace_event__id("sys_enter_open");
+
+ if (id < 0) {
+ pr_debug("is debugfs mounted on /sys/kernel/debug?\n");
+ return -1;
+ }
+
+ threads = thread_map__new(-1, getpid());
+ if (threads == NULL) {
+ pr_debug("thread_map__new\n");
+ return -1;
+ }
+
+ evsel = perf_evsel__new(PERF_TYPE_TRACEPOINT, id, 0);
+ if (evsel == NULL) {
+ pr_debug("perf_evsel__new\n");
+ goto out_thread_map_delete;
+ }
+
+ if (perf_evsel__open_per_thread(evsel, threads) < 0) {
+ pr_debug("failed to open counter: %s, "
+ "tweak /proc/sys/kernel/perf_event_paranoid?\n",
+ strerror(errno));
+ goto out_evsel_delete;
+ }
+
+ for (i = 0; i < nr_open_calls; ++i) {
+ fd = open("/etc/passwd", O_RDONLY);
+ close(fd);
+ }
+
+ if (perf_evsel__read_on_cpu(evsel, 0, 0) < 0) {
+ pr_debug("perf_evsel__open_read_on_cpu\n");
+ goto out_close_fd;
+ }
+
+ if (evsel->counts->cpu[0].val != nr_open_calls) {
+ pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %Ld\n",
+ nr_open_calls, evsel->counts->cpu[0].val);
+ goto out_close_fd;
+ }
+
+ err = 0;
+out_close_fd:
+ perf_evsel__close_fd(evsel, 1, threads->nr);
+out_evsel_delete:
+ perf_evsel__delete(evsel);
+out_thread_map_delete:
+ thread_map__delete(threads);
+ return err;
+}
+
static struct test {
const char *desc;
int (*func)(void);
.desc = "vmlinux symtab matches kallsyms",
.func = test__vmlinux_matches_kallsyms,
},
+ {
+ .desc = "detect open syscall event",
+ .func = test__open_syscall_event,
+ },
{
.func = NULL,
},
#include "util/session.h"
#include "util/svghelper.h"
+#define SUPPORT_OLD_POWER_EVENTS 1
+#define PWR_EVENT_EXIT -1
+
+
static char const *input_name = "perf.data";
static char const *output_name = "output.svg";
static u64 cpus_pstate_start_times[MAX_CPUS];
static u64 cpus_pstate_state[MAX_CPUS];
-static int process_comm_event(event_t *event, struct perf_session *session __used)
+static int process_comm_event(event_t *event, struct sample_data *sample __used,
+ struct perf_session *session __used)
{
pid_set_comm(event->comm.tid, event->comm.comm);
return 0;
}
-static int process_fork_event(event_t *event, struct perf_session *session __used)
+static int process_fork_event(event_t *event, struct sample_data *sample __used,
+ struct perf_session *session __used)
{
pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
return 0;
}
-static int process_exit_event(event_t *event, struct perf_session *session __used)
+static int process_exit_event(event_t *event, struct sample_data *sample __used,
+ struct perf_session *session __used)
{
pid_exit(event->fork.pid, event->fork.time);
return 0;
int lock_depth;
};
-struct power_entry {
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static int use_old_power_events;
+struct power_entry_old {
struct trace_entry te;
u64 type;
u64 value;
u64 cpu_id;
};
+#endif
+
+struct power_processor_entry {
+ struct trace_entry te;
+ u32 state;
+ u32 cpu_id;
+};
#define TASK_COMM_LEN 16
struct wakeup_entry {
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event __used,
+ struct sample_data *sample,
+ struct perf_session *session)
{
- struct sample_data data;
struct trace_entry *te;
- memset(&data, 0, sizeof(data));
-
- event__parse_sample(event, session->sample_type, &data);
-
if (session->sample_type & PERF_SAMPLE_TIME) {
- if (!first_time || first_time > data.time)
- first_time = data.time;
- if (last_time < data.time)
- last_time = data.time;
+ if (!first_time || first_time > sample->time)
+ first_time = sample->time;
+ if (last_time < sample->time)
+ last_time = sample->time;
}
- te = (void *)data.raw_data;
- if (session->sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) {
+ te = (void *)sample->raw_data;
+ if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) {
char *event_str;
- struct power_entry *pe;
-
- pe = (void *)te;
-
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ struct power_entry_old *peo;
+ peo = (void *)te;
+#endif
event_str = perf_header__find_event(te->type);
if (!event_str)
return 0;
- if (strcmp(event_str, "power:power_start") == 0)
- c_state_start(pe->cpu_id, data.time, pe->value);
+ if (strcmp(event_str, "power:cpu_idle") == 0) {
+ struct power_processor_entry *ppe = (void *)te;
+ if (ppe->state == (u32)PWR_EVENT_EXIT)
+ c_state_end(ppe->cpu_id, sample->time);
+ else
+ c_state_start(ppe->cpu_id, sample->time,
+ ppe->state);
+ }
+ else if (strcmp(event_str, "power:cpu_frequency") == 0) {
+ struct power_processor_entry *ppe = (void *)te;
+ p_state_change(ppe->cpu_id, sample->time, ppe->state);
+ }
+
+ else if (strcmp(event_str, "sched:sched_wakeup") == 0)
+ sched_wakeup(sample->cpu, sample->time, sample->pid, te);
- if (strcmp(event_str, "power:power_end") == 0)
- c_state_end(pe->cpu_id, data.time);
+ else if (strcmp(event_str, "sched:sched_switch") == 0)
+ sched_switch(sample->cpu, sample->time, te);
- if (strcmp(event_str, "power:power_frequency") == 0)
- p_state_change(pe->cpu_id, data.time, pe->value);
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ if (use_old_power_events) {
+ if (strcmp(event_str, "power:power_start") == 0)
+ c_state_start(peo->cpu_id, sample->time,
+ peo->value);
- if (strcmp(event_str, "sched:sched_wakeup") == 0)
- sched_wakeup(data.cpu, data.time, data.pid, te);
+ else if (strcmp(event_str, "power:power_end") == 0)
+ c_state_end(sample->cpu, sample->time);
- if (strcmp(event_str, "sched:sched_switch") == 0)
- sched_switch(data.cpu, data.time, te);
+ else if (strcmp(event_str,
+ "power:power_frequency") == 0)
+ p_state_change(peo->cpu_id, sample->time,
+ peo->value);
+ }
+#endif
}
return 0;
}
static int __cmd_timechart(void)
{
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
+ struct perf_session *session = perf_session__new(input_name, O_RDONLY,
+ 0, false, &event_ops);
int ret = -EINVAL;
if (session == NULL)
NULL
};
-static const char *record_args[] = {
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static const char * const record_old_args[] = {
"record",
"-a",
"-R",
"-e", "sched:sched_wakeup",
"-e", "sched:sched_switch",
};
+#endif
+
+static const char * const record_new_args[] = {
+ "record",
+ "-a",
+ "-R",
+ "-f",
+ "-c", "1",
+ "-e", "power:cpu_frequency",
+ "-e", "power:cpu_idle",
+ "-e", "sched:sched_wakeup",
+ "-e", "sched:sched_switch",
+};
static int __cmd_record(int argc, const char **argv)
{
unsigned int rec_argc, i, j;
const char **rec_argv;
+ const char * const *record_args = record_new_args;
+ unsigned int record_elems = ARRAY_SIZE(record_new_args);
+
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ if (!is_valid_tracepoint("power:cpu_idle") &&
+ is_valid_tracepoint("power:power_start")) {
+ use_old_power_events = 1;
+ record_args = record_old_args;
+ record_elems = ARRAY_SIZE(record_old_args);
+ }
+#endif
- rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+ rec_argc = record_elems + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
- for (i = 0; i < ARRAY_SIZE(record_args); i++)
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < record_elems; i++)
rec_argv[i] = strdup(record_args[i]);
for (j = 1; j < (unsigned int)argc; j++, i++)
OPT_CALLBACK('p', "process", NULL, "process",
"process selector. Pass a pid or process name.",
parse_process),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
OPT_END()
};
#include "perf.h"
#include "util/color.h"
+#include "util/evsel.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/cpumap.h"
+#include "util/xyarray.h"
#include "util/debug.h"
#include <linux/unistd.h>
#include <linux/types.h>
-static int *fd[MAX_NR_CPUS][MAX_COUNTERS];
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
static bool system_wide = false;
static int target_pid = -1;
static int target_tid = -1;
-static pid_t *all_tids = NULL;
-static int thread_num = 0;
+static struct thread_map *threads;
static bool inherit = false;
-static int nr_cpus = 0;
+static struct cpu_map *cpus;
static int realtime_prio = 0;
static bool group = false;
static unsigned int page_size;
struct sym_entry *sym_filter_entry_sched = NULL;
static int sym_pcnt_filter = 5;
static int sym_counter = 0;
+static struct perf_evsel *sym_evsel = NULL;
static int display_weighted = -1;
static const char *cpu_list;
return;
symbol = sym_entry__symbol(syme);
- printf("Showing %s for %s\n", event_name(sym_counter), symbol->name);
+ printf("Showing %s for %s\n", event_name(sym_evsel), symbol->name);
printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter);
pthread_mutex_lock(&syme->src->lock);
static void print_sym_table(void)
{
int printed = 0, j;
- int counter, snap = !display_weighted ? sym_counter : 0;
+ struct perf_evsel *counter;
+ int snap = !display_weighted ? sym_counter : 0;
float samples_per_sec = samples/delay_secs;
float ksamples_per_sec = kernel_samples/delay_secs;
float us_samples_per_sec = (us_samples)/delay_secs;
}
if (nr_counters == 1 || !display_weighted) {
- printf("%Ld", (u64)attrs[0].sample_period);
+ struct perf_evsel *first;
+ first = list_entry(evsel_list.next, struct perf_evsel, node);
+ printf("%Ld", first->attr.sample_period);
if (freq)
printf("Hz ");
else
}
if (!display_weighted)
- printf("%s", event_name(sym_counter));
- else for (counter = 0; counter < nr_counters; counter++) {
- if (counter)
+ printf("%s", event_name(sym_evsel));
+ else list_for_each_entry(counter, &evsel_list, node) {
+ if (counter->idx)
printf("/");
printf("%s", event_name(counter));
printf(" (all");
if (cpu_list)
- printf(", CPU%s: %s)\n", nr_cpus > 1 ? "s" : "", cpu_list);
+ printf(", CPU%s: %s)\n", cpus->nr > 1 ? "s" : "", cpu_list);
else {
if (target_tid != -1)
printf(")\n");
else
- printf(", %d CPU%s)\n", nr_cpus, nr_cpus > 1 ? "s" : "");
+ printf(", %d CPU%s)\n", cpus->nr, cpus->nr > 1 ? "s" : "");
}
printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries);
if (nr_counters > 1)
- fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter));
+ fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_evsel));
fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter);
break;
case 'E':
if (nr_counters > 1) {
- int i;
-
fprintf(stderr, "\nAvailable events:");
- for (i = 0; i < nr_counters; i++)
- fprintf(stderr, "\n\t%d %s", i, event_name(i));
+
+ list_for_each_entry(sym_evsel, &evsel_list, node)
+ fprintf(stderr, "\n\t%d %s", sym_evsel->idx, event_name(sym_evsel));
prompt_integer(&sym_counter, "Enter details event counter");
if (sym_counter >= nr_counters) {
- fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0));
+ sym_evsel = list_entry(evsel_list.next, struct perf_evsel, node);
sym_counter = 0;
+ fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(sym_evsel));
sleep(1);
+ break;
}
+ list_for_each_entry(sym_evsel, &evsel_list, node)
+ if (sym_evsel->idx == sym_counter)
+ break;
} else sym_counter = 0;
break;
case 'f':
}
static void event__process_sample(const event_t *self,
- struct perf_session *session, int counter)
+ struct sample_data *sample,
+ struct perf_session *session,
+ struct perf_evsel *evsel)
{
u64 ip = self->ip.ip;
struct sym_entry *syme;
struct addr_location al;
- struct sample_data data;
struct machine *machine;
u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
exact_samples++;
- if (event__preprocess_sample(self, session, &al, &data,
+ if (event__preprocess_sample(self, session, &al, sample,
symbol_filter) < 0 ||
al.filtered)
return;
syme = symbol__priv(al.sym);
if (!syme->skip) {
- syme->count[counter]++;
+ syme->count[evsel->idx]++;
syme->origin = origin;
- record_precise_ip(syme, counter, ip);
+ record_precise_ip(syme, evsel->idx, ip);
pthread_mutex_lock(&active_symbols_lock);
if (list_empty(&syme->node) || !syme->node.next)
__list_insert_active_sym(syme);
}
struct mmap_data {
- int counter;
void *base;
int mask;
unsigned int prev;
};
+static int perf_evsel__alloc_mmap_per_thread(struct perf_evsel *evsel,
+ int ncpus, int nthreads)
+{
+ evsel->priv = xyarray__new(ncpus, nthreads, sizeof(struct mmap_data));
+ return evsel->priv != NULL ? 0 : -ENOMEM;
+}
+
+static void perf_evsel__free_mmap(struct perf_evsel *evsel)
+{
+ xyarray__delete(evsel->priv);
+ evsel->priv = NULL;
+}
+
static unsigned int mmap_read_head(struct mmap_data *md)
{
struct perf_event_mmap_page *pc = md->base;
}
static void perf_session__mmap_read_counter(struct perf_session *self,
- struct mmap_data *md)
+ struct perf_evsel *evsel,
+ int cpu, int thread_idx)
{
+ struct xyarray *mmap_array = evsel->priv;
+ struct mmap_data *md = xyarray__entry(mmap_array, cpu, thread_idx);
unsigned int head = mmap_read_head(md);
unsigned int old = md->prev;
unsigned char *data = md->base + page_size;
+ struct sample_data sample;
int diff;
/*
event = &event_copy;
}
+ event__parse_sample(event, self, &sample);
if (event->header.type == PERF_RECORD_SAMPLE)
- event__process_sample(event, self, md->counter);
+ event__process_sample(event, &sample, self, evsel);
else
- event__process(event, self);
+ event__process(event, &sample, self);
old += size;
}
}
static struct pollfd *event_array;
-static struct mmap_data *mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
static void perf_session__mmap_read(struct perf_session *self)
{
- int i, counter, thread_index;
+ struct perf_evsel *counter;
+ int i, thread_index;
- for (i = 0; i < nr_cpus; i++) {
- for (counter = 0; counter < nr_counters; counter++)
+ for (i = 0; i < cpus->nr; i++) {
+ list_for_each_entry(counter, &evsel_list, node) {
for (thread_index = 0;
- thread_index < thread_num;
+ thread_index < threads->nr;
thread_index++) {
perf_session__mmap_read_counter(self,
- &mmap_array[i][counter][thread_index]);
+ counter, i, thread_index);
}
+ }
}
}
int nr_poll;
int group_fd;
-static void start_counter(int i, int counter)
+static void start_counter(int i, struct perf_evsel *evsel)
{
+ struct xyarray *mmap_array = evsel->priv;
+ struct mmap_data *mm;
struct perf_event_attr *attr;
int cpu = -1;
int thread_index;
if (target_tid == -1)
- cpu = cpumap[i];
+ cpu = cpus->map[i];
- attr = attrs + counter;
+ attr = &evsel->attr;
attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
attr->inherit = (cpu < 0) && inherit;
attr->mmap = 1;
- for (thread_index = 0; thread_index < thread_num; thread_index++) {
+ for (thread_index = 0; thread_index < threads->nr; thread_index++) {
try_again:
- fd[i][counter][thread_index] = sys_perf_event_open(attr,
- all_tids[thread_index], cpu, group_fd, 0);
+ FD(evsel, i, thread_index) = sys_perf_event_open(attr,
+ threads->map[thread_index], cpu, group_fd, 0);
- if (fd[i][counter][thread_index] < 0) {
+ if (FD(evsel, i, thread_index) < 0) {
int err = errno;
if (err == EPERM || err == EACCES)
- die("No permission - are you root?\n");
+ die("Permission error - are you root?\n"
+ "\t Consider tweaking"
+ " /proc/sys/kernel/perf_event_paranoid.\n");
/*
* If it's cycles then fall back to hrtimer
* based cpu-clock-tick sw counter, which
goto try_again;
}
printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
- fd[i][counter][thread_index], strerror(err));
+ error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
+ FD(evsel, i, thread_index), strerror(err));
die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
exit(-1);
}
- assert(fd[i][counter][thread_index] >= 0);
- fcntl(fd[i][counter][thread_index], F_SETFL, O_NONBLOCK);
+ assert(FD(evsel, i, thread_index) >= 0);
+ fcntl(FD(evsel, i, thread_index), F_SETFL, O_NONBLOCK);
/*
* First counter acts as the group leader:
*/
if (group && group_fd == -1)
- group_fd = fd[i][counter][thread_index];
+ group_fd = FD(evsel, i, thread_index);
- event_array[nr_poll].fd = fd[i][counter][thread_index];
+ event_array[nr_poll].fd = FD(evsel, i, thread_index);
event_array[nr_poll].events = POLLIN;
nr_poll++;
- mmap_array[i][counter][thread_index].counter = counter;
- mmap_array[i][counter][thread_index].prev = 0;
- mmap_array[i][counter][thread_index].mask = mmap_pages*page_size - 1;
- mmap_array[i][counter][thread_index].base = mmap(NULL, (mmap_pages+1)*page_size,
- PROT_READ, MAP_SHARED, fd[i][counter][thread_index], 0);
- if (mmap_array[i][counter][thread_index].base == MAP_FAILED)
+ mm = xyarray__entry(mmap_array, i, thread_index);
+ mm->prev = 0;
+ mm->mask = mmap_pages*page_size - 1;
+ mm->base = mmap(NULL, (mmap_pages+1)*page_size,
+ PROT_READ, MAP_SHARED, FD(evsel, i, thread_index), 0);
+ if (mm->base == MAP_FAILED)
die("failed to mmap with %d (%s)\n", errno, strerror(errno));
}
}
static int __cmd_top(void)
{
pthread_t thread;
- int i, counter;
- int ret;
+ struct perf_evsel *counter;
+ int i, ret;
/*
* FIXME: perf_session__new should allow passing a O_MMAP, so that all this
* mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
*/
- struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false);
+ struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false, NULL);
if (session == NULL)
return -ENOMEM;
else
event__synthesize_threads(event__process, session);
- for (i = 0; i < nr_cpus; i++) {
+ for (i = 0; i < cpus->nr; i++) {
group_fd = -1;
- for (counter = 0; counter < nr_counters; counter++)
+ list_for_each_entry(counter, &evsel_list, node)
start_counter(i, counter);
}
int cmd_top(int argc, const char **argv, const char *prefix __used)
{
- int counter;
- int i,j;
+ struct perf_evsel *pos;
+ int status = -ENOMEM;
page_size = sysconf(_SC_PAGE_SIZE);
if (argc)
usage_with_options(top_usage, options);
- if (target_pid != -1) {
+ if (target_pid != -1)
target_tid = target_pid;
- thread_num = find_all_tid(target_pid, &all_tids);
- if (thread_num <= 0) {
- fprintf(stderr, "Can't find all threads of pid %d\n",
- target_pid);
- usage_with_options(top_usage, options);
- }
- } else {
- all_tids=malloc(sizeof(pid_t));
- if (!all_tids)
- return -ENOMEM;
- all_tids[0] = target_tid;
- thread_num = 1;
+ threads = thread_map__new(target_pid, target_tid);
+ if (threads == NULL) {
+ pr_err("Problems finding threads of monitor\n");
+ usage_with_options(top_usage, options);
}
- for (i = 0; i < MAX_NR_CPUS; i++) {
- for (j = 0; j < MAX_COUNTERS; j++) {
- fd[i][j] = malloc(sizeof(int)*thread_num);
- mmap_array[i][j] = zalloc(
- sizeof(struct mmap_data)*thread_num);
- if (!fd[i][j] || !mmap_array[i][j])
- return -ENOMEM;
- }
- }
- event_array = malloc(
- sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+ event_array = malloc((sizeof(struct pollfd) *
+ MAX_NR_CPUS * MAX_COUNTERS * threads->nr));
if (!event_array)
return -ENOMEM;
cpu_list = NULL;
}
- if (!nr_counters)
- nr_counters = 1;
-
- symbol_conf.priv_size = (sizeof(struct sym_entry) +
- (nr_counters + 1) * sizeof(unsigned long));
-
- symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
- if (symbol__init() < 0)
- return -1;
+ if (!nr_counters && perf_evsel_list__create_default() < 0) {
+ pr_err("Not enough memory for event selector list\n");
+ return -ENOMEM;
+ }
if (delay_secs < 1)
delay_secs = 1;
exit(EXIT_FAILURE);
}
- /*
- * Fill in the ones not specifically initialized via -c:
- */
- for (counter = 0; counter < nr_counters; counter++) {
- if (attrs[counter].sample_period)
+ if (target_tid != -1)
+ cpus = cpu_map__dummy_new();
+ else
+ cpus = cpu_map__new(cpu_list);
+
+ if (cpus == NULL)
+ usage_with_options(top_usage, options);
+
+ list_for_each_entry(pos, &evsel_list, node) {
+ if (perf_evsel__alloc_mmap_per_thread(pos, cpus->nr, threads->nr) < 0 ||
+ perf_evsel__alloc_fd(pos, cpus->nr, threads->nr) < 0)
+ goto out_free_fd;
+ /*
+ * Fill in the ones not specifically initialized via -c:
+ */
+ if (pos->attr.sample_period)
continue;
- attrs[counter].sample_period = default_interval;
+ pos->attr.sample_period = default_interval;
}
- if (target_tid != -1)
- nr_cpus = 1;
- else
- nr_cpus = read_cpu_map(cpu_list);
+ symbol_conf.priv_size = (sizeof(struct sym_entry) +
+ (nr_counters + 1) * sizeof(unsigned long));
- if (nr_cpus < 1)
- usage_with_options(top_usage, options);
+ symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
+ if (symbol__init() < 0)
+ return -1;
get_term_dimensions(&winsize);
if (print_entries == 0) {
signal(SIGWINCH, sig_winch_handler);
}
- return __cmd_top();
+ status = __cmd_top();
+out_free_fd:
+ list_for_each_entry(pos, &evsel_list, node)
+ perf_evsel__free_mmap(pos);
+
+ return status;
}
+++ /dev/null
-#include "builtin.h"
-
-#include "perf.h"
-#include "util/cache.h"
-#include "util/debug.h"
-#include "util/exec_cmd.h"
-#include "util/header.h"
-#include "util/parse-options.h"
-#include "util/session.h"
-#include "util/symbol.h"
-#include "util/thread.h"
-#include "util/trace-event.h"
-#include "util/parse-options.h"
-#include "util/util.h"
-
-static char const *script_name;
-static char const *generate_script_lang;
-static bool debug_mode;
-static u64 last_timestamp;
-static u64 nr_unordered;
-extern const struct option record_options[];
-
-static int default_start_script(const char *script __unused,
- int argc __unused,
- const char **argv __unused)
-{
- return 0;
-}
-
-static int default_stop_script(void)
-{
- return 0;
-}
-
-static int default_generate_script(const char *outfile __unused)
-{
- return 0;
-}
-
-static struct scripting_ops default_scripting_ops = {
- .start_script = default_start_script,
- .stop_script = default_stop_script,
- .process_event = print_event,
- .generate_script = default_generate_script,
-};
-
-static struct scripting_ops *scripting_ops;
-
-static void setup_scripting(void)
-{
- setup_perl_scripting();
- setup_python_scripting();
-
- scripting_ops = &default_scripting_ops;
-}
-
-static int cleanup_scripting(void)
-{
- pr_debug("\nperf trace script stopped\n");
-
- return scripting_ops->stop_script();
-}
-
-static char const *input_name = "perf.data";
-
-static int process_sample_event(event_t *event, struct perf_session *session)
-{
- struct sample_data data;
- struct thread *thread;
-
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = 1;
-
- event__parse_sample(event, session->sample_type, &data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
-
- thread = perf_session__findnew(session, event->ip.pid);
- if (thread == NULL) {
- pr_debug("problem processing %d event, skipping it.\n",
- event->header.type);
- return -1;
- }
-
- if (session->sample_type & PERF_SAMPLE_RAW) {
- if (debug_mode) {
- if (data.time < last_timestamp) {
- pr_err("Samples misordered, previous: %llu "
- "this: %llu\n", last_timestamp,
- data.time);
- nr_unordered++;
- }
- last_timestamp = data.time;
- return 0;
- }
- /*
- * FIXME: better resolve from pid from the struct trace_entry
- * field, although it should be the same than this perf
- * event pid
- */
- scripting_ops->process_event(data.cpu, data.raw_data,
- data.raw_size,
- data.time, thread->comm);
- }
-
- session->hists.stats.total_period += data.period;
- return 0;
-}
-
-static u64 nr_lost;
-
-static int process_lost_event(event_t *event, struct perf_session *session __used)
-{
- nr_lost += event->lost.lost;
-
- return 0;
-}
-
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .comm = event__process_comm,
- .attr = event__process_attr,
- .event_type = event__process_event_type,
- .tracing_data = event__process_tracing_data,
- .build_id = event__process_build_id,
- .lost = process_lost_event,
- .ordered_samples = true,
-};
-
-extern volatile int session_done;
-
-static void sig_handler(int sig __unused)
-{
- session_done = 1;
-}
-
-static int __cmd_trace(struct perf_session *session)
-{
- int ret;
-
- signal(SIGINT, sig_handler);
-
- ret = perf_session__process_events(session, &event_ops);
-
- if (debug_mode) {
- pr_err("Misordered timestamps: %llu\n", nr_unordered);
- pr_err("Lost events: %llu\n", nr_lost);
- }
-
- return ret;
-}
-
-struct script_spec {
- struct list_head node;
- struct scripting_ops *ops;
- char spec[0];
-};
-
-LIST_HEAD(script_specs);
-
-static struct script_spec *script_spec__new(const char *spec,
- struct scripting_ops *ops)
-{
- struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
-
- if (s != NULL) {
- strcpy(s->spec, spec);
- s->ops = ops;
- }
-
- return s;
-}
-
-static void script_spec__delete(struct script_spec *s)
-{
- free(s->spec);
- free(s);
-}
-
-static void script_spec__add(struct script_spec *s)
-{
- list_add_tail(&s->node, &script_specs);
-}
-
-static struct script_spec *script_spec__find(const char *spec)
-{
- struct script_spec *s;
-
- list_for_each_entry(s, &script_specs, node)
- if (strcasecmp(s->spec, spec) == 0)
- return s;
- return NULL;
-}
-
-static struct script_spec *script_spec__findnew(const char *spec,
- struct scripting_ops *ops)
-{
- struct script_spec *s = script_spec__find(spec);
-
- if (s)
- return s;
-
- s = script_spec__new(spec, ops);
- if (!s)
- goto out_delete_spec;
-
- script_spec__add(s);
-
- return s;
-
-out_delete_spec:
- script_spec__delete(s);
-
- return NULL;
-}
-
-int script_spec_register(const char *spec, struct scripting_ops *ops)
-{
- struct script_spec *s;
-
- s = script_spec__find(spec);
- if (s)
- return -1;
-
- s = script_spec__findnew(spec, ops);
- if (!s)
- return -1;
-
- return 0;
-}
-
-static struct scripting_ops *script_spec__lookup(const char *spec)
-{
- struct script_spec *s = script_spec__find(spec);
- if (!s)
- return NULL;
-
- return s->ops;
-}
-
-static void list_available_languages(void)
-{
- struct script_spec *s;
-
- fprintf(stderr, "\n");
- fprintf(stderr, "Scripting language extensions (used in "
- "perf trace -s [spec:]script.[spec]):\n\n");
-
- list_for_each_entry(s, &script_specs, node)
- fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
-
- fprintf(stderr, "\n");
-}
-
-static int parse_scriptname(const struct option *opt __used,
- const char *str, int unset __used)
-{
- char spec[PATH_MAX];
- const char *script, *ext;
- int len;
-
- if (strcmp(str, "lang") == 0) {
- list_available_languages();
- exit(0);
- }
-
- script = strchr(str, ':');
- if (script) {
- len = script - str;
- if (len >= PATH_MAX) {
- fprintf(stderr, "invalid language specifier");
- return -1;
- }
- strncpy(spec, str, len);
- spec[len] = '\0';
- scripting_ops = script_spec__lookup(spec);
- if (!scripting_ops) {
- fprintf(stderr, "invalid language specifier");
- return -1;
- }
- script++;
- } else {
- script = str;
- ext = strrchr(script, '.');
- if (!ext) {
- fprintf(stderr, "invalid script extension");
- return -1;
- }
- scripting_ops = script_spec__lookup(++ext);
- if (!scripting_ops) {
- fprintf(stderr, "invalid script extension");
- return -1;
- }
- }
-
- script_name = strdup(script);
-
- return 0;
-}
-
-#define for_each_lang(scripts_dir, lang_dirent, lang_next) \
- while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
- lang_next) \
- if (lang_dirent.d_type == DT_DIR && \
- (strcmp(lang_dirent.d_name, ".")) && \
- (strcmp(lang_dirent.d_name, "..")))
-
-#define for_each_script(lang_dir, script_dirent, script_next) \
- while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
- script_next) \
- if (script_dirent.d_type != DT_DIR)
-
-
-#define RECORD_SUFFIX "-record"
-#define REPORT_SUFFIX "-report"
-
-struct script_desc {
- struct list_head node;
- char *name;
- char *half_liner;
- char *args;
-};
-
-LIST_HEAD(script_descs);
-
-static struct script_desc *script_desc__new(const char *name)
-{
- struct script_desc *s = zalloc(sizeof(*s));
-
- if (s != NULL && name)
- s->name = strdup(name);
-
- return s;
-}
-
-static void script_desc__delete(struct script_desc *s)
-{
- free(s->name);
- free(s->half_liner);
- free(s->args);
- free(s);
-}
-
-static void script_desc__add(struct script_desc *s)
-{
- list_add_tail(&s->node, &script_descs);
-}
-
-static struct script_desc *script_desc__find(const char *name)
-{
- struct script_desc *s;
-
- list_for_each_entry(s, &script_descs, node)
- if (strcasecmp(s->name, name) == 0)
- return s;
- return NULL;
-}
-
-static struct script_desc *script_desc__findnew(const char *name)
-{
- struct script_desc *s = script_desc__find(name);
-
- if (s)
- return s;
-
- s = script_desc__new(name);
- if (!s)
- goto out_delete_desc;
-
- script_desc__add(s);
-
- return s;
-
-out_delete_desc:
- script_desc__delete(s);
-
- return NULL;
-}
-
-static char *ends_with(char *str, const char *suffix)
-{
- size_t suffix_len = strlen(suffix);
- char *p = str;
-
- if (strlen(str) > suffix_len) {
- p = str + strlen(str) - suffix_len;
- if (!strncmp(p, suffix, suffix_len))
- return p;
- }
-
- return NULL;
-}
-
-static char *ltrim(char *str)
-{
- int len = strlen(str);
-
- while (len && isspace(*str)) {
- len--;
- str++;
- }
-
- return str;
-}
-
-static int read_script_info(struct script_desc *desc, const char *filename)
-{
- char line[BUFSIZ], *p;
- FILE *fp;
-
- fp = fopen(filename, "r");
- if (!fp)
- return -1;
-
- while (fgets(line, sizeof(line), fp)) {
- p = ltrim(line);
- if (strlen(p) == 0)
- continue;
- if (*p != '#')
- continue;
- p++;
- if (strlen(p) && *p == '!')
- continue;
-
- p = ltrim(p);
- if (strlen(p) && p[strlen(p) - 1] == '\n')
- p[strlen(p) - 1] = '\0';
-
- if (!strncmp(p, "description:", strlen("description:"))) {
- p += strlen("description:");
- desc->half_liner = strdup(ltrim(p));
- continue;
- }
-
- if (!strncmp(p, "args:", strlen("args:"))) {
- p += strlen("args:");
- desc->args = strdup(ltrim(p));
- continue;
- }
- }
-
- fclose(fp);
-
- return 0;
-}
-
-static int list_available_scripts(const struct option *opt __used,
- const char *s __used, int unset __used)
-{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
- char scripts_path[MAXPATHLEN];
- DIR *scripts_dir, *lang_dir;
- char script_path[MAXPATHLEN];
- char lang_path[MAXPATHLEN];
- struct script_desc *desc;
- char first_half[BUFSIZ];
- char *script_root;
- char *str;
-
- snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
-
- scripts_dir = opendir(scripts_path);
- if (!scripts_dir)
- return -1;
-
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
- snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
- lang_dir = opendir(lang_path);
- if (!lang_dir)
- continue;
-
- for_each_script(lang_dir, script_dirent, script_next) {
- script_root = strdup(script_dirent.d_name);
- str = ends_with(script_root, REPORT_SUFFIX);
- if (str) {
- *str = '\0';
- desc = script_desc__findnew(script_root);
- snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
- read_script_info(desc, script_path);
- }
- free(script_root);
- }
- }
-
- fprintf(stdout, "List of available trace scripts:\n");
- list_for_each_entry(desc, &script_descs, node) {
- sprintf(first_half, "%s %s", desc->name,
- desc->args ? desc->args : "");
- fprintf(stdout, " %-36s %s\n", first_half,
- desc->half_liner ? desc->half_liner : "");
- }
-
- exit(0);
-}
-
-static char *get_script_path(const char *script_root, const char *suffix)
-{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
- char scripts_path[MAXPATHLEN];
- char script_path[MAXPATHLEN];
- DIR *scripts_dir, *lang_dir;
- char lang_path[MAXPATHLEN];
- char *str, *__script_root;
- char *path = NULL;
-
- snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
-
- scripts_dir = opendir(scripts_path);
- if (!scripts_dir)
- return NULL;
-
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
- snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
- lang_dir = opendir(lang_path);
- if (!lang_dir)
- continue;
-
- for_each_script(lang_dir, script_dirent, script_next) {
- __script_root = strdup(script_dirent.d_name);
- str = ends_with(__script_root, suffix);
- if (str) {
- *str = '\0';
- if (strcmp(__script_root, script_root))
- continue;
- snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
- path = strdup(script_path);
- free(__script_root);
- break;
- }
- free(__script_root);
- }
- }
-
- return path;
-}
-
-static bool is_top_script(const char *script_path)
-{
- return ends_with((char *)script_path, "top") == NULL ? false : true;
-}
-
-static int has_required_arg(char *script_path)
-{
- struct script_desc *desc;
- int n_args = 0;
- char *p;
-
- desc = script_desc__new(NULL);
-
- if (read_script_info(desc, script_path))
- goto out;
-
- if (!desc->args)
- goto out;
-
- for (p = desc->args; *p; p++)
- if (*p == '<')
- n_args++;
-out:
- script_desc__delete(desc);
-
- return n_args;
-}
-
-static const char * const trace_usage[] = {
- "perf trace [<options>]",
- "perf trace [<options>] record <script> [<record-options>] <command>",
- "perf trace [<options>] report <script> [script-args]",
- "perf trace [<options>] <script> [<record-options>] <command>",
- "perf trace [<options>] <top-script> [script-args]",
- NULL
-};
-
-static const struct option options[] = {
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
- "dump raw trace in ASCII"),
- OPT_INCR('v', "verbose", &verbose,
- "be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('L', "Latency", &latency_format,
- "show latency attributes (irqs/preemption disabled, etc)"),
- OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
- list_available_scripts),
- OPT_CALLBACK('s', "script", NULL, "name",
- "script file name (lang:script name, script name, or *)",
- parse_scriptname),
- OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
- "generate perf-trace.xx script in specified language"),
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
- OPT_BOOLEAN('d', "debug-mode", &debug_mode,
- "do various checks like samples ordering and lost events"),
-
- OPT_END()
-};
-
-static bool have_cmd(int argc, const char **argv)
-{
- char **__argv = malloc(sizeof(const char *) * argc);
-
- if (!__argv)
- die("malloc");
- memcpy(__argv, argv, sizeof(const char *) * argc);
- argc = parse_options(argc, (const char **)__argv, record_options,
- NULL, PARSE_OPT_STOP_AT_NON_OPTION);
- free(__argv);
-
- return argc != 0;
-}
-
-int cmd_trace(int argc, const char **argv, const char *prefix __used)
-{
- char *rec_script_path = NULL;
- char *rep_script_path = NULL;
- struct perf_session *session;
- char *script_path = NULL;
- const char **__argv;
- bool system_wide;
- int i, j, err;
-
- setup_scripting();
-
- argc = parse_options(argc, argv, options, trace_usage,
- PARSE_OPT_STOP_AT_NON_OPTION);
-
- if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
- rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
- if (!rec_script_path)
- return cmd_record(argc, argv, NULL);
- }
-
- if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
- rep_script_path = get_script_path(argv[1], REPORT_SUFFIX);
- if (!rep_script_path) {
- fprintf(stderr,
- "Please specify a valid report script"
- "(see 'perf trace -l' for listing)\n");
- return -1;
- }
- }
-
- /* make sure PERF_EXEC_PATH is set for scripts */
- perf_set_argv_exec_path(perf_exec_path());
-
- if (argc && !script_name && !rec_script_path && !rep_script_path) {
- int live_pipe[2];
- int rep_args;
- pid_t pid;
-
- rec_script_path = get_script_path(argv[0], RECORD_SUFFIX);
- rep_script_path = get_script_path(argv[0], REPORT_SUFFIX);
-
- if (!rec_script_path && !rep_script_path) {
- fprintf(stderr, " Couldn't find script %s\n\n See perf"
- " trace -l for available scripts.\n", argv[0]);
- usage_with_options(trace_usage, options);
- }
-
- if (is_top_script(argv[0])) {
- rep_args = argc - 1;
- } else {
- int rec_args;
-
- rep_args = has_required_arg(rep_script_path);
- rec_args = (argc - 1) - rep_args;
- if (rec_args < 0) {
- fprintf(stderr, " %s script requires options."
- "\n\n See perf trace -l for available "
- "scripts and options.\n", argv[0]);
- usage_with_options(trace_usage, options);
- }
- }
-
- if (pipe(live_pipe) < 0) {
- perror("failed to create pipe");
- exit(-1);
- }
-
- pid = fork();
- if (pid < 0) {
- perror("failed to fork");
- exit(-1);
- }
-
- if (!pid) {
- system_wide = true;
- j = 0;
-
- dup2(live_pipe[1], 1);
- close(live_pipe[0]);
-
- if (!is_top_script(argv[0]))
- system_wide = !have_cmd(argc - rep_args,
- &argv[rep_args]);
-
- __argv = malloc((argc + 6) * sizeof(const char *));
- if (!__argv)
- die("malloc");
-
- __argv[j++] = "/bin/sh";
- __argv[j++] = rec_script_path;
- if (system_wide)
- __argv[j++] = "-a";
- __argv[j++] = "-q";
- __argv[j++] = "-o";
- __argv[j++] = "-";
- for (i = rep_args + 1; i < argc; i++)
- __argv[j++] = argv[i];
- __argv[j++] = NULL;
-
- execvp("/bin/sh", (char **)__argv);
- free(__argv);
- exit(-1);
- }
-
- dup2(live_pipe[0], 0);
- close(live_pipe[1]);
-
- __argv = malloc((argc + 4) * sizeof(const char *));
- if (!__argv)
- die("malloc");
- j = 0;
- __argv[j++] = "/bin/sh";
- __argv[j++] = rep_script_path;
- for (i = 1; i < rep_args + 1; i++)
- __argv[j++] = argv[i];
- __argv[j++] = "-i";
- __argv[j++] = "-";
- __argv[j++] = NULL;
-
- execvp("/bin/sh", (char **)__argv);
- free(__argv);
- exit(-1);
- }
-
- if (rec_script_path)
- script_path = rec_script_path;
- if (rep_script_path)
- script_path = rep_script_path;
-
- if (script_path) {
- system_wide = false;
- j = 0;
-
- if (rec_script_path)
- system_wide = !have_cmd(argc - 1, &argv[1]);
-
- __argv = malloc((argc + 2) * sizeof(const char *));
- if (!__argv)
- die("malloc");
- __argv[j++] = "/bin/sh";
- __argv[j++] = script_path;
- if (system_wide)
- __argv[j++] = "-a";
- for (i = 2; i < argc; i++)
- __argv[j++] = argv[i];
- __argv[j++] = NULL;
-
- execvp("/bin/sh", (char **)__argv);
- free(__argv);
- exit(-1);
- }
-
- if (symbol__init() < 0)
- return -1;
- if (!script_name)
- setup_pager();
-
- session = perf_session__new(input_name, O_RDONLY, 0, false);
- if (session == NULL)
- return -ENOMEM;
-
- if (strcmp(input_name, "-") &&
- !perf_session__has_traces(session, "record -R"))
- return -EINVAL;
-
- if (generate_script_lang) {
- struct stat perf_stat;
-
- int input = open(input_name, O_RDONLY);
- if (input < 0) {
- perror("failed to open file");
- exit(-1);
- }
-
- err = fstat(input, &perf_stat);
- if (err < 0) {
- perror("failed to stat file");
- exit(-1);
- }
-
- if (!perf_stat.st_size) {
- fprintf(stderr, "zero-sized file, nothing to do!\n");
- exit(0);
- }
-
- scripting_ops = script_spec__lookup(generate_script_lang);
- if (!scripting_ops) {
- fprintf(stderr, "invalid language specifier");
- return -1;
- }
-
- err = scripting_ops->generate_script("perf-trace");
- goto out;
- }
-
- if (script_name) {
- err = scripting_ops->start_script(script_name, argc, argv);
- if (err)
- goto out;
- pr_debug("perf trace started with script %s\n\n", script_name);
- }
-
- err = __cmd_trace(session);
-
- perf_session__delete(session);
- cleanup_scripting();
-out:
- return err;
-}
extern int cmd_stat(int argc, const char **argv, const char *prefix);
extern int cmd_timechart(int argc, const char **argv, const char *prefix);
extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_script(int argc, const char **argv, const char *prefix);
extern int cmd_version(int argc, const char **argv, const char *prefix);
extern int cmd_probe(int argc, const char **argv, const char *prefix);
extern int cmd_kmem(int argc, const char **argv, const char *prefix);
perf-stat mainporcelain common
perf-timechart mainporcelain common
perf-top mainporcelain common
-perf-trace mainporcelain common
+perf-script mainporcelain common
perf-probe mainporcelain common
perf-kmem mainporcelain common
perf-lock mainporcelain common
ifndef NO_DWARF
define SOURCE_DWARF
#include <dwarf.h>
-#include <libdw.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
#ifndef _ELFUTILS_PREREQ
#error
#endif
status = p->fn(argc, argv, prefix);
exit_browser(status);
+ perf_evsel_list__delete();
+
if (status)
return status & 0xff;
{ "top", cmd_top, 0 },
{ "annotate", cmd_annotate, 0 },
{ "version", cmd_version, 0 },
- { "trace", cmd_trace, 0 },
+ { "script", cmd_script, 0 },
{ "sched", cmd_sched, 0 },
{ "probe", cmd_probe, 0 },
{ "kmem", cmd_kmem, 0 },
#line 1 "Context.xs"
/*
- * Context.xs. XS interfaces for perf trace.
+ * Context.xs. XS interfaces for perf script.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
/*
- * Context.xs. XS interfaces for perf trace.
+ * Context.xs. XS interfaces for perf script.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
#include "perl.h"
#include "XSUB.h"
#include "../../../perf.h"
-#include "../../../util/trace-event.h"
+#include "../../../util/script-event.h"
MODULE = Perf::Trace::Context PACKAGE = Perf::Trace::Context
PROTOTYPES: ENABLE
Perf-Trace-Util version 0.01
============================
-This module contains utility functions for use with perf trace.
+This module contains utility functions for use with perf script.
Core.pm and Util.pm are pure Perl modules; Core.pm contains routines
that the core perf support for Perl calls on and should always be
INSTALLATION
-Building perf with perf trace Perl scripting should install this
+Building perf with perf script Perl scripting should install this
module in the right place.
You should make sure libperl and ExtUtils/Embed.pm are installed first
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
__END__
=head1 NAME
-Perf::Trace::Core - Perl extension for perf trace
+Perf::Trace::Core - Perl extension for perf script
=head1 SYNOPSIS
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
__END__
=head1 NAME
-Perf::Trace::Util - Perl extension for perf trace
+Perf::Trace::Util - Perl extension for perf script
=head1 SYNOPSIS
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
shift
fi
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm
fi
comm=$1
shift
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm
#!/bin/bash
# description: system-wide r/w activity
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl
interval=$1
shift
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval
#!/bin/bash
# description: system-wide min/max/avg wakeup latency
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl
#!/bin/bash
# description: workqueue stats (ins/exe/create/destroy)
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl
-
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/workqueue-stats.pl
-# perf trace event handlers, generated by perf trace -g perl
+# perf script event handlers, generated by perf script -g perl
# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
# Licensed under the terms of the GNU GPL License version 2
use Perf::Trace::Core;
use Perf::Trace::Util;
-my $usage = "perf trace -s rw-by-file.pl <comm>\n";
+my $usage = "perf script -s rw-by-file.pl <comm>\n";
my $for_comm = shift or die $usage;
# workqueue:workqueue_destruction -e workqueue:workqueue_execution
# -e workqueue:workqueue_insertion
#
-# perf trace -p -s tools/perf/scripts/perl/workqueue-stats.pl
+# perf script -p -s tools/perf/scripts/perl/workqueue-stats.pl
use 5.010000;
use strict;
/*
- * Context.c. Python interfaces for perf trace.
+ * Context.c. Python interfaces for perf script.
*
* Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
*
-# Core.py - Python extension for perf trace, core functions
+# Core.py - Python extension for perf script, core functions
#
# Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
#
-# SchedGui.py - Python extension for perf trace, basic GUI code for
+# SchedGui.py - Python extension for perf script, basic GUI code for
# traces drawing and overview.
#
# Copyright (C) 2010 by Frederic Weisbecker <fweisbec@gmail.com>
-# Util.py - Python extension for perf trace, miscellaneous utility code
+# Util.py - Python extension for perf script, miscellaneous utility code
#
# Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
#
shift
fi
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm
#!/bin/bash
# description: futext contention measurement
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py
# description: display a process of packet and processing time
# args: [tx] [rx] [dev=] [debug]
-perf trace -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@
+perf script -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@
#!/bin/bash
# description: sched migration overview
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py
interval=$1
shift
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval
shift
fi
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm
shift
fi
fi
-perf trace $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts.py $comm
-# perf trace event handlers, generated by perf trace -g python
+# perf script event handlers, generated by perf script -g python
# (c) 2010, Tom Zanussi <tzanussi@gmail.com>
# Licensed under the terms of the GNU GPL License version 2
#
from Core import *
from Util import *
-usage = "perf trace -s syscall-counts-by-pid.py [comm|pid]\n";
+usage = "perf script -s syscall-counts-by-pid.py [comm|pid]\n";
for_comm = None
for_pid = None
#
# Copyright (C) 2010 Frederic Weisbecker <fweisbec@gmail.com>
#
-# perf trace event handlers have been generated by perf trace -g python
+# perf script event handlers have been generated by perf script -g python
#
# This software is distributed under the terms of the GNU General
# Public License ("GPL") version 2 as published by the Free Software
from Core import *
from Util import *
-usage = "perf trace -s sctop.py [comm] [interval]\n";
+usage = "perf script -s sctop.py [comm] [interval]\n";
for_comm = None
default_interval = 3
from Core import *
from Util import syscall_name
-usage = "perf trace -s syscall-counts-by-pid.py [comm]\n";
+usage = "perf script -s syscall-counts-by-pid.py [comm]\n";
for_comm = None
for_pid = None
from Core import *
from Util import syscall_name
-usage = "perf trace -s syscall-counts.py [comm]\n";
+usage = "perf script -s syscall-counts.py [comm]\n";
for_comm = None
#include <linux/kernel.h>
#include "debug.h"
-static int build_id__mark_dso_hit(event_t *event, struct perf_session *session)
+static int build_id__mark_dso_hit(event_t *event,
+ struct sample_data *sample __used,
+ struct perf_session *session)
{
struct addr_location al;
u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
return 0;
}
-static int event__exit_del_thread(event_t *self, struct perf_session *session)
+static int event__exit_del_thread(event_t *self, struct sample_data *sample __used,
+ struct perf_session *session)
{
struct thread *thread = perf_session__findnew(session, self->fork.tid);
#include <assert.h>
#include <stdio.h>
-int cpumap[MAX_NR_CPUS];
-
-static int default_cpu_map(void)
+static struct cpu_map *cpu_map__default_new(void)
{
- int nr_cpus, i;
+ struct cpu_map *cpus;
+ int nr_cpus;
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
- assert(nr_cpus <= MAX_NR_CPUS);
- assert((int)nr_cpus >= 0);
+ if (nr_cpus < 0)
+ return NULL;
+
+ cpus = malloc(sizeof(*cpus) + nr_cpus * sizeof(int));
+ if (cpus != NULL) {
+ int i;
+ for (i = 0; i < nr_cpus; ++i)
+ cpus->map[i] = i;
- for (i = 0; i < nr_cpus; ++i)
- cpumap[i] = i;
+ cpus->nr = nr_cpus;
+ }
- return nr_cpus;
+ return cpus;
}
-static int read_all_cpu_map(void)
+static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus)
{
+ size_t payload_size = nr_cpus * sizeof(int);
+ struct cpu_map *cpus = malloc(sizeof(*cpus) + payload_size);
+
+ if (cpus != NULL) {
+ cpus->nr = nr_cpus;
+ memcpy(cpus->map, tmp_cpus, payload_size);
+ }
+
+ return cpus;
+}
+
+static struct cpu_map *cpu_map__read_all_cpu_map(void)
+{
+ struct cpu_map *cpus = NULL;
FILE *onlnf;
int nr_cpus = 0;
+ int *tmp_cpus = NULL, *tmp;
+ int max_entries = 0;
int n, cpu, prev;
char sep;
onlnf = fopen("/sys/devices/system/cpu/online", "r");
if (!onlnf)
- return default_cpu_map();
+ return cpu_map__default_new();
sep = 0;
prev = -1;
if (n <= 0)
break;
if (prev >= 0) {
- assert(nr_cpus + cpu - prev - 1 < MAX_NR_CPUS);
+ int new_max = nr_cpus + cpu - prev - 1;
+
+ if (new_max >= max_entries) {
+ max_entries = new_max + MAX_NR_CPUS / 2;
+ tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+ if (tmp == NULL)
+ goto out_free_tmp;
+ tmp_cpus = tmp;
+ }
+
while (++prev < cpu)
- cpumap[nr_cpus++] = prev;
+ tmp_cpus[nr_cpus++] = prev;
+ }
+ if (nr_cpus == max_entries) {
+ max_entries += MAX_NR_CPUS;
+ tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+ if (tmp == NULL)
+ goto out_free_tmp;
+ tmp_cpus = tmp;
}
- assert (nr_cpus < MAX_NR_CPUS);
- cpumap[nr_cpus++] = cpu;
+
+ tmp_cpus[nr_cpus++] = cpu;
if (n == 2 && sep == '-')
prev = cpu;
else
if (n == 1 || sep == '\n')
break;
}
- fclose(onlnf);
- if (nr_cpus > 0)
- return nr_cpus;
- return default_cpu_map();
+ if (nr_cpus > 0)
+ cpus = cpu_map__trim_new(nr_cpus, tmp_cpus);
+ else
+ cpus = cpu_map__default_new();
+out_free_tmp:
+ free(tmp_cpus);
+ fclose(onlnf);
+ return cpus;
}
-int read_cpu_map(const char *cpu_list)
+struct cpu_map *cpu_map__new(const char *cpu_list)
{
+ struct cpu_map *cpus = NULL;
unsigned long start_cpu, end_cpu = 0;
char *p = NULL;
int i, nr_cpus = 0;
+ int *tmp_cpus = NULL, *tmp;
+ int max_entries = 0;
if (!cpu_list)
- return read_all_cpu_map();
+ return cpu_map__read_all_cpu_map();
if (!isdigit(*cpu_list))
- goto invalid;
+ goto out;
while (isdigit(*cpu_list)) {
p = NULL;
for (; start_cpu <= end_cpu; start_cpu++) {
/* check for duplicates */
for (i = 0; i < nr_cpus; i++)
- if (cpumap[i] == (int)start_cpu)
+ if (tmp_cpus[i] == (int)start_cpu)
goto invalid;
- assert(nr_cpus < MAX_NR_CPUS);
- cpumap[nr_cpus++] = (int)start_cpu;
+ if (nr_cpus == max_entries) {
+ max_entries += MAX_NR_CPUS;
+ tmp = realloc(tmp_cpus, max_entries * sizeof(int));
+ if (tmp == NULL)
+ goto invalid;
+ tmp_cpus = tmp;
+ }
+ tmp_cpus[nr_cpus++] = (int)start_cpu;
}
if (*p)
++p;
cpu_list = p;
}
- if (nr_cpus > 0)
- return nr_cpus;
- return default_cpu_map();
+ if (nr_cpus > 0)
+ cpus = cpu_map__trim_new(nr_cpus, tmp_cpus);
+ else
+ cpus = cpu_map__default_new();
invalid:
- return -1;
+ free(tmp_cpus);
+out:
+ return cpus;
+}
+
+struct cpu_map *cpu_map__dummy_new(void)
+{
+ struct cpu_map *cpus = malloc(sizeof(*cpus) + sizeof(int));
+
+ if (cpus != NULL) {
+ cpus->nr = 1;
+ cpus->map[0] = -1;
+ }
+
+ return cpus;
}
#ifndef __PERF_CPUMAP_H
#define __PERF_CPUMAP_H
-extern int read_cpu_map(const char *cpu_list);
-extern int cpumap[];
+struct cpu_map {
+ int nr;
+ int map[];
+};
+
+struct cpu_map *cpu_map__new(const char *cpu_list);
+struct cpu_map *cpu_map__dummy_new(void);
+void *cpu_map__delete(struct cpu_map *map);
#endif /* __PERF_CPUMAP_H */
return ret;
}
-static int dump_printf_color(const char *fmt, const char *color, ...)
+#ifdef NO_NEWT_SUPPORT
+void ui__warning(const char *format, ...)
{
va_list args;
- int ret = 0;
- if (dump_trace) {
- va_start(args, color);
- ret = color_vfprintf(stdout, color, fmt, args);
- va_end(args);
- }
-
- return ret;
+ va_start(args, format);
+ vfprintf(stderr, format, args);
+ va_end(args);
}
-
+#endif
void trace_event(event_t *event)
{
if (!dump_trace)
return;
- dump_printf(".");
- dump_printf_color("\n. ... raw event: size %d bytes\n", color,
- event->header.size);
+ printf(".");
+ color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n",
+ event->header.size);
for (i = 0; i < event->header.size; i++) {
if ((i & 15) == 0) {
- dump_printf(".");
- dump_printf_color(" %04x: ", color, i);
+ printf(".");
+ color_fprintf(stdout, color, " %04x: ", i);
}
- dump_printf_color(" %02x", color, raw_event[i]);
+ color_fprintf(stdout, color, " %02x", raw_event[i]);
if (((i & 15) == 15) || i == event->header.size-1) {
- dump_printf_color(" ", color);
+ color_fprintf(stdout, color, " ");
for (j = 0; j < 15-(i & 15); j++)
- dump_printf_color(" ", color);
+ color_fprintf(stdout, color, " ");
for (j = i & ~15; j <= i; j++) {
- dump_printf_color("%c", color,
- isprint(raw_event[j]) ?
- raw_event[j] : '.');
+ color_fprintf(stdout, color, "%c",
+ isprint(raw_event[j]) ?
+ raw_event[j] : '.');
}
- dump_printf_color("\n", color);
+ color_fprintf(stdout, color, "\n");
}
}
- dump_printf(".\n");
+ printf(".\n");
}
#include "ui/progress.h"
#endif
+void ui__warning(const char *format, ...) __attribute__((format(printf, 1, 2)));
+
#endif /* __PERF_DEBUG_H */
#include "strlist.h"
#include "thread.h"
-const char *event__name[] = {
+static const char *event__name[] = {
[0] = "TOTAL",
[PERF_RECORD_MMAP] = "MMAP",
[PERF_RECORD_LOST] = "LOST",
[PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE",
[PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA",
[PERF_RECORD_HEADER_BUILD_ID] = "BUILD_ID",
+ [PERF_RECORD_FINISHED_ROUND] = "FINISHED_ROUND",
};
-static pid_t event__synthesize_comm(pid_t pid, int full,
+const char *event__get_event_name(unsigned int id)
+{
+ if (id >= ARRAY_SIZE(event__name))
+ return "INVALID";
+ if (!event__name[id])
+ return "UNKNOWN";
+ return event__name[id];
+}
+
+static struct sample_data synth_sample = {
+ .pid = -1,
+ .tid = -1,
+ .time = -1,
+ .stream_id = -1,
+ .cpu = -1,
+ .period = 1,
+};
+
+static pid_t event__synthesize_comm(event_t *event, pid_t pid, int full,
event__handler_t process,
struct perf_session *session)
{
- event_t ev;
char filename[PATH_MAX];
char bf[BUFSIZ];
FILE *fp;
return 0;
}
- memset(&ev.comm, 0, sizeof(ev.comm));
- while (!ev.comm.comm[0] || !ev.comm.pid) {
- if (fgets(bf, sizeof(bf), fp) == NULL)
- goto out_failure;
+ memset(&event->comm, 0, sizeof(event->comm));
+
+ while (!event->comm.comm[0] || !event->comm.pid) {
+ if (fgets(bf, sizeof(bf), fp) == NULL) {
+ pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
+ goto out;
+ }
if (memcmp(bf, "Name:", 5) == 0) {
char *name = bf + 5;
while (*name && isspace(*name))
++name;
size = strlen(name) - 1;
- memcpy(ev.comm.comm, name, size++);
+ memcpy(event->comm.comm, name, size++);
} else if (memcmp(bf, "Tgid:", 5) == 0) {
char *tgids = bf + 5;
while (*tgids && isspace(*tgids))
++tgids;
- tgid = ev.comm.pid = atoi(tgids);
+ tgid = event->comm.pid = atoi(tgids);
}
}
- ev.comm.header.type = PERF_RECORD_COMM;
+ event->comm.header.type = PERF_RECORD_COMM;
size = ALIGN(size, sizeof(u64));
- ev.comm.header.size = sizeof(ev.comm) - (sizeof(ev.comm.comm) - size);
-
+ memset(event->comm.comm + size, 0, session->id_hdr_size);
+ event->comm.header.size = (sizeof(event->comm) -
+ (sizeof(event->comm.comm) - size) +
+ session->id_hdr_size);
if (!full) {
- ev.comm.tid = pid;
+ event->comm.tid = pid;
- process(&ev, session);
- goto out_fclose;
+ process(event, &synth_sample, session);
+ goto out;
}
snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
if (*end)
continue;
- ev.comm.tid = pid;
+ event->comm.tid = pid;
- process(&ev, session);
+ process(event, &synth_sample, session);
}
- closedir(tasks);
-out_fclose:
+ closedir(tasks);
+out:
fclose(fp);
- return tgid;
-out_failure:
- pr_warning("couldn't get COMM and pgid, malformed %s\n", filename);
- return -1;
+ return tgid;
}
-static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
+static int event__synthesize_mmap_events(event_t *event, pid_t pid, pid_t tgid,
event__handler_t process,
struct perf_session *session)
{
return -1;
}
+ event->header.type = PERF_RECORD_MMAP;
+ /*
+ * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c
+ */
+ event->header.misc = PERF_RECORD_MISC_USER;
+
while (1) {
char bf[BUFSIZ], *pbf = bf;
- event_t ev = {
- .header = {
- .type = PERF_RECORD_MMAP,
- /*
- * Just like the kernel, see __perf_event_mmap
- * in kernel/perf_event.c
- */
- .misc = PERF_RECORD_MISC_USER,
- },
- };
int n;
size_t size;
if (fgets(bf, sizeof(bf), fp) == NULL)
break;
/* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */
- n = hex2u64(pbf, &ev.mmap.start);
+ n = hex2u64(pbf, &event->mmap.start);
if (n < 0)
continue;
pbf += n + 1;
- n = hex2u64(pbf, &ev.mmap.len);
+ n = hex2u64(pbf, &event->mmap.len);
if (n < 0)
continue;
pbf += n + 3;
continue;
pbf += 3;
- n = hex2u64(pbf, &ev.mmap.pgoff);
+ n = hex2u64(pbf, &event->mmap.pgoff);
size = strlen(execname);
execname[size - 1] = '\0'; /* Remove \n */
- memcpy(ev.mmap.filename, execname, size);
+ memcpy(event->mmap.filename, execname, size);
size = ALIGN(size, sizeof(u64));
- ev.mmap.len -= ev.mmap.start;
- ev.mmap.header.size = (sizeof(ev.mmap) -
- (sizeof(ev.mmap.filename) - size));
- ev.mmap.pid = tgid;
- ev.mmap.tid = pid;
-
- process(&ev, session);
+ event->mmap.len -= event->mmap.start;
+ event->mmap.header.size = (sizeof(event->mmap) -
+ (sizeof(event->mmap.filename) - size));
+ memset(event->mmap.filename + size, 0, session->id_hdr_size);
+ event->mmap.header.size += session->id_hdr_size;
+ event->mmap.pid = tgid;
+ event->mmap.tid = pid;
+
+ process(event, &synth_sample, session);
}
}
{
struct rb_node *nd;
struct map_groups *kmaps = &machine->kmaps;
- u16 misc;
+ event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
+
+ if (event == NULL) {
+ pr_debug("Not enough memory synthesizing mmap event "
+ "for kernel modules\n");
+ return -1;
+ }
+
+ event->header.type = PERF_RECORD_MMAP;
/*
* kernel uses 0 for user space maps, see kernel/perf_event.c
* __perf_event_mmap
*/
if (machine__is_host(machine))
- misc = PERF_RECORD_MISC_KERNEL;
+ event->header.misc = PERF_RECORD_MISC_KERNEL;
else
- misc = PERF_RECORD_MISC_GUEST_KERNEL;
+ event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]);
nd; nd = rb_next(nd)) {
- event_t ev;
size_t size;
struct map *pos = rb_entry(nd, struct map, rb_node);
continue;
size = ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
- memset(&ev, 0, sizeof(ev));
- ev.mmap.header.misc = misc;
- ev.mmap.header.type = PERF_RECORD_MMAP;
- ev.mmap.header.size = (sizeof(ev.mmap) -
- (sizeof(ev.mmap.filename) - size));
- ev.mmap.start = pos->start;
- ev.mmap.len = pos->end - pos->start;
- ev.mmap.pid = machine->pid;
-
- memcpy(ev.mmap.filename, pos->dso->long_name,
+ event->mmap.header.type = PERF_RECORD_MMAP;
+ event->mmap.header.size = (sizeof(event->mmap) -
+ (sizeof(event->mmap.filename) - size));
+ memset(event->mmap.filename + size, 0, session->id_hdr_size);
+ event->mmap.header.size += session->id_hdr_size;
+ event->mmap.start = pos->start;
+ event->mmap.len = pos->end - pos->start;
+ event->mmap.pid = machine->pid;
+
+ memcpy(event->mmap.filename, pos->dso->long_name,
pos->dso->long_name_len + 1);
- process(&ev, session);
+ process(event, &synth_sample, session);
}
+ free(event);
return 0;
}
-int event__synthesize_thread(pid_t pid, event__handler_t process,
- struct perf_session *session)
+static int __event__synthesize_thread(event_t *comm_event, event_t *mmap_event,
+ pid_t pid, event__handler_t process,
+ struct perf_session *session)
{
- pid_t tgid = event__synthesize_comm(pid, 1, process, session);
+ pid_t tgid = event__synthesize_comm(comm_event, pid, 1, process,
+ session);
if (tgid == -1)
return -1;
- return event__synthesize_mmap_events(pid, tgid, process, session);
+ return event__synthesize_mmap_events(mmap_event, pid, tgid,
+ process, session);
+}
+
+int event__synthesize_thread(pid_t pid, event__handler_t process,
+ struct perf_session *session)
+{
+ event_t *comm_event, *mmap_event;
+ int err = -1;
+
+ comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+ if (comm_event == NULL)
+ goto out;
+
+ mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+ if (mmap_event == NULL)
+ goto out_free_comm;
+
+ err = __event__synthesize_thread(comm_event, mmap_event, pid,
+ process, session);
+ free(mmap_event);
+out_free_comm:
+ free(comm_event);
+out:
+ return err;
}
-void event__synthesize_threads(event__handler_t process,
- struct perf_session *session)
+int event__synthesize_threads(event__handler_t process,
+ struct perf_session *session)
{
DIR *proc;
struct dirent dirent, *next;
+ event_t *comm_event, *mmap_event;
+ int err = -1;
+
+ comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size);
+ if (comm_event == NULL)
+ goto out;
+
+ mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size);
+ if (mmap_event == NULL)
+ goto out_free_comm;
proc = opendir("/proc");
+ if (proc == NULL)
+ goto out_free_mmap;
while (!readdir_r(proc, &dirent, &next) && next) {
char *end;
if (*end) /* only interested in proper numerical dirents */
continue;
- event__synthesize_thread(pid, process, session);
+ __event__synthesize_thread(comm_event, mmap_event, pid,
+ process, session);
}
closedir(proc);
+ err = 0;
+out_free_mmap:
+ free(mmap_event);
+out_free_comm:
+ free(comm_event);
+out:
+ return err;
}
struct process_symbol_args {
u64 start;
};
-static int find_symbol_cb(void *arg, const char *name, char type, u64 start)
+static int find_symbol_cb(void *arg, const char *name, char type,
+ u64 start, u64 end __used)
{
struct process_symbol_args *args = arg;
char path[PATH_MAX];
char name_buff[PATH_MAX];
struct map *map;
-
- event_t ev = {
- .header = {
- .type = PERF_RECORD_MMAP,
- },
- };
+ int err;
/*
* We should get this from /sys/kernel/sections/.text, but till that is
* available use this, and after it is use this as a fallback for older
* kernels.
*/
struct process_symbol_args args = { .name = symbol_name, };
+ event_t *event = zalloc(sizeof(event->mmap) + session->id_hdr_size);
+
+ if (event == NULL) {
+ pr_debug("Not enough memory synthesizing mmap event "
+ "for kernel modules\n");
+ return -1;
+ }
mmap_name = machine__mmap_name(machine, name_buff, sizeof(name_buff));
if (machine__is_host(machine)) {
* kernel uses PERF_RECORD_MISC_USER for user space maps,
* see kernel/perf_event.c __perf_event_mmap
*/
- ev.header.misc = PERF_RECORD_MISC_KERNEL;
+ event->header.misc = PERF_RECORD_MISC_KERNEL;
filename = "/proc/kallsyms";
} else {
- ev.header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+ event->header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
if (machine__is_default_guest(machine))
filename = (char *) symbol_conf.default_guest_kallsyms;
else {
return -ENOENT;
map = machine->vmlinux_maps[MAP__FUNCTION];
- size = snprintf(ev.mmap.filename, sizeof(ev.mmap.filename),
+ size = snprintf(event->mmap.filename, sizeof(event->mmap.filename),
"%s%s", mmap_name, symbol_name) + 1;
size = ALIGN(size, sizeof(u64));
- ev.mmap.header.size = (sizeof(ev.mmap) -
- (sizeof(ev.mmap.filename) - size));
- ev.mmap.pgoff = args.start;
- ev.mmap.start = map->start;
- ev.mmap.len = map->end - ev.mmap.start;
- ev.mmap.pid = machine->pid;
-
- return process(&ev, session);
+ event->mmap.header.type = PERF_RECORD_MMAP;
+ event->mmap.header.size = (sizeof(event->mmap) -
+ (sizeof(event->mmap.filename) - size) + session->id_hdr_size);
+ event->mmap.pgoff = args.start;
+ event->mmap.start = map->start;
+ event->mmap.len = map->end - event->mmap.start;
+ event->mmap.pid = machine->pid;
+
+ err = process(event, &synth_sample, session);
+ free(event);
+
+ return err;
}
static void thread__comm_adjust(struct thread *self, struct hists *hists)
return 0;
}
-int event__process_comm(event_t *self, struct perf_session *session)
+int event__process_comm(event_t *self, struct sample_data *sample __used,
+ struct perf_session *session)
{
struct thread *thread = perf_session__findnew(session, self->comm.tid);
return 0;
}
-int event__process_lost(event_t *self, struct perf_session *session)
+int event__process_lost(event_t *self, struct sample_data *sample __used,
+ struct perf_session *session)
{
dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost);
session->hists.stats.total_lost += self->lost.lost;
* a zero sized synthesized MMAP event for the kernel.
*/
if (maps[MAP__FUNCTION]->end == 0)
- maps[MAP__FUNCTION]->end = ~0UL;
+ maps[MAP__FUNCTION]->end = ~0ULL;
}
static int event__process_kernel_mmap(event_t *self,
return -1;
}
-int event__process_mmap(event_t *self, struct perf_session *session)
+int event__process_mmap(event_t *self, struct sample_data *sample __used,
+ struct perf_session *session)
{
struct machine *machine;
struct thread *thread;
return 0;
}
-int event__process_task(event_t *self, struct perf_session *session)
+int event__process_task(event_t *self, struct sample_data *sample __used,
+ struct perf_session *session)
{
struct thread *thread = perf_session__findnew(session, self->fork.tid);
struct thread *parent = perf_session__findnew(session, self->fork.ptid);
return 0;
}
-int event__process(event_t *event, struct perf_session *session)
+int event__process(event_t *event, struct sample_data *sample,
+ struct perf_session *session)
{
switch (event->header.type) {
case PERF_RECORD_COMM:
- event__process_comm(event, session);
+ event__process_comm(event, sample, session);
break;
case PERF_RECORD_MMAP:
- event__process_mmap(event, session);
+ event__process_mmap(event, sample, session);
break;
case PERF_RECORD_FORK:
case PERF_RECORD_EXIT:
- event__process_task(event, session);
+ event__process_task(event, sample, session);
break;
default:
break;
symbol_filter_t filter)
{
u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
- struct thread *thread;
-
- event__parse_sample(self, session->sample_type, data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld cpu:%d\n",
- self->header.misc, data->pid, data->tid, data->ip,
- data->period, data->cpu);
-
- if (session->sample_type & PERF_SAMPLE_CALLCHAIN) {
- unsigned int i;
-
- dump_printf("... chain: nr:%Lu\n", data->callchain->nr);
+ struct thread *thread = perf_session__findnew(session, self->ip.pid);
- if (!ip_callchain__valid(data->callchain, self)) {
- pr_debug("call-chain problem with event, "
- "skipping it.\n");
- goto out_filtered;
- }
-
- if (dump_trace) {
- for (i = 0; i < data->callchain->nr; i++)
- dump_printf("..... %2d: %016Lx\n",
- i, data->callchain->ips[i]);
- }
- }
- thread = perf_session__findnew(session, self->ip.pid);
if (thread == NULL)
return -1;
return 0;
}
-int event__parse_sample(const event_t *event, u64 type, struct sample_data *data)
+static int event__parse_id_sample(const event_t *event,
+ struct perf_session *session,
+ struct sample_data *sample)
{
- const u64 *array = event->sample.array;
+ const u64 *array;
+ u64 type;
+
+ sample->cpu = sample->pid = sample->tid = -1;
+ sample->stream_id = sample->id = sample->time = -1ULL;
+
+ if (!session->sample_id_all)
+ return 0;
+
+ array = event->sample.array;
+ array += ((event->header.size -
+ sizeof(event->header)) / sizeof(u64)) - 1;
+ type = session->sample_type;
+
+ if (type & PERF_SAMPLE_CPU) {
+ u32 *p = (u32 *)array;
+ sample->cpu = *p;
+ array--;
+ }
+
+ if (type & PERF_SAMPLE_STREAM_ID) {
+ sample->stream_id = *array;
+ array--;
+ }
+
+ if (type & PERF_SAMPLE_ID) {
+ sample->id = *array;
+ array--;
+ }
+
+ if (type & PERF_SAMPLE_TIME) {
+ sample->time = *array;
+ array--;
+ }
+
+ if (type & PERF_SAMPLE_TID) {
+ u32 *p = (u32 *)array;
+ sample->pid = p[0];
+ sample->tid = p[1];
+ }
+
+ return 0;
+}
+
+int event__parse_sample(const event_t *event, struct perf_session *session,
+ struct sample_data *data)
+{
+ const u64 *array;
+ u64 type;
+
+ if (event->header.type != PERF_RECORD_SAMPLE)
+ return event__parse_id_sample(event, session, data);
+
+ array = event->sample.array;
+ type = session->sample_type;
if (type & PERF_SAMPLE_IP) {
data->ip = event->ip.ip;
};
enum perf_user_event_type { /* above any possible kernel type */
+ PERF_RECORD_USER_TYPE_START = 64,
PERF_RECORD_HEADER_ATTR = 64,
PERF_RECORD_HEADER_EVENT_TYPE = 65,
PERF_RECORD_HEADER_TRACING_DATA = 66,
struct perf_session;
-typedef int (*event__handler_t)(event_t *event, struct perf_session *session);
+typedef int (*event__handler_synth_t)(event_t *event,
+ struct perf_session *session);
+typedef int (*event__handler_t)(event_t *event, struct sample_data *sample,
+ struct perf_session *session);
int event__synthesize_thread(pid_t pid, event__handler_t process,
struct perf_session *session);
-void event__synthesize_threads(event__handler_t process,
- struct perf_session *session);
+int event__synthesize_threads(event__handler_t process,
+ struct perf_session *session);
int event__synthesize_kernel_mmap(event__handler_t process,
struct perf_session *session,
struct machine *machine,
struct perf_session *session,
struct machine *machine);
-int event__process_comm(event_t *self, struct perf_session *session);
-int event__process_lost(event_t *self, struct perf_session *session);
-int event__process_mmap(event_t *self, struct perf_session *session);
-int event__process_task(event_t *self, struct perf_session *session);
-int event__process(event_t *event, struct perf_session *session);
+int event__process_comm(event_t *self, struct sample_data *sample,
+ struct perf_session *session);
+int event__process_lost(event_t *self, struct sample_data *sample,
+ struct perf_session *session);
+int event__process_mmap(event_t *self, struct sample_data *sample,
+ struct perf_session *session);
+int event__process_task(event_t *self, struct sample_data *sample,
+ struct perf_session *session);
+int event__process(event_t *event, struct sample_data *sample,
+ struct perf_session *session);
struct addr_location;
int event__preprocess_sample(const event_t *self, struct perf_session *session,
struct addr_location *al, struct sample_data *data,
symbol_filter_t filter);
-int event__parse_sample(const event_t *event, u64 type, struct sample_data *data);
+int event__parse_sample(const event_t *event, struct perf_session *session,
+ struct sample_data *sample);
-extern const char *event__name[];
+const char *event__get_event_name(unsigned int id);
#endif /* __PERF_RECORD_H */
--- /dev/null
+#include "evsel.h"
+#include "../perf.h"
+#include "util.h"
+#include "cpumap.h"
+#include "thread.h"
+
+#define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
+
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx)
+{
+ struct perf_evsel *evsel = zalloc(sizeof(*evsel));
+
+ if (evsel != NULL) {
+ evsel->idx = idx;
+ evsel->attr.type = type;
+ evsel->attr.config = config;
+ INIT_LIST_HEAD(&evsel->node);
+ }
+
+ return evsel;
+}
+
+int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+ evsel->fd = xyarray__new(ncpus, nthreads, sizeof(int));
+ return evsel->fd != NULL ? 0 : -ENOMEM;
+}
+
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus)
+{
+ evsel->counts = zalloc((sizeof(*evsel->counts) +
+ (ncpus * sizeof(struct perf_counts_values))));
+ return evsel->counts != NULL ? 0 : -ENOMEM;
+}
+
+void perf_evsel__free_fd(struct perf_evsel *evsel)
+{
+ xyarray__delete(evsel->fd);
+ evsel->fd = NULL;
+}
+
+void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads)
+{
+ int cpu, thread;
+
+ for (cpu = 0; cpu < ncpus; cpu++)
+ for (thread = 0; thread < nthreads; ++thread) {
+ close(FD(evsel, cpu, thread));
+ FD(evsel, cpu, thread) = -1;
+ }
+}
+
+void perf_evsel__delete(struct perf_evsel *evsel)
+{
+ assert(list_empty(&evsel->node));
+ xyarray__delete(evsel->fd);
+ free(evsel);
+}
+
+int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+ int cpu, int thread, bool scale)
+{
+ struct perf_counts_values count;
+ size_t nv = scale ? 3 : 1;
+
+ if (FD(evsel, cpu, thread) < 0)
+ return -EINVAL;
+
+ if (evsel->counts == NULL && perf_evsel__alloc_counts(evsel, cpu + 1) < 0)
+ return -ENOMEM;
+
+ if (readn(FD(evsel, cpu, thread), &count, nv * sizeof(u64)) < 0)
+ return -errno;
+
+ if (scale) {
+ if (count.run == 0)
+ count.val = 0;
+ else if (count.run < count.ena)
+ count.val = (u64)((double)count.val * count.ena / count.run + 0.5);
+ } else
+ count.ena = count.run = 0;
+
+ evsel->counts->cpu[cpu] = count;
+ return 0;
+}
+
+int __perf_evsel__read(struct perf_evsel *evsel,
+ int ncpus, int nthreads, bool scale)
+{
+ size_t nv = scale ? 3 : 1;
+ int cpu, thread;
+ struct perf_counts_values *aggr = &evsel->counts->aggr, count;
+
+ aggr->val = 0;
+
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ for (thread = 0; thread < nthreads; thread++) {
+ if (FD(evsel, cpu, thread) < 0)
+ continue;
+
+ if (readn(FD(evsel, cpu, thread),
+ &count, nv * sizeof(u64)) < 0)
+ return -errno;
+
+ aggr->val += count.val;
+ if (scale) {
+ aggr->ena += count.ena;
+ aggr->run += count.run;
+ }
+ }
+ }
+
+ evsel->counts->scaled = 0;
+ if (scale) {
+ if (aggr->run == 0) {
+ evsel->counts->scaled = -1;
+ aggr->val = 0;
+ return 0;
+ }
+
+ if (aggr->run < aggr->ena) {
+ evsel->counts->scaled = 1;
+ aggr->val = (u64)((double)aggr->val * aggr->ena / aggr->run + 0.5);
+ }
+ } else
+ aggr->ena = aggr->run = 0;
+
+ return 0;
+}
+
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus)
+{
+ int cpu;
+
+ if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, cpus->nr, 1) < 0)
+ return -1;
+
+ for (cpu = 0; cpu < cpus->nr; cpu++) {
+ FD(evsel, cpu, 0) = sys_perf_event_open(&evsel->attr, -1,
+ cpus->map[cpu], -1, 0);
+ if (FD(evsel, cpu, 0) < 0)
+ goto out_close;
+ }
+
+ return 0;
+
+out_close:
+ while (--cpu >= 0) {
+ close(FD(evsel, cpu, 0));
+ FD(evsel, cpu, 0) = -1;
+ }
+ return -1;
+}
+
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads)
+{
+ int thread;
+
+ if (evsel->fd == NULL && perf_evsel__alloc_fd(evsel, 1, threads->nr))
+ return -1;
+
+ for (thread = 0; thread < threads->nr; thread++) {
+ FD(evsel, 0, thread) = sys_perf_event_open(&evsel->attr,
+ threads->map[thread], -1, -1, 0);
+ if (FD(evsel, 0, thread) < 0)
+ goto out_close;
+ }
+
+ return 0;
+
+out_close:
+ while (--thread >= 0) {
+ close(FD(evsel, 0, thread));
+ FD(evsel, 0, thread) = -1;
+ }
+ return -1;
+}
+
+int perf_evsel__open(struct perf_evsel *evsel,
+ struct cpu_map *cpus, struct thread_map *threads)
+{
+ if (threads == NULL)
+ return perf_evsel__open_per_cpu(evsel, cpus);
+
+ return perf_evsel__open_per_thread(evsel, threads);
+}
--- /dev/null
+#ifndef __PERF_EVSEL_H
+#define __PERF_EVSEL_H 1
+
+#include <linux/list.h>
+#include <stdbool.h>
+#include "../../../include/linux/perf_event.h"
+#include "types.h"
+#include "xyarray.h"
+
+struct perf_counts_values {
+ union {
+ struct {
+ u64 val;
+ u64 ena;
+ u64 run;
+ };
+ u64 values[3];
+ };
+};
+
+struct perf_counts {
+ s8 scaled;
+ struct perf_counts_values aggr;
+ struct perf_counts_values cpu[];
+};
+
+struct perf_evsel {
+ struct list_head node;
+ struct perf_event_attr attr;
+ char *filter;
+ struct xyarray *fd;
+ struct perf_counts *counts;
+ int idx;
+ void *priv;
+};
+
+struct cpu_map;
+struct thread_map;
+
+struct perf_evsel *perf_evsel__new(u32 type, u64 config, int idx);
+void perf_evsel__delete(struct perf_evsel *evsel);
+
+int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus);
+void perf_evsel__free_fd(struct perf_evsel *evsel);
+void perf_evsel__close_fd(struct perf_evsel *evsel, int ncpus, int nthreads);
+
+int perf_evsel__open_per_cpu(struct perf_evsel *evsel, struct cpu_map *cpus);
+int perf_evsel__open_per_thread(struct perf_evsel *evsel, struct thread_map *threads);
+int perf_evsel__open(struct perf_evsel *evsel,
+ struct cpu_map *cpus, struct thread_map *threads);
+
+#define perf_evsel__match(evsel, t, c) \
+ (evsel->attr.type == PERF_TYPE_##t && \
+ evsel->attr.config == PERF_COUNT_##c)
+
+int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+ int cpu, int thread, bool scale);
+
+/**
+ * perf_evsel__read_on_cpu - Read out the results on a CPU and thread
+ *
+ * @evsel - event selector to read value
+ * @cpu - CPU of interest
+ * @thread - thread of interest
+ */
+static inline int perf_evsel__read_on_cpu(struct perf_evsel *evsel,
+ int cpu, int thread)
+{
+ return __perf_evsel__read_on_cpu(evsel, cpu, thread, false);
+}
+
+/**
+ * perf_evsel__read_on_cpu_scaled - Read out the results on a CPU and thread, scaled
+ *
+ * @evsel - event selector to read value
+ * @cpu - CPU of interest
+ * @thread - thread of interest
+ */
+static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
+ int cpu, int thread)
+{
+ return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
+}
+
+int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
+ bool scale);
+
+/**
+ * perf_evsel__read - Read the aggregate results on all CPUs
+ *
+ * @evsel - event selector to read value
+ * @ncpus - Number of cpus affected, from zero
+ * @nthreads - Number of threads affected, from zero
+ */
+static inline int perf_evsel__read(struct perf_evsel *evsel,
+ int ncpus, int nthreads)
+{
+ return __perf_evsel__read(evsel, ncpus, nthreads, false);
+}
+
+/**
+ * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
+ *
+ * @evsel - event selector to read value
+ * @ncpus - Number of cpus affected, from zero
+ * @nthreads - Number of threads affected, from zero
+ */
+static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
+ int ncpus, int nthreads)
+{
+ return __perf_evsel__read(evsel, ncpus, nthreads, true);
+}
+
+#endif /* __PERF_EVSEL_H */
set_bit(feat, self->adds_features);
}
+void perf_header__clear_feat(struct perf_header *self, int feat)
+{
+ clear_bit(feat, self->adds_features);
+}
+
bool perf_header__has_feat(const struct perf_header *self, int feat)
{
return test_bit(feat, self->adds_features);
int idx = 0, err;
session = container_of(self, struct perf_session, header);
- if (perf_session__read_build_ids(session, true))
- perf_header__set_feat(self, HEADER_BUILD_ID);
+
+ if (perf_header__has_feat(self, HEADER_BUILD_ID &&
+ !perf_session__read_build_ids(session, true)))
+ perf_header__clear_feat(self, HEADER_BUILD_ID);
nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
if (!nr_sections)
/* Write trace info */
trace_sec->offset = lseek(fd, 0, SEEK_CUR);
- read_tracing_data(fd, attrs, nr_counters);
+ read_tracing_data(fd, &evsel_list);
trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
}
static int perf_header__getbuffer64(struct perf_header *self,
int fd, void *buf, size_t size)
{
- if (do_read(fd, buf, size) <= 0)
+ if (readn(fd, buf, size) <= 0)
return -1;
if (self->needs_swap)
{
lseek(fd, 0, SEEK_SET);
- if (do_read(fd, self, sizeof(*self)) <= 0 ||
+ if (readn(fd, self, sizeof(*self)) <= 0 ||
memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
return -1;
struct perf_header *ph, int fd,
bool repipe)
{
- if (do_read(fd, self, sizeof(*self)) <= 0 ||
+ if (readn(fd, self, sizeof(*self)) <= 0 ||
memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
return -1;
return type;
}
+bool perf_header__sample_id_all(const struct perf_header *header)
+{
+ bool value = false, first = true;
+ int i;
+
+ for (i = 0; i < header->attrs; i++) {
+ struct perf_header_attr *attr = header->attr[i];
+
+ if (first) {
+ value = attr->attr.sample_id_all;
+ first = false;
+ } else if (value != attr->attr.sample_id_all)
+ die("non matching sample_id_all");
+ }
+
+ return value;
+}
+
struct perf_event_attr *
perf_header__find_attr(u64 id, struct perf_header *header)
{
ev = malloc(size);
+ if (ev == NULL)
+ return -ENOMEM;
+
ev->attr.attr = *attr;
memcpy(ev->attr.id, id, ids * sizeof(u64));
ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
ev->attr.header.size = size;
- err = process(ev, session);
+ err = process(ev, NULL, session);
free(ev);
return err;
}
-int event__synthesize_attrs(struct perf_header *self,
- event__handler_t process,
+int event__synthesize_attrs(struct perf_header *self, event__handler_t process,
struct perf_session *session)
{
struct perf_header_attr *attr;
ev.event_type.header.size = sizeof(ev.event_type) -
(sizeof(ev.event_type.event_type.name) - size);
- err = process(&ev, session);
+ err = process(&ev, NULL, session);
return err;
}
return 0;
}
-int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
- int nb_events,
+int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
event__handler_t process,
struct perf_session *session __unused)
{
memset(&ev, 0, sizeof(ev));
ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
- size = read_tracing_data_size(fd, pattrs, nb_events);
+ size = read_tracing_data_size(fd, pattrs);
if (size <= 0)
return size;
aligned_size = ALIGN(size, sizeof(u64));
ev.tracing_data.header.size = sizeof(ev.tracing_data);
ev.tracing_data.size = aligned_size;
- process(&ev, session);
+ process(&ev, NULL, session);
- err = read_tracing_data(fd, pattrs, nb_events);
+ err = read_tracing_data(fd, pattrs);
write_padded(fd, NULL, 0, padding);
return aligned_size;
ev.build_id.header.size = sizeof(ev.build_id) + len;
memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
- err = process(&ev, session);
+ err = process(&ev, NULL, session);
return err;
}
int perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
u64 perf_header__sample_type(struct perf_header *header);
+bool perf_header__sample_id_all(const struct perf_header *header);
struct perf_event_attr *
perf_header__find_attr(u64 id, struct perf_header *header);
void perf_header__set_feat(struct perf_header *self, int feat);
+void perf_header__clear_feat(struct perf_header *self, int feat);
bool perf_header__has_feat(const struct perf_header *self, int feat);
int perf_header__process_sections(struct perf_header *self, int fd,
int event__process_event_type(event_t *self,
struct perf_session *session);
-int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
- int nb_events,
+int event__synthesize_tracing_data(int fd, struct list_head *pattrs,
event__handler_t process,
struct perf_session *session);
int event__process_tracing_data(event_t *self,
static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
int depth, int depth_mask, int period,
- u64 total_samples, int hits,
+ u64 total_samples, u64 hits,
int left_margin)
{
int i;
FILE *file;
int err = 0;
u64 len;
+ char symfs_filename[PATH_MAX];
+
+ if (filename) {
+ snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+ symbol_conf.symfs, filename);
+ }
if (filename == NULL) {
if (dso->has_build_id) {
return -ENOMEM;
}
goto fallback;
- } else if (readlink(filename, command, sizeof(command)) < 0 ||
+ } else if (readlink(symfs_filename, command, sizeof(command)) < 0 ||
strstr(command, "[kernel.kallsyms]") ||
- access(filename, R_OK)) {
+ access(symfs_filename, R_OK)) {
free(filename);
fallback:
/*
* DSO is the same as when 'perf record' ran.
*/
filename = dso->long_name;
+ snprintf(symfs_filename, sizeof(symfs_filename), "%s%s",
+ symbol_conf.symfs, filename);
free_filename = false;
}
"objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS -C %s|grep -v %s|expand",
map__rip_2objdump(map, sym->start),
map__rip_2objdump(map, sym->end),
- filename, filename);
+ symfs_filename, filename);
pr_debug("Executing: %s\n", command);
size_t ret = 0;
for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
- if (!event__name[i])
+ const char *name = event__get_event_name(i);
+
+ if (!strcmp(name, "UNKNOWN"))
continue;
- ret += fprintf(fp, "%10s events: %10d\n",
- event__name[i], self->stats.nr_events[i]);
+
+ ret += fprintf(fp, "%16s events: %10d\n", name,
+ self->stats.nr_events[i]);
}
return ret;
struct events_stats {
u64 total_period;
u64 total_lost;
+ u64 total_invalid_chains;
u32 nr_events[PERF_RECORD_HEADER_MAX];
u32 nr_unknown_events;
+ u32 nr_invalid_chains;
};
enum hist_column {
--- /dev/null
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
--- /dev/null
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}
+static inline void clear_bit(int nr, unsigned long *addr)
+{
+ addr[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
{
return ((1UL << (nr % BITS_PER_LONG)) &
--- /dev/null
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+ name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
#include "../../../include/linux/hw_breakpoint.h"
#include "util.h"
#include "../perf.h"
+#include "evsel.h"
#include "parse-options.h"
#include "parse-events.h"
#include "exec_cmd.h"
int nr_counters;
-struct perf_event_attr attrs[MAX_COUNTERS];
-char *filters[MAX_COUNTERS];
+LIST_HEAD(evsel_list);
struct event_symbol {
u8 type;
return name;
}
-const char *event_name(int counter)
+const char *event_name(struct perf_evsel *evsel)
{
- u64 config = attrs[counter].config;
- int type = attrs[counter].type;
+ u64 config = evsel->attr.config;
+ int type = evsel->attr.type;
return __event_name(type, config);
}
id = atoll(id_buf);
attr->config = id;
attr->type = PERF_TYPE_TRACEPOINT;
- *strp = evt_name + evt_length;
+ *strp += strlen(sys_name) + evt_length + 1; /* + 1 for the ':' */
attr->sample_type |= PERF_SAMPLE_RAW;
attr->sample_type |= PERF_SAMPLE_TIME;
struct perf_event_attr *attr)
{
const char *evt_name;
- char *flags;
+ char *flags = NULL, *comma_loc;
char sys_name[MAX_EVENT_LENGTH];
unsigned int sys_length, evt_length;
sys_name[sys_length] = '\0';
evt_name = evt_name + 1;
+ comma_loc = strchr(evt_name, ',');
+ if (comma_loc) {
+ /* take the event name up to the comma */
+ evt_name = strndup(evt_name, comma_loc - evt_name);
+ }
flags = strchr(evt_name, ':');
if (flags) {
/* split it out: */
evt_length = strlen(evt_name);
if (evt_length >= MAX_EVENT_LENGTH)
return EVT_FAILED;
-
if (strpbrk(evt_name, "*?")) {
- *strp = evt_name + evt_length;
+ *strp += strlen(sys_name) + evt_length;
return parse_multiple_tracepoint_event(sys_name, evt_name,
flags);
} else
return -1;
for (;;) {
- if (nr_counters == MAX_COUNTERS)
- return -1;
-
memset(&attr, 0, sizeof(attr));
ret = parse_event_symbols(&str, &attr);
if (ret == EVT_FAILED)
return -1;
if (ret != EVT_HANDLED_ALL) {
- attrs[nr_counters] = attr;
- nr_counters++;
+ struct perf_evsel *evsel;
+ evsel = perf_evsel__new(attr.type, attr.config,
+ nr_counters);
+ if (evsel == NULL)
+ return -1;
+ list_add_tail(&evsel->node, &evsel_list);
+ ++nr_counters;
}
if (*str == 0)
int parse_filter(const struct option *opt __used, const char *str,
int unset __used)
{
- int i = nr_counters - 1;
- int len = strlen(str);
+ struct perf_evsel *last = NULL;
- if (i < 0 || attrs[i].type != PERF_TYPE_TRACEPOINT) {
+ if (!list_empty(&evsel_list))
+ last = list_entry(evsel_list.prev, struct perf_evsel, node);
+
+ if (last == NULL || last->attr.type != PERF_TYPE_TRACEPOINT) {
fprintf(stderr,
"-F option should follow a -e tracepoint option\n");
return -1;
}
- filters[i] = malloc(len + 1);
- if (!filters[i]) {
+ last->filter = strdup(str);
+ if (last->filter == NULL) {
fprintf(stderr, "not enough memory to hold filter string\n");
return -1;
}
- strcpy(filters[i], str);
return 0;
}
closedir(sys_dir);
}
+/*
+ * Check whether event is in <debugfs_mount_point>/tracing/events
+ */
+
+int is_valid_tracepoint(const char *event_string)
+{
+ DIR *sys_dir, *evt_dir;
+ struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+ char evt_path[MAXPATHLEN];
+ char dir_path[MAXPATHLEN];
+
+ if (debugfs_valid_mountpoint(debugfs_path))
+ return 0;
+
+ sys_dir = opendir(debugfs_path);
+ if (!sys_dir)
+ return 0;
+
+ for_each_subsystem(sys_dir, sys_dirent, sys_next) {
+
+ snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path,
+ sys_dirent.d_name);
+ evt_dir = opendir(dir_path);
+ if (!evt_dir)
+ continue;
+
+ for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next) {
+ snprintf(evt_path, MAXPATHLEN, "%s:%s",
+ sys_dirent.d_name, evt_dirent.d_name);
+ if (!strcmp(evt_path, event_string)) {
+ closedir(evt_dir);
+ closedir(sys_dir);
+ return 1;
+ }
+ }
+ closedir(evt_dir);
+ }
+ closedir(sys_dir);
+ return 0;
+}
+
/*
* Print the help text for the event symbols:
*/
exit(129);
}
+
+int perf_evsel_list__create_default(void)
+{
+ struct perf_evsel *evsel = perf_evsel__new(PERF_TYPE_HARDWARE,
+ PERF_COUNT_HW_CPU_CYCLES, 0);
+ if (evsel == NULL)
+ return -ENOMEM;
+
+ list_add(&evsel->node, &evsel_list);
+ ++nr_counters;
+ return 0;
+}
+
+void perf_evsel_list__delete(void)
+{
+ struct perf_evsel *pos, *n;
+
+ list_for_each_entry_safe(pos, n, &evsel_list, node) {
+ list_del_init(&pos->node);
+ perf_evsel__delete(pos);
+ }
+ nr_counters = 0;
+}
* Parse symbolic events/counts passed in as options:
*/
+#include "../../../include/linux/perf_event.h"
+
+struct list_head;
+struct perf_evsel;
+
+extern struct list_head evsel_list;
+
+int perf_evsel_list__create_default(void);
+void perf_evsel_list__delete(void);
+
struct option;
struct tracepoint_path {
};
extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
-extern bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events);
+extern bool have_tracepoints(struct list_head *evsel_list);
extern int nr_counters;
-extern struct perf_event_attr attrs[MAX_COUNTERS];
-extern char *filters[MAX_COUNTERS];
-
-extern const char *event_name(int ctr);
+const char *event_name(struct perf_evsel *event);
extern const char *__event_name(int type, u64 config);
extern int parse_events(const struct option *opt, const char *str, int unset);
#define EVENTS_HELP_MAX (128*1024)
extern void print_events(void);
+extern int is_valid_tracepoint(const char *event_string);
extern char debugfs_path[];
extern int valid_debugfs_mount(const char *debugfs);
-
#endif /* __PERF_PARSE_EVENTS_H */
{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG }
#define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
+#define OPT_CALLBACK_DEFAULT_NOOPT(s, l, v, a, h, f, d) \
+ { .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l),\
+ .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d,\
+ .flags = PARSE_OPT_LASTARG_DEFAULT | PARSE_OPT_NOARG}
/* parse_options() will filter out the processed options and leave the
* non-option argments in argv[].
goto out;
if (machine__create_kernel_maps(&machine) < 0) {
- pr_debug("machine__create_kernel_maps ");
+ pr_debug("machine__create_kernel_maps() failed.\n");
goto out;
}
out:
{
const char *path = kernel_get_module_path(module);
if (!path) {
- pr_err("Failed to find path of %s module", module ?: "kernel");
+ pr_err("Failed to find path of %s module.\n",
+ module ?: "kernel");
return -ENOENT;
}
pr_debug("Try to open %s\n", path);
pr_warning("Warning: No dwarf info found in the vmlinux - "
"please rebuild kernel with CONFIG_DEBUG_INFO=y.\n");
if (!need_dwarf) {
- pr_debug("Trying to use symbols.\nn");
+ pr_debug("Trying to use symbols.\n");
return 0;
}
}
#define LINEBUF_SIZE 256
#define NR_ADDITIONAL_LINES 2
-static int show_one_line(FILE *fp, int l, bool skip, bool show_num)
+static int __show_one_line(FILE *fp, int l, bool skip, bool show_num)
{
char buf[LINEBUF_SIZE];
- const char *color = PERF_COLOR_BLUE;
+ const char *color = show_num ? "" : PERF_COLOR_BLUE;
+ const char *prefix = NULL;
- if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
- goto error;
- if (!skip) {
- if (show_num)
- fprintf(stdout, "%7d %s", l, buf);
- else
- color_fprintf(stdout, color, " %s", buf);
- }
-
- while (strlen(buf) == LINEBUF_SIZE - 1 &&
- buf[LINEBUF_SIZE - 2] != '\n') {
+ do {
if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
goto error;
- if (!skip) {
- if (show_num)
- fprintf(stdout, "%s", buf);
- else
- color_fprintf(stdout, color, "%s", buf);
+ if (skip)
+ continue;
+ if (!prefix) {
+ prefix = show_num ? "%7d " : " ";
+ color_fprintf(stdout, color, prefix, l);
}
- }
+ color_fprintf(stdout, color, "%s", buf);
- return 0;
+ } while (strchr(buf, '\n') == NULL);
+
+ return 1;
error:
- if (feof(fp))
- pr_warning("Source file is shorter than expected.\n");
- else
+ if (ferror(fp)) {
pr_warning("File read error: %s\n", strerror(errno));
+ return -1;
+ }
+ return 0;
+}
- return -1;
+static int _show_one_line(FILE *fp, int l, bool skip, bool show_num)
+{
+ int rv = __show_one_line(fp, l, skip, show_num);
+ if (rv == 0) {
+ pr_warning("Source file is shorter than expected.\n");
+ rv = -1;
+ }
+ return rv;
}
+#define show_one_line_with_num(f,l) _show_one_line(f,l,false,true)
+#define show_one_line(f,l) _show_one_line(f,l,false,false)
+#define skip_one_line(f,l) _show_one_line(f,l,true,false)
+#define show_one_line_or_eof(f,l) __show_one_line(f,l,false,false)
+
/*
* Show line-range always requires debuginfo to find source file and
* line number.
fprintf(stdout, "<%s:%d>\n", lr->function,
lr->start - lr->offset);
else
- fprintf(stdout, "<%s:%d>\n", lr->file, lr->start);
+ fprintf(stdout, "<%s:%d>\n", lr->path, lr->start);
fp = fopen(lr->path, "r");
if (fp == NULL) {
return -errno;
}
/* Skip to starting line number */
- while (l < lr->start && ret >= 0)
- ret = show_one_line(fp, l++, true, false);
- if (ret < 0)
- goto end;
+ while (l < lr->start) {
+ ret = skip_one_line(fp, l++);
+ if (ret < 0)
+ goto end;
+ }
list_for_each_entry(ln, &lr->line_list, list) {
- while (ln->line > l && ret >= 0)
- ret = show_one_line(fp, (l++) - lr->offset,
- false, false);
- if (ret >= 0)
- ret = show_one_line(fp, (l++) - lr->offset,
- false, true);
+ for (; ln->line > l; l++) {
+ ret = show_one_line(fp, l - lr->offset);
+ if (ret < 0)
+ goto end;
+ }
+ ret = show_one_line_with_num(fp, l++ - lr->offset);
if (ret < 0)
goto end;
}
if (lr->end == INT_MAX)
lr->end = l + NR_ADDITIONAL_LINES;
- while (l <= lr->end && !feof(fp) && ret >= 0)
- ret = show_one_line(fp, (l++) - lr->offset, false, false);
+ while (l <= lr->end) {
+ ret = show_one_line_or_eof(fp, l++ - lr->offset);
+ if (ret <= 0)
+ break;
+ }
end:
fclose(fp);
return ret;
fd = open_vmlinux(module);
if (fd < 0) {
- pr_warning("Failed to open debuginfo file.\n");
+ pr_warning("Failed to open debug information file.\n");
return fd;
}
}
#endif
+static int parse_line_num(char **ptr, int *val, const char *what)
+{
+ const char *start = *ptr;
+
+ errno = 0;
+ *val = strtol(*ptr, ptr, 0);
+ if (errno || *ptr == start) {
+ semantic_error("'%s' is not a valid number.\n", what);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Stuff 'lr' according to the line range described by 'arg'.
+ * The line range syntax is described by:
+ *
+ * SRC[:SLN[+NUM|-ELN]]
+ * FNC[:SLN[+NUM|-ELN]]
+ */
int parse_line_range_desc(const char *arg, struct line_range *lr)
{
- const char *ptr;
- char *tmp;
- /*
- * <Syntax>
- * SRC:SLN[+NUM|-ELN]
- * FUNC[:SLN[+NUM|-ELN]]
- */
- ptr = strchr(arg, ':');
- if (ptr) {
- lr->start = (int)strtoul(ptr + 1, &tmp, 0);
- if (*tmp == '+') {
- lr->end = lr->start + (int)strtoul(tmp + 1, &tmp, 0);
- lr->end--; /*
- * Adjust the number of lines here.
- * If the number of lines == 1, the
- * the end of line should be equal to
- * the start of line.
- */
- } else if (*tmp == '-')
- lr->end = (int)strtoul(tmp + 1, &tmp, 0);
- else
- lr->end = INT_MAX;
+ char *range, *name = strdup(arg);
+ int err;
+
+ if (!name)
+ return -ENOMEM;
+
+ lr->start = 0;
+ lr->end = INT_MAX;
+
+ range = strchr(name, ':');
+ if (range) {
+ *range++ = '\0';
+
+ err = parse_line_num(&range, &lr->start, "start line");
+ if (err)
+ goto err;
+
+ if (*range == '+' || *range == '-') {
+ const char c = *range++;
+
+ err = parse_line_num(&range, &lr->end, "end line");
+ if (err)
+ goto err;
+
+ if (c == '+') {
+ lr->end += lr->start;
+ /*
+ * Adjust the number of lines here.
+ * If the number of lines == 1, the
+ * the end of line should be equal to
+ * the start of line.
+ */
+ lr->end--;
+ }
+ }
+
pr_debug("Line range is %d to %d\n", lr->start, lr->end);
+
+ err = -EINVAL;
if (lr->start > lr->end) {
semantic_error("Start line must be smaller"
" than end line.\n");
- return -EINVAL;
+ goto err;
}
- if (*tmp != '\0') {
- semantic_error("Tailing with invalid character '%d'.\n",
- *tmp);
- return -EINVAL;
+ if (*range != '\0') {
+ semantic_error("Tailing with invalid str '%s'.\n", range);
+ goto err;
}
- tmp = strndup(arg, (ptr - arg));
- } else {
- tmp = strdup(arg);
- lr->end = INT_MAX;
}
- if (tmp == NULL)
- return -ENOMEM;
-
- if (strchr(tmp, '.'))
- lr->file = tmp;
+ if (strchr(name, '.'))
+ lr->file = name;
else
- lr->function = tmp;
+ lr->function = name;
return 0;
+err:
+ free(name);
+ return err;
}
/* Check the name is good for event/group */
/* Exclusion check */
if (pp->lazy_line && pp->line) {
- semantic_error("Lazy pattern can't be used with line number.");
+ semantic_error("Lazy pattern can't be used with"
+ " line number.\n");
return -EINVAL;
}
if (pp->lazy_line && pp->offset) {
- semantic_error("Lazy pattern can't be used with offset.");
+ semantic_error("Lazy pattern can't be used with offset.\n");
return -EINVAL;
}
if (pp->line && pp->offset) {
- semantic_error("Offset can't be used with line number.");
+ semantic_error("Offset can't be used with line number.\n");
return -EINVAL;
}
if (!pp->line && !pp->lazy_line && pp->file && !pp->function) {
semantic_error("File always requires line number or "
- "lazy pattern.");
+ "lazy pattern.\n");
return -EINVAL;
}
if (pp->offset && !pp->function) {
- semantic_error("Offset requires an entry function.");
+ semantic_error("Offset requires an entry function.\n");
return -EINVAL;
}
if (pp->retprobe && !pp->function) {
- semantic_error("Return probe requires an entry function.");
+ semantic_error("Return probe requires an entry function.\n");
return -EINVAL;
}
if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) {
semantic_error("Offset/Line/Lazy pattern can't be used with "
- "return probe.");
+ "return probe.\n");
return -EINVAL;
}
return tmp - buf;
error:
- pr_debug("Failed to synthesize perf probe argument: %s",
+ pr_debug("Failed to synthesize perf probe argument: %s\n",
strerror(-ret));
return ret;
}
goto error;
}
if (pp->file) {
- len = strlen(pp->file) - 31;
- if (len < 0)
- len = 0;
- tmp = strchr(pp->file + len, '/');
- if (!tmp)
- tmp = pp->file + len;
- ret = e_snprintf(file, 32, "@%s", tmp + 1);
+ tmp = pp->file;
+ len = strlen(tmp);
+ if (len > 30) {
+ tmp = strchr(pp->file + len - 30, '/');
+ tmp = tmp ? tmp + 1 : pp->file + len - 30;
+ }
+ ret = e_snprintf(file, 32, "@%s", tmp);
if (ret <= 0)
goto error;
}
return buf;
error:
- pr_debug("Failed to synthesize perf probe point: %s",
+ pr_debug("Failed to synthesize perf probe point: %s\n",
strerror(-ret));
if (buf)
free(buf);
ret = e_snprintf(buf, 128, "%s:%s", group, event);
if (ret < 0) {
- pr_err("Failed to copy event.");
+ pr_err("Failed to copy event.\n");
return ret;
}
regs = get_arch_regstr(regn);
if (!regs) {
/* This should be a bug in DWARF or this tool */
- pr_warning("Mapping for DWARF register number %u "
- "missing on this architecture.", regn);
+ pr_warning("Mapping for the register number %u "
+ "missing on this architecture.\n", regn);
return -ERANGE;
}
if (ret != DW_TAG_pointer_type &&
ret != DW_TAG_array_type) {
pr_warning("Failed to cast into string: "
- "%s(%s) is not a pointer nor array.",
+ "%s(%s) is not a pointer nor array.\n",
dwarf_diename(vr_die), dwarf_diename(&type));
return -EINVAL;
}
if (ret == DW_TAG_pointer_type) {
if (die_get_real_type(&type, &type) == NULL) {
- pr_warning("Failed to get a type information.");
+ pr_warning("Failed to get a type"
+ " information.\n");
return -ENOENT;
}
while (*ref_ptr)
if (!die_compare_name(&type, "char") &&
!die_compare_name(&type, "unsigned char")) {
pr_warning("Failed to cast into string: "
- "%s is not (unsigned) char *.",
+ "%s is not (unsigned) char *.\n",
dwarf_diename(vr_die));
return -EINVAL;
}
return -EINVAL;
}
if (field->name[0] == '[') {
- pr_err("Semantic error: %s is not a pointor nor array.",
- varname);
+ pr_err("Semantic error: %s is not a pointor"
+ " nor array.\n", varname);
return -EINVAL;
}
if (field->ref) {
name = dwarf_diename(sp_die);
if (name) {
if (dwarf_entrypc(sp_die, &eaddr) != 0) {
- pr_warning("Failed to get entry pc of %s\n",
+ pr_warning("Failed to get entry address of %s\n",
dwarf_diename(sp_die));
return -ENOENT;
}
if (retprobe) {
if (eaddr != paddr) {
pr_warning("Return probe must be on the head of"
- " a real function\n");
+ " a real function.\n");
return -EINVAL;
}
tp->retprobe = true;
Dwarf_Frame *frame;
if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 ||
dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) {
- pr_warning("Failed to get CFA on 0x%jx\n",
+ pr_warning("Failed to get call frame on 0x%jx\n",
(uintmax_t)pf->addr);
return -ENOENT;
}
int ret = 0;
if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
- pr_warning("No source lines found in this CU.\n");
+ pr_warning("No source lines found.\n");
return -ENOENT;
}
}
if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
- pr_warning("No source lines found in this CU.\n");
+ pr_warning("No source lines found.\n");
return -ENOENT;
}
else {
/* Get probe address */
if (dwarf_entrypc(in_die, &addr) != 0) {
- pr_warning("Failed to get entry pc of %s.\n",
+ pr_warning("Failed to get entry address of %s.\n",
dwarf_diename(in_die));
param->retval = -ENOENT;
return DWARF_CB_ABORT;
param->retval = find_probe_point_lazy(sp_die, pf);
else {
if (dwarf_entrypc(sp_die, &pf->addr) != 0) {
- pr_warning("Failed to get entry pc of %s.\n",
- dwarf_diename(sp_die));
+ pr_warning("Failed to get entry address of "
+ "%s.\n", dwarf_diename(sp_die));
param->retval = -ENOENT;
return DWARF_CB_ABORT;
}
dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias);
if (!dbg) {
- pr_warning("No dwarf info found in the vmlinux - "
+ pr_warning("No debug information found in the vmlinux - "
"please rebuild with CONFIG_DEBUG_INFO=y.\n");
return -EBADF;
}
/* Open the live linux kernel */
dbg = dwfl_init_live_kernel_dwarf(addr, &dwfl, &bias);
if (!dbg) {
- pr_warning("No dwarf info found in the vmlinux - "
+ pr_warning("No debug information found in the vmlinux - "
"please rebuild with CONFIG_DEBUG_INFO=y.\n");
ret = -EINVAL;
goto end;
addr += bias;
/* Find cu die */
if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr - bias, &cudie)) {
- pr_warning("No CU DIE is found at %lx\n", addr);
+ pr_warning("Failed to find debug information for address %lx\n",
+ addr);
ret = -EINVAL;
goto end;
}
line_list__init(&lf->lr->line_list);
if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
- pr_warning("No source lines found in this CU.\n");
+ pr_warning("No source lines found.\n");
return -ENOENT;
}
dbg = dwfl_init_offline_dwarf(fd, &dwfl, &bias);
if (!dbg) {
- pr_warning("No dwarf info found in the vmlinux - "
+ pr_warning("No debug information found in the vmlinux - "
"please rebuild with CONFIG_DEBUG_INFO=y.\n");
return -EBADF;
}
bool externs);
#include <dwarf.h>
-#include <libdw.h>
-#include <libdwfl.h>
-#include <version.h>
+#include <elfutils/libdw.h>
+#include <elfutils/libdwfl.h>
+#include <elfutils/version.h>
struct probe_finder {
struct perf_probe_event *pev; /* Target probe event */
/*
- * trace-event-perl. Feed perf trace events to an embedded Perl interpreter.
+ * trace-event-perl. Feed perf script events to an embedded Perl interpreter.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
return -1;
}
- fprintf(ofp, "# perf trace event handlers, "
- "generated by perf trace -g perl\n");
+ fprintf(ofp, "# perf script event handlers, "
+ "generated by perf script -g perl\n");
fprintf(ofp, "# Licensed under the terms of the GNU GPL"
" License version 2\n\n");
fprintf(stderr, "couldn't open %s\n", fname);
return -1;
}
- fprintf(ofp, "# perf trace event handlers, "
- "generated by perf trace -g python\n");
+ fprintf(ofp, "# perf script event handlers, "
+ "generated by perf script -g python\n");
fprintf(ofp, "# Licensed under the terms of the GNU GPL"
" License version 2\n\n");
return -1;
}
+static void perf_session__id_header_size(struct perf_session *session)
+{
+ struct sample_data *data;
+ u64 sample_type = session->sample_type;
+ u16 size = 0;
+
+ if (!session->sample_id_all)
+ goto out;
+
+ if (sample_type & PERF_SAMPLE_TID)
+ size += sizeof(data->tid) * 2;
+
+ if (sample_type & PERF_SAMPLE_TIME)
+ size += sizeof(data->time);
+
+ if (sample_type & PERF_SAMPLE_ID)
+ size += sizeof(data->id);
+
+ if (sample_type & PERF_SAMPLE_STREAM_ID)
+ size += sizeof(data->stream_id);
+
+ if (sample_type & PERF_SAMPLE_CPU)
+ size += sizeof(data->cpu) * 2;
+out:
+ session->id_hdr_size = size;
+}
+
+void perf_session__set_sample_id_all(struct perf_session *session, bool value)
+{
+ session->sample_id_all = value;
+ perf_session__id_header_size(session);
+}
+
+void perf_session__set_sample_type(struct perf_session *session, u64 type)
+{
+ session->sample_type = type;
+}
+
void perf_session__update_sample_type(struct perf_session *self)
{
self->sample_type = perf_header__sample_type(&self->header);
+ self->sample_id_all = perf_header__sample_id_all(&self->header);
+ perf_session__id_header_size(self);
}
int perf_session__create_kernel_maps(struct perf_session *self)
machines__destroy_guest_kernel_maps(&self->machines);
}
-struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe)
+struct perf_session *perf_session__new(const char *filename, int mode,
+ bool force, bool repipe,
+ struct perf_event_ops *ops)
{
size_t len = filename ? strlen(filename) + 1 : 0;
struct perf_session *self = zalloc(sizeof(*self) + len);
INIT_LIST_HEAD(&self->dead_threads);
self->hists_tree = RB_ROOT;
self->last_match = NULL;
- self->mmap_window = 32;
+ /*
+ * On 64bit we can mmap the data file in one go. No need for tiny mmap
+ * slices. On 32bit we use 32MB.
+ */
+#if BITS_PER_LONG == 64
+ self->mmap_window = ULLONG_MAX;
+#else
+ self->mmap_window = 32 * 1024 * 1024ULL;
+#endif
self->machines = RB_ROOT;
self->repipe = repipe;
- INIT_LIST_HEAD(&self->ordered_samples.samples_head);
+ INIT_LIST_HEAD(&self->ordered_samples.samples);
+ INIT_LIST_HEAD(&self->ordered_samples.sample_cache);
+ INIT_LIST_HEAD(&self->ordered_samples.to_free);
machine__init(&self->host_machine, "", HOST_KERNEL_ID);
if (mode == O_RDONLY) {
}
perf_session__update_sample_type(self);
+
+ if (ops && ops->ordering_requires_timestamps &&
+ ops->ordered_samples && !self->sample_id_all) {
+ dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n");
+ ops->ordered_samples = false;
+ }
+
out:
return self;
out_free:
return syms;
}
+static int process_event_synth_stub(event_t *event __used,
+ struct perf_session *session __used)
+{
+ dump_printf(": unhandled!\n");
+ return 0;
+}
+
static int process_event_stub(event_t *event __used,
+ struct sample_data *sample __used,
struct perf_session *session __used)
{
dump_printf(": unhandled!\n");
if (handler->exit == NULL)
handler->exit = process_event_stub;
if (handler->lost == NULL)
- handler->lost = process_event_stub;
+ handler->lost = event__process_lost;
if (handler->read == NULL)
handler->read = process_event_stub;
if (handler->throttle == NULL)
if (handler->unthrottle == NULL)
handler->unthrottle = process_event_stub;
if (handler->attr == NULL)
- handler->attr = process_event_stub;
+ handler->attr = process_event_synth_stub;
if (handler->event_type == NULL)
- handler->event_type = process_event_stub;
+ handler->event_type = process_event_synth_stub;
if (handler->tracing_data == NULL)
- handler->tracing_data = process_event_stub;
+ handler->tracing_data = process_event_synth_stub;
if (handler->build_id == NULL)
- handler->build_id = process_event_stub;
+ handler->build_id = process_event_synth_stub;
if (handler->finished_round == NULL) {
if (handler->ordered_samples)
handler->finished_round = process_finished_round;
struct sample_queue {
u64 timestamp;
- struct sample_event *event;
+ u64 file_offset;
+ event_t *event;
struct list_head list;
};
+static void perf_session_free_sample_buffers(struct perf_session *session)
+{
+ struct ordered_samples *os = &session->ordered_samples;
+
+ while (!list_empty(&os->to_free)) {
+ struct sample_queue *sq;
+
+ sq = list_entry(os->to_free.next, struct sample_queue, list);
+ list_del(&sq->list);
+ free(sq);
+ }
+}
+
+static int perf_session_deliver_event(struct perf_session *session,
+ event_t *event,
+ struct sample_data *sample,
+ struct perf_event_ops *ops,
+ u64 file_offset);
+
static void flush_sample_queue(struct perf_session *s,
struct perf_event_ops *ops)
{
- struct list_head *head = &s->ordered_samples.samples_head;
- u64 limit = s->ordered_samples.next_flush;
+ struct ordered_samples *os = &s->ordered_samples;
+ struct list_head *head = &os->samples;
struct sample_queue *tmp, *iter;
+ struct sample_data sample;
+ u64 limit = os->next_flush;
+ u64 last_ts = os->last_sample ? os->last_sample->timestamp : 0ULL;
if (!ops->ordered_samples || !limit)
return;
list_for_each_entry_safe(iter, tmp, head, list) {
if (iter->timestamp > limit)
- return;
+ break;
- if (iter == s->ordered_samples.last_inserted)
- s->ordered_samples.last_inserted = NULL;
+ event__parse_sample(iter->event, s, &sample);
+ perf_session_deliver_event(s, iter->event, &sample, ops,
+ iter->file_offset);
- ops->sample((event_t *)iter->event, s);
-
- s->ordered_samples.last_flush = iter->timestamp;
+ os->last_flush = iter->timestamp;
list_del(&iter->list);
- free(iter->event);
- free(iter);
+ list_add(&iter->list, &os->sample_cache);
+ }
+
+ if (list_empty(head)) {
+ os->last_sample = NULL;
+ } else if (last_ts <= limit) {
+ os->last_sample =
+ list_entry(head->prev, struct sample_queue, list);
}
}
return 0;
}
-static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
-{
- struct sample_queue *iter;
-
- list_for_each_entry_reverse(iter, head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, head);
-}
-
-static void __queue_sample_before(struct sample_queue *new,
- struct sample_queue *iter,
- struct list_head *head)
-{
- list_for_each_entry_continue_reverse(iter, head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, head);
-}
-
-static void __queue_sample_after(struct sample_queue *new,
- struct sample_queue *iter,
- struct list_head *head)
-{
- list_for_each_entry_continue(iter, head, list) {
- if (iter->timestamp > new->timestamp) {
- list_add_tail(&new->list, &iter->list);
- return;
- }
- }
- list_add_tail(&new->list, head);
-}
-
/* The queue is ordered by time */
-static void __queue_sample_event(struct sample_queue *new,
- struct perf_session *s)
+static void __queue_event(struct sample_queue *new, struct perf_session *s)
{
- struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
- struct list_head *head = &s->ordered_samples.samples_head;
+ struct ordered_samples *os = &s->ordered_samples;
+ struct sample_queue *sample = os->last_sample;
+ u64 timestamp = new->timestamp;
+ struct list_head *p;
+ os->last_sample = new;
- if (!last_inserted) {
- __queue_sample_end(new, head);
+ if (!sample) {
+ list_add(&new->list, &os->samples);
+ os->max_timestamp = timestamp;
return;
}
/*
- * Most of the time the current event has a timestamp
- * very close to the last event inserted, unless we just switched
- * to another event buffer. Having a sorting based on a list and
- * on the last inserted event that is close to the current one is
- * probably more efficient than an rbtree based sorting.
+ * last_sample might point to some random place in the list as it's
+ * the last queued event. We expect that the new event is close to
+ * this.
*/
- if (last_inserted->timestamp >= new->timestamp)
- __queue_sample_before(new, last_inserted, head);
- else
- __queue_sample_after(new, last_inserted, head);
+ if (sample->timestamp <= timestamp) {
+ while (sample->timestamp <= timestamp) {
+ p = sample->list.next;
+ if (p == &os->samples) {
+ list_add_tail(&new->list, &os->samples);
+ os->max_timestamp = timestamp;
+ return;
+ }
+ sample = list_entry(p, struct sample_queue, list);
+ }
+ list_add_tail(&new->list, &sample->list);
+ } else {
+ while (sample->timestamp > timestamp) {
+ p = sample->list.prev;
+ if (p == &os->samples) {
+ list_add(&new->list, &os->samples);
+ return;
+ }
+ sample = list_entry(p, struct sample_queue, list);
+ }
+ list_add(&new->list, &sample->list);
+ }
}
-static int queue_sample_event(event_t *event, struct sample_data *data,
- struct perf_session *s)
+#define MAX_SAMPLE_BUFFER (64 * 1024 / sizeof(struct sample_queue))
+
+static int perf_session_queue_event(struct perf_session *s, event_t *event,
+ struct sample_data *data, u64 file_offset)
{
+ struct ordered_samples *os = &s->ordered_samples;
+ struct list_head *sc = &os->sample_cache;
u64 timestamp = data->time;
struct sample_queue *new;
+ if (!timestamp || timestamp == ~0ULL)
+ return -ETIME;
if (timestamp < s->ordered_samples.last_flush) {
printf("Warning: Timestamp below last timeslice flush\n");
return -EINVAL;
}
- new = malloc(sizeof(*new));
- if (!new)
- return -ENOMEM;
+ if (!list_empty(sc)) {
+ new = list_entry(sc->next, struct sample_queue, list);
+ list_del(&new->list);
+ } else if (os->sample_buffer) {
+ new = os->sample_buffer + os->sample_buffer_idx;
+ if (++os->sample_buffer_idx == MAX_SAMPLE_BUFFER)
+ os->sample_buffer = NULL;
+ } else {
+ os->sample_buffer = malloc(MAX_SAMPLE_BUFFER * sizeof(*new));
+ if (!os->sample_buffer)
+ return -ENOMEM;
+ list_add(&os->sample_buffer->list, &os->to_free);
+ os->sample_buffer_idx = 2;
+ new = os->sample_buffer + 1;
+ }
new->timestamp = timestamp;
+ new->file_offset = file_offset;
+ new->event = event;
- new->event = malloc(event->header.size);
- if (!new->event) {
- free(new);
- return -ENOMEM;
- }
+ __queue_event(new, s);
- memcpy(new->event, event, event->header.size);
+ return 0;
+}
- __queue_sample_event(new, s);
- s->ordered_samples.last_inserted = new;
+static void callchain__printf(struct sample_data *sample)
+{
+ unsigned int i;
- if (new->timestamp > s->ordered_samples.max_timestamp)
- s->ordered_samples.max_timestamp = new->timestamp;
+ printf("... chain: nr:%Lu\n", sample->callchain->nr);
- return 0;
+ for (i = 0; i < sample->callchain->nr; i++)
+ printf("..... %2d: %016Lx\n", i, sample->callchain->ips[i]);
}
-static int perf_session__process_sample(event_t *event, struct perf_session *s,
- struct perf_event_ops *ops)
+static void perf_session__print_tstamp(struct perf_session *session,
+ event_t *event,
+ struct sample_data *sample)
{
- struct sample_data data;
+ if (event->header.type != PERF_RECORD_SAMPLE &&
+ !session->sample_id_all) {
+ fputs("-1 -1 ", stdout);
+ return;
+ }
- if (!ops->ordered_samples)
- return ops->sample(event, s);
+ if ((session->sample_type & PERF_SAMPLE_CPU))
+ printf("%u ", sample->cpu);
- bzero(&data, sizeof(struct sample_data));
- event__parse_sample(event, s->sample_type, &data);
+ if (session->sample_type & PERF_SAMPLE_TIME)
+ printf("%Lu ", sample->time);
+}
- queue_sample_event(event, &data, s);
+static void dump_event(struct perf_session *session, event_t *event,
+ u64 file_offset, struct sample_data *sample)
+{
+ if (!dump_trace)
+ return;
- return 0;
+ printf("\n%#Lx [%#x]: event: %d\n", file_offset, event->header.size,
+ event->header.type);
+
+ trace_event(event);
+
+ if (sample)
+ perf_session__print_tstamp(session, event, sample);
+
+ printf("%#Lx [%#x]: PERF_RECORD_%s", file_offset, event->header.size,
+ event__get_event_name(event->header.type));
}
-static int perf_session__process_event(struct perf_session *self,
- event_t *event,
- struct perf_event_ops *ops,
- u64 offset, u64 head)
+static void dump_sample(struct perf_session *session, event_t *event,
+ struct sample_data *sample)
{
- trace_event(event);
+ if (!dump_trace)
+ return;
- if (event->header.type < PERF_RECORD_HEADER_MAX) {
- dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
- offset + head, event->header.size,
- event__name[event->header.type]);
- hists__inc_nr_events(&self->hists, event->header.type);
- }
+ printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
+ sample->pid, sample->tid, sample->ip, sample->period);
- if (self->header.needs_swap && event__swap_ops[event->header.type])
- event__swap_ops[event->header.type](event);
+ if (session->sample_type & PERF_SAMPLE_CALLCHAIN)
+ callchain__printf(sample);
+}
+
+static int perf_session_deliver_event(struct perf_session *session,
+ event_t *event,
+ struct sample_data *sample,
+ struct perf_event_ops *ops,
+ u64 file_offset)
+{
+ dump_event(session, event, file_offset, sample);
switch (event->header.type) {
case PERF_RECORD_SAMPLE:
- return perf_session__process_sample(event, self, ops);
+ dump_sample(session, event, sample);
+ return ops->sample(event, sample, session);
case PERF_RECORD_MMAP:
- return ops->mmap(event, self);
+ return ops->mmap(event, sample, session);
case PERF_RECORD_COMM:
- return ops->comm(event, self);
+ return ops->comm(event, sample, session);
case PERF_RECORD_FORK:
- return ops->fork(event, self);
+ return ops->fork(event, sample, session);
case PERF_RECORD_EXIT:
- return ops->exit(event, self);
+ return ops->exit(event, sample, session);
case PERF_RECORD_LOST:
- return ops->lost(event, self);
+ return ops->lost(event, sample, session);
case PERF_RECORD_READ:
- return ops->read(event, self);
+ return ops->read(event, sample, session);
case PERF_RECORD_THROTTLE:
- return ops->throttle(event, self);
+ return ops->throttle(event, sample, session);
case PERF_RECORD_UNTHROTTLE:
- return ops->unthrottle(event, self);
+ return ops->unthrottle(event, sample, session);
+ default:
+ ++session->hists.stats.nr_unknown_events;
+ return -1;
+ }
+}
+
+static int perf_session__preprocess_sample(struct perf_session *session,
+ event_t *event, struct sample_data *sample)
+{
+ if (event->header.type != PERF_RECORD_SAMPLE ||
+ !(session->sample_type & PERF_SAMPLE_CALLCHAIN))
+ return 0;
+
+ if (!ip_callchain__valid(sample->callchain, event)) {
+ pr_debug("call-chain problem with event, skipping it.\n");
+ ++session->hists.stats.nr_invalid_chains;
+ session->hists.stats.total_invalid_chains += sample->period;
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int perf_session__process_user_event(struct perf_session *session, event_t *event,
+ struct perf_event_ops *ops, u64 file_offset)
+{
+ dump_event(session, event, file_offset, NULL);
+
+ /* These events are processed right away */
+ switch (event->header.type) {
case PERF_RECORD_HEADER_ATTR:
- return ops->attr(event, self);
+ return ops->attr(event, session);
case PERF_RECORD_HEADER_EVENT_TYPE:
- return ops->event_type(event, self);
+ return ops->event_type(event, session);
case PERF_RECORD_HEADER_TRACING_DATA:
/* setup for reading amidst mmap */
- lseek(self->fd, offset + head, SEEK_SET);
- return ops->tracing_data(event, self);
+ lseek(session->fd, file_offset, SEEK_SET);
+ return ops->tracing_data(event, session);
case PERF_RECORD_HEADER_BUILD_ID:
- return ops->build_id(event, self);
+ return ops->build_id(event, session);
case PERF_RECORD_FINISHED_ROUND:
- return ops->finished_round(event, self, ops);
+ return ops->finished_round(event, session, ops);
default:
- ++self->hists.stats.nr_unknown_events;
- return -1;
+ return -EINVAL;
}
}
+static int perf_session__process_event(struct perf_session *session,
+ event_t *event,
+ struct perf_event_ops *ops,
+ u64 file_offset)
+{
+ struct sample_data sample;
+ int ret;
+
+ if (session->header.needs_swap && event__swap_ops[event->header.type])
+ event__swap_ops[event->header.type](event);
+
+ if (event->header.type >= PERF_RECORD_HEADER_MAX)
+ return -EINVAL;
+
+ hists__inc_nr_events(&session->hists, event->header.type);
+
+ if (event->header.type >= PERF_RECORD_USER_TYPE_START)
+ return perf_session__process_user_event(session, event, ops, file_offset);
+
+ /*
+ * For all kernel events we get the sample data
+ */
+ event__parse_sample(event, session, &sample);
+
+ /* Preprocess sample records - precheck callchains */
+ if (perf_session__preprocess_sample(session, event, &sample))
+ return 0;
+
+ if (ops->ordered_samples) {
+ ret = perf_session_queue_event(session, event, &sample,
+ file_offset);
+ if (ret != -ETIME)
+ return ret;
+ }
+
+ return perf_session_deliver_event(session, event, &sample, ops,
+ file_offset);
+}
+
void perf_event_header__bswap(struct perf_event_header *self)
{
self->type = bswap_32(self->type);
return thread;
}
-int do_read(int fd, void *buf, size_t size)
+static void perf_session__warn_about_errors(const struct perf_session *session,
+ const struct perf_event_ops *ops)
{
- void *buf_start = buf;
-
- while (size) {
- int ret = read(fd, buf, size);
-
- if (ret <= 0)
- return ret;
+ if (ops->lost == event__process_lost &&
+ session->hists.stats.total_lost != 0) {
+ ui__warning("Processed %Lu events and LOST %Lu!\n\n"
+ "Check IO/CPU overload!\n\n",
+ session->hists.stats.total_period,
+ session->hists.stats.total_lost);
+ }
- size -= ret;
- buf += ret;
+ if (session->hists.stats.nr_unknown_events != 0) {
+ ui__warning("Found %u unknown events!\n\n"
+ "Is this an older tool processing a perf.data "
+ "file generated by a more recent tool?\n\n"
+ "If that is not the case, consider "
+ "reporting to linux-kernel@vger.kernel.org.\n\n",
+ session->hists.stats.nr_unknown_events);
}
- return buf - buf_start;
+ if (session->hists.stats.nr_invalid_chains != 0) {
+ ui__warning("Found invalid callchains!\n\n"
+ "%u out of %u events were discarded for this reason.\n\n"
+ "Consider reporting to linux-kernel@vger.kernel.org.\n\n",
+ session->hists.stats.nr_invalid_chains,
+ session->hists.stats.nr_events[PERF_RECORD_SAMPLE]);
+ }
}
#define session_done() (*(volatile int *)(&session_done))
head = 0;
more:
- err = do_read(self->fd, &event, sizeof(struct perf_event_header));
+ err = readn(self->fd, &event, sizeof(struct perf_event_header));
if (err <= 0) {
if (err == 0)
goto done;
p += sizeof(struct perf_event_header);
if (size - sizeof(struct perf_event_header)) {
- err = do_read(self->fd, p,
- size - sizeof(struct perf_event_header));
+ err = readn(self->fd, p, size - sizeof(struct perf_event_header));
if (err <= 0) {
if (err == 0) {
pr_err("unexpected end of event stream\n");
}
if (size == 0 ||
- (skip = perf_session__process_event(self, &event, ops,
- 0, head)) < 0) {
+ (skip = perf_session__process_event(self, &event, ops, head)) < 0) {
dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
head, event.header.size, event.header.type);
/*
head += size;
- dump_printf("\n%#Lx [%#x]: event: %d\n",
- head, event.header.size, event.header.type);
-
if (skip > 0)
head += skip;
done:
err = 0;
out_err:
+ perf_session__warn_about_errors(self, ops);
+ perf_session_free_sample_buffers(self);
return err;
}
-int __perf_session__process_events(struct perf_session *self,
+int __perf_session__process_events(struct perf_session *session,
u64 data_offset, u64 data_size,
u64 file_size, struct perf_event_ops *ops)
{
- int err, mmap_prot, mmap_flags;
- u64 head, shift;
- u64 offset = 0;
- size_t page_size;
+ u64 head, page_offset, file_offset, file_pos, progress_next;
+ int err, mmap_prot, mmap_flags, map_idx = 0;
+ struct ui_progress *progress;
+ size_t page_size, mmap_size;
+ char *buf, *mmaps[8];
event_t *event;
uint32_t size;
- char *buf;
- struct ui_progress *progress = ui_progress__new("Processing events...",
- self->size);
- if (progress == NULL)
- return -1;
perf_event_ops__fill_defaults(ops);
page_size = sysconf(_SC_PAGESIZE);
- head = data_offset;
- shift = page_size * (head / page_size);
- offset += shift;
- head -= shift;
+ page_offset = page_size * (data_offset / page_size);
+ file_offset = page_offset;
+ head = data_offset - page_offset;
+
+ if (data_offset + data_size < file_size)
+ file_size = data_offset + data_size;
+
+ progress_next = file_size / 16;
+ progress = ui_progress__new("Processing events...", file_size);
+ if (progress == NULL)
+ return -1;
+
+ mmap_size = session->mmap_window;
+ if (mmap_size > file_size)
+ mmap_size = file_size;
+
+ memset(mmaps, 0, sizeof(mmaps));
mmap_prot = PROT_READ;
mmap_flags = MAP_SHARED;
- if (self->header.needs_swap) {
+ if (session->header.needs_swap) {
mmap_prot |= PROT_WRITE;
mmap_flags = MAP_PRIVATE;
}
remap:
- buf = mmap(NULL, page_size * self->mmap_window, mmap_prot,
- mmap_flags, self->fd, offset);
+ buf = mmap(NULL, mmap_size, mmap_prot, mmap_flags, session->fd,
+ file_offset);
if (buf == MAP_FAILED) {
pr_err("failed to mmap file\n");
err = -errno;
goto out_err;
}
+ mmaps[map_idx] = buf;
+ map_idx = (map_idx + 1) & (ARRAY_SIZE(mmaps) - 1);
+ file_pos = file_offset + head;
more:
event = (event_t *)(buf + head);
- ui_progress__update(progress, offset);
- if (self->header.needs_swap)
+ if (session->header.needs_swap)
perf_event_header__bswap(&event->header);
size = event->header.size;
if (size == 0)
size = 8;
- if (head + event->header.size >= page_size * self->mmap_window) {
- int munmap_ret;
-
- shift = page_size * (head / page_size);
-
- munmap_ret = munmap(buf, page_size * self->mmap_window);
- assert(munmap_ret == 0);
+ if (head + event->header.size >= mmap_size) {
+ if (mmaps[map_idx]) {
+ munmap(mmaps[map_idx], mmap_size);
+ mmaps[map_idx] = NULL;
+ }
- offset += shift;
- head -= shift;
+ page_offset = page_size * (head / page_size);
+ file_offset += page_offset;
+ head -= page_offset;
goto remap;
}
size = event->header.size;
- dump_printf("\n%#Lx [%#x]: event: %d\n",
- offset + head, event->header.size, event->header.type);
-
if (size == 0 ||
- perf_session__process_event(self, event, ops, offset, head) < 0) {
+ perf_session__process_event(session, event, ops, file_pos) < 0) {
dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
- offset + head, event->header.size,
+ file_offset + head, event->header.size,
event->header.type);
/*
* assume we lost track of the stream, check alignment, and
}
head += size;
+ file_pos += size;
- if (offset + head >= data_offset + data_size)
- goto done;
+ if (file_pos >= progress_next) {
+ progress_next += file_size / 16;
+ ui_progress__update(progress, file_pos);
+ }
- if (offset + head < file_size)
+ if (file_pos < file_size)
goto more;
-done:
+
err = 0;
/* do the final flush for ordered samples */
- self->ordered_samples.next_flush = ULLONG_MAX;
- flush_sample_queue(self, ops);
+ session->ordered_samples.next_flush = ULLONG_MAX;
+ flush_sample_queue(session, ops);
out_err:
ui_progress__delete(progress);
+ perf_session__warn_about_errors(session, ops);
+ perf_session_free_sample_buffers(session);
return err;
}
u64 last_flush;
u64 next_flush;
u64 max_timestamp;
- struct list_head samples_head;
- struct sample_queue *last_inserted;
+ struct list_head samples;
+ struct list_head sample_cache;
+ struct list_head to_free;
+ struct sample_queue *sample_buffer;
+ struct sample_queue *last_sample;
+ int sample_buffer_idx;
};
struct perf_session {
int fd;
bool fd_pipe;
bool repipe;
+ bool sample_id_all;
+ u16 id_hdr_size;
int cwdlen;
char *cwd;
struct ordered_samples ordered_samples;
struct perf_event_ops;
-typedef int (*event_op)(event_t *self, struct perf_session *session);
+typedef int (*event_op)(event_t *self, struct sample_data *sample,
+ struct perf_session *session);
+typedef int (*event_synth_op)(event_t *self, struct perf_session *session);
typedef int (*event_op2)(event_t *self, struct perf_session *session,
struct perf_event_ops *ops);
lost,
read,
throttle,
- unthrottle,
- attr,
+ unthrottle;
+ event_synth_op attr,
event_type,
tracing_data,
build_id;
event_op2 finished_round;
bool ordered_samples;
+ bool ordering_requires_timestamps;
};
-struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
+struct perf_session *perf_session__new(const char *filename, int mode,
+ bool force, bool repipe,
+ struct perf_event_ops *ops);
void perf_session__delete(struct perf_session *self);
void perf_event_header__bswap(struct perf_event_header *self);
int perf_session__create_kernel_maps(struct perf_session *self);
-int do_read(int fd, void *buf, size_t size);
void perf_session__update_sample_type(struct perf_session *self);
+void perf_session__set_sample_id_all(struct perf_session *session, bool value);
+void perf_session__set_sample_type(struct perf_session *session, u64 type);
void perf_session__remove_thread(struct perf_session *self, struct thread *th);
static inline
return repsep_snprintf(bf, size, "%-*s", width, dso_name);
}
- return repsep_snprintf(bf, size, "%*Lx", width, self->ip);
+ return repsep_snprintf(bf, size, "%-*s", width, "[unknown]");
}
/* --sort symbol */
if (verbose) {
char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
- ret += repsep_snprintf(bf, size, "%*Lx %c ",
+ ret += repsep_snprintf(bf, size, "%-#*llx %c ",
BITS_PER_LONG / 4, self->ip, o);
}
ret += repsep_snprintf(bf + ret, size - ret, "%s",
self->ms.sym->name);
else
- ret += repsep_snprintf(bf + ret, size - ret, "%*Lx",
+ ret += repsep_snprintf(bf + ret, size - ret, "%-#*llx",
BITS_PER_LONG / 4, self->ip);
return ret;
#include <limits.h>
#include <sys/utsname.h>
+#ifndef KSYM_NAME_LEN
+#define KSYM_NAME_LEN 128
+#endif
+
#ifndef NT_GNU_BUILD_ID
#define NT_GNU_BUILD_ID 3
#endif
.exclude_other = true,
.use_modules = true,
.try_vmlinux_path = true,
+ .symfs = "",
};
int dso__name_len(const struct dso *self)
prev = curr;
curr = rb_entry(nd, struct symbol, rb_node);
- if (prev->end == prev->start)
+ if (prev->end == prev->start && prev->end != curr->start)
prev->end = curr->start - 1;
}
* We still haven't the actual symbols, so guess the
* last map final address.
*/
- curr->end = ~0UL;
+ curr->end = ~0ULL;
}
static void map_groups__fixup_end(struct map_groups *self)
int kallsyms__parse(const char *filename, void *arg,
int (*process_symbol)(void *arg, const char *name,
- char type, u64 start))
+ char type, u64 start, u64 end))
{
char *line = NULL;
size_t n;
- int err = 0;
+ int err = -1;
+ u64 prev_start = 0;
+ char prev_symbol_type = 0;
+ char *prev_symbol_name;
FILE *file = fopen(filename, "r");
if (file == NULL)
goto out_failure;
+ prev_symbol_name = malloc(KSYM_NAME_LEN);
+ if (prev_symbol_name == NULL)
+ goto out_close;
+
+ err = 0;
+
while (!feof(file)) {
u64 start;
int line_len, len;
continue;
symbol_type = toupper(line[len]);
- symbol_name = line + len + 2;
+ len += 2;
+ symbol_name = line + len;
+ len = line_len - len;
- err = process_symbol(arg, symbol_name, symbol_type, start);
- if (err)
+ if (len >= KSYM_NAME_LEN) {
+ err = -1;
break;
+ }
+
+ if (prev_symbol_type) {
+ u64 end = start;
+ if (end != prev_start)
+ --end;
+ err = process_symbol(arg, prev_symbol_name,
+ prev_symbol_type, prev_start, end);
+ if (err)
+ break;
+ }
+
+ memcpy(prev_symbol_name, symbol_name, len + 1);
+ prev_symbol_type = symbol_type;
+ prev_start = start;
}
+ free(prev_symbol_name);
free(line);
+out_close:
fclose(file);
return err;
}
static int map__process_kallsym_symbol(void *arg, const char *name,
- char type, u64 start)
+ char type, u64 start, u64 end)
{
struct symbol *sym;
struct process_kallsyms_args *a = arg;
if (!symbol_type__is_a(type, a->map->type))
return 0;
- /*
- * Will fix up the end later, when we have all symbols sorted.
- */
- sym = symbol__new(start, 0, kallsyms2elf_type(type), name);
-
+ sym = symbol__new(start, end - start + 1,
+ kallsyms2elf_type(type), name);
if (sym == NULL)
return -ENOMEM;
/*
if (dso__load_all_kallsyms(self, filename, map) < 0)
return -1;
- symbols__fixup_end(&self->symbols[map->type]);
if (self->kernel == DSO_TYPE_GUEST_KERNEL)
self->origin = DSO__ORIG_GUEST_KERNEL;
else
char sympltname[1024];
Elf *elf;
int nr = 0, symidx, fd, err = 0;
+ char name[PATH_MAX];
- fd = open(self->long_name, O_RDONLY);
+ snprintf(name, sizeof(name), "%s%s",
+ symbol_conf.symfs, self->long_name);
+ fd = open(name, O_RDONLY);
if (fd < 0)
goto out;
self->origin++) {
switch (self->origin) {
case DSO__ORIG_BUILD_ID_CACHE:
- if (dso__build_id_filename(self, name, size) == NULL)
+ /* skip the locally configured cache if a symfs is given */
+ if (symbol_conf.symfs[0] ||
+ (dso__build_id_filename(self, name, size) == NULL)) {
continue;
+ }
break;
case DSO__ORIG_FEDORA:
- snprintf(name, size, "/usr/lib/debug%s.debug",
- self->long_name);
+ snprintf(name, size, "%s/usr/lib/debug%s.debug",
+ symbol_conf.symfs, self->long_name);
break;
case DSO__ORIG_UBUNTU:
- snprintf(name, size, "/usr/lib/debug%s",
- self->long_name);
+ snprintf(name, size, "%s/usr/lib/debug%s",
+ symbol_conf.symfs, self->long_name);
break;
case DSO__ORIG_BUILDID: {
char build_id_hex[BUILD_ID_SIZE * 2 + 1];
sizeof(self->build_id),
build_id_hex);
snprintf(name, size,
- "/usr/lib/debug/.build-id/%.2s/%s.debug",
- build_id_hex, build_id_hex + 2);
+ "%s/usr/lib/debug/.build-id/%.2s/%s.debug",
+ symbol_conf.symfs, build_id_hex, build_id_hex + 2);
}
break;
case DSO__ORIG_DSO:
- snprintf(name, size, "%s", self->long_name);
+ snprintf(name, size, "%s%s",
+ symbol_conf.symfs, self->long_name);
break;
case DSO__ORIG_GUEST_KMODULE:
if (map->groups && map->groups->machine)
root_dir = map->groups->machine->root_dir;
else
root_dir = "";
- snprintf(name, size, "%s%s", root_dir, self->long_name);
+ snprintf(name, size, "%s%s%s", symbol_conf.symfs,
+ root_dir, self->long_name);
+ break;
+
+ case DSO__ORIG_KMODULE:
+ snprintf(name, size, "%s%s", symbol_conf.symfs,
+ self->long_name);
break;
default:
const char *vmlinux, symbol_filter_t filter)
{
int err = -1, fd;
+ char symfs_vmlinux[PATH_MAX];
- fd = open(vmlinux, O_RDONLY);
+ snprintf(symfs_vmlinux, sizeof(symfs_vmlinux), "%s/%s",
+ symbol_conf.symfs, vmlinux);
+ fd = open(symfs_vmlinux, O_RDONLY);
if (fd < 0)
return -1;
dso__set_loaded(self, map->type);
- err = dso__load_sym(self, map, vmlinux, fd, filter, 0, 0);
+ err = dso__load_sym(self, map, symfs_vmlinux, fd, filter, 0, 0);
close(fd);
if (err > 0)
- pr_debug("Using %s for symbols\n", vmlinux);
+ pr_debug("Using %s for symbols\n", symfs_vmlinux);
return err;
}
const char *kallsyms_filename = NULL;
char *kallsyms_allocated_filename = NULL;
/*
- * Step 1: if the user specified a vmlinux filename, use it and only
- * it, reporting errors to the user if it cannot be used.
+ * Step 1: if the user specified a kallsyms or vmlinux filename, use
+ * it and only it, reporting errors to the user if it cannot be used.
*
* For instance, try to analyse an ARM perf.data file _without_ a
* build-id, or if the user specifies the wrong path to the right
* validation in dso__load_vmlinux and will bail out if they don't
* match.
*/
+ if (symbol_conf.kallsyms_name != NULL) {
+ kallsyms_filename = symbol_conf.kallsyms_name;
+ goto do_kallsyms;
+ }
+
if (symbol_conf.vmlinux_name != NULL) {
err = dso__load_vmlinux(self, map,
symbol_conf.vmlinux_name, filter);
goto out_fixup;
}
+ /* do not try local files if a symfs was given */
+ if (symbol_conf.symfs[0] != 0)
+ return -1;
+
/*
* Say the kernel DSO was created when processing the build-id header table,
* we have a build-id, so check if it is the same as the running kernel,
};
static int symbol__in_kernel(void *arg, const char *name,
- char type __used, u64 start)
+ char type __used, u64 start, u64 end __used)
{
struct process_args *args = arg;
struct utsname uts;
char bf[PATH_MAX];
- if (uname(&uts) < 0)
- return -1;
-
vmlinux_path = malloc(sizeof(char *) * 5);
if (vmlinux_path == NULL)
return -1;
if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
goto out_fail;
++vmlinux_path__nr_entries;
+
+ /* only try running kernel version if no symfs was given */
+ if (symbol_conf.symfs[0] != 0)
+ return 0;
+
+ if (uname(&uts) < 0)
+ return -1;
+
snprintf(bf, sizeof(bf), "/boot/vmlinux-%s", uts.release);
vmlinux_path[vmlinux_path__nr_entries] = strdup(bf);
if (vmlinux_path[vmlinux_path__nr_entries] == NULL)
int symbol__init(void)
{
+ const char *symfs;
+
if (symbol_conf.initialized)
return 0;
symbol_conf.sym_list_str, "symbol") < 0)
goto out_free_comm_list;
+ /*
+ * A path to symbols of "/" is identical to ""
+ * reset here for simplicity.
+ */
+ symfs = realpath(symbol_conf.symfs, NULL);
+ if (symfs == NULL)
+ symfs = symbol_conf.symfs;
+ if (strcmp(symfs, "/") == 0)
+ symbol_conf.symfs = "";
+ if (symfs != symbol_conf.symfs)
+ free((void *)symfs);
+
symbol_conf.initialized = true;
return 0;
show_cpu_utilization,
initialized;
const char *vmlinux_name,
+ *kallsyms_name,
*source_prefix,
*field_sep;
const char *default_guest_vmlinux_name,
struct strlist *dso_list,
*comm_list,
*sym_list;
+ const char *symfs;
};
extern struct symbol_conf symbol_conf;
int build_id__sprintf(const u8 *self, int len, char *bf);
int kallsyms__parse(const char *filename, void *arg,
int (*process_symbol)(void *arg, const char *name,
- char type, u64 start));
+ char type, u64 start, u64 end));
void machine__destroy_kernel_maps(struct machine *self);
int __machine__create_kernel_maps(struct machine *self, struct dso *kernel);
return 1;
}
-int find_all_tid(int pid, pid_t ** all_tid)
+struct thread_map *thread_map__new_by_pid(pid_t pid)
{
+ struct thread_map *threads;
char name[256];
int items;
struct dirent **namelist = NULL;
- int ret = 0;
int i;
sprintf(name, "/proc/%d/task", pid);
items = scandir(name, &namelist, filter, NULL);
if (items <= 0)
- return -ENOENT;
- *all_tid = malloc(sizeof(pid_t) * items);
- if (!*all_tid) {
- ret = -ENOMEM;
- goto failure;
- }
-
- for (i = 0; i < items; i++)
- (*all_tid)[i] = atoi(namelist[i]->d_name);
+ return NULL;
- ret = items;
+ threads = malloc(sizeof(*threads) + sizeof(pid_t) * items);
+ if (threads != NULL) {
+ for (i = 0; i < items; i++)
+ threads->map[i] = atoi(namelist[i]->d_name);
+ threads->nr = items;
+ }
-failure:
for (i=0; i<items; i++)
free(namelist[i]);
free(namelist);
- return ret;
+ return threads;
+}
+
+struct thread_map *thread_map__new_by_tid(pid_t tid)
+{
+ struct thread_map *threads = malloc(sizeof(*threads) + sizeof(pid_t));
+
+ if (threads != NULL) {
+ threads->map[0] = tid;
+ threads->nr = 1;
+ }
+
+ return threads;
+}
+
+struct thread_map *thread_map__new(pid_t pid, pid_t tid)
+{
+ if (pid != -1)
+ return thread_map__new_by_pid(pid);
+ return thread_map__new_by_tid(tid);
}
static struct thread *thread__new(pid_t pid)
int comm_len;
};
+struct thread_map {
+ int nr;
+ int map[];
+};
+
struct perf_session;
void thread__delete(struct thread *self);
-int find_all_tid(int pid, pid_t ** all_tid);
+struct thread_map *thread_map__new_by_pid(pid_t pid);
+struct thread_map *thread_map__new_by_tid(pid_t tid);
+struct thread_map *thread_map__new(pid_t pid, pid_t tid);
+
+static inline void thread_map__delete(struct thread_map *threads)
+{
+ free(threads);
+}
+
int thread__set_comm(struct thread *self, const char *comm);
int thread__comm_len(struct thread *self);
struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
+#include <linux/list.h>
#include <linux/kernel.h>
#include "../perf.h"
#include "trace-event.h"
#include "debugfs.h"
+#include "evsel.h"
#define VERSION "0.5"
}
static struct tracepoint_path *
-get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
+get_tracepoints_path(struct list_head *pattrs)
{
struct tracepoint_path path, *ppath = &path;
- int i, nr_tracepoints = 0;
+ struct perf_evsel *pos;
+ int nr_tracepoints = 0;
- for (i = 0; i < nb_events; i++) {
- if (pattrs[i].type != PERF_TYPE_TRACEPOINT)
+ list_for_each_entry(pos, pattrs, node) {
+ if (pos->attr.type != PERF_TYPE_TRACEPOINT)
continue;
++nr_tracepoints;
- ppath->next = tracepoint_id_to_path(pattrs[i].config);
+ ppath->next = tracepoint_id_to_path(pos->attr.config);
if (!ppath->next)
die("%s\n", "No memory to alloc tracepoints list");
ppath = ppath->next;
return nr_tracepoints > 0 ? path.next : NULL;
}
-bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events)
+bool have_tracepoints(struct list_head *pattrs)
{
- int i;
+ struct perf_evsel *pos;
- for (i = 0; i < nb_events; i++)
- if (pattrs[i].type == PERF_TYPE_TRACEPOINT)
+ list_for_each_entry(pos, pattrs, node)
+ if (pos->attr.type == PERF_TYPE_TRACEPOINT)
return true;
return false;
}
-int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
+int read_tracing_data(int fd, struct list_head *pattrs)
{
char buf[BUFSIZ];
- struct tracepoint_path *tps = get_tracepoints_path(pattrs, nb_events);
+ struct tracepoint_path *tps = get_tracepoints_path(pattrs);
/*
* What? No tracepoints? No sense writing anything here, bail out.
return 0;
}
-ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
- int nb_events)
+ssize_t read_tracing_data_size(int fd, struct list_head *pattrs)
{
ssize_t size;
int err = 0;
calc_data_size = 1;
- err = read_tracing_data(fd, pattrs, nb_events);
+ err = read_tracing_data(fd, pattrs);
size = calc_data_size - 1;
calc_data_size = 0;
void *raw_field_ptr(struct event *event, const char *name, void *data);
unsigned long long eval_flag(const char *flag);
-int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
-ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
- int nb_events);
+int read_tracing_data(int fd, struct list_head *pattrs);
+ssize_t read_tracing_data_size(int fd, struct list_head *pattrs);
/* taken from kernel/trace/trace.h */
enum trace_flag_type {
return rc;
}
-static const char yes[] = "Yes", no[] = "No";
+static const char yes[] = "Yes", no[] = "No",
+ warning_str[] = "Warning!", ok[] = "Ok";
bool ui__dialog_yesno(const char *msg)
{
/* newtWinChoice should really be accepting const char pointers... */
return newtWinChoice(NULL, (char *)yes, (char *)no, (char *)msg) == 1;
}
+
+void ui__warning(const char *format, ...)
+{
+ va_list args;
+
+ va_start(args, format);
+ if (use_browser > 0)
+ newtWinMessagev((char *)warning_str, (char *)ok,
+ (char *)format, args);
+ else
+ vfprintf(stderr, format, args);
+ va_end(args);
+}
return value;
}
+
+int readn(int fd, void *buf, size_t n)
+{
+ void *buf_start = buf;
+
+ while (n) {
+ int ret = read(fd, buf, n);
+
+ if (ret <= 0)
+ return ret;
+
+ n -= ret;
+ buf += ret;
+ }
+
+ return buf - buf_start;
+}
bool strglobmatch(const char *str, const char *pat);
bool strlazymatch(const char *str, const char *pat);
unsigned long convert_unit(unsigned long value, char *unit);
+int readn(int fd, void *buf, size_t size);
#define _STR(x) #x
#define STR(x) _STR(x)
--- /dev/null
+#include "xyarray.h"
+#include "util.h"
+
+struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size)
+{
+ size_t row_size = ylen * entry_size;
+ struct xyarray *xy = zalloc(sizeof(*xy) + xlen * row_size);
+
+ if (xy != NULL) {
+ xy->entry_size = entry_size;
+ xy->row_size = row_size;
+ }
+
+ return xy;
+}
+
+void xyarray__delete(struct xyarray *xy)
+{
+ free(xy);
+}
--- /dev/null
+#ifndef _PERF_XYARRAY_H_
+#define _PERF_XYARRAY_H_ 1
+
+#include <sys/types.h>
+
+struct xyarray {
+ size_t row_size;
+ size_t entry_size;
+ char contents[];
+};
+
+struct xyarray *xyarray__new(int xlen, int ylen, size_t entry_size);
+void xyarray__delete(struct xyarray *xy);
+
+static inline void *xyarray__entry(struct xyarray *xy, int x, int y)
+{
+ return &xy->contents[x * xy->row_size + y * xy->entry_size];
+}
+
+#endif /* _PERF_XYARRAY_H_ */