arch/x86/kernel/process_64.c
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that is not saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
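
/*
 * Illustrative sketch (not part of this file): the same "movl %%fs,%0"
 * idiom used above also works in a plain userspace program to read the
 * selector values, while the 64-bit FS/GS bases read via rdmsrl() here
 * are ring-0 only; userspace would query those through arch_prctl(2).
 * The program below is a hypothetical example, x86-64 Linux assumed.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned int fsindex, gsindex;
 *
 *		asm("movl %%fs,%0" : "=r" (fsindex));
 *		asm("movl %%gs,%0" : "=r" (gsindex));
 *		printf("fs selector=%#x gs selector=%#x\n", fsindex, gsindex);
 *		return 0;
 *	}
 */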

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}
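
/*
 * Illustrative userspace sketch (not part of this file): the
 * struct user_desc filled in by set_32bit_tls() above has the same
 * layout a 32-bit task passes to set_thread_area(2). A userspace
 * equivalent of the flat 4 GiB segment built above might look like
 * this; the helper name is hypothetical.
 *
 *	#include <asm/ldt.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	void set_tls_base(unsigned int addr)	// hypothetical helper
 *	{
 *		struct user_desc ud = {
 *			.entry_number	= -1,	// kernel picks a free TLS slot
 *			.base_addr	= addr,
 *			.limit		= 0xfffff,
 *			.seg_32bit	= 1,
 *			.limit_in_pages	= 1,	// limit in 4 KiB pages => 4 GiB
 *			.useable	= 1,
 *		};
 *		syscall(SYS_set_thread_area, &ud);
 *	}
 */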

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	p->thread.sp = (unsigned long) childregs;
	p->thread.usersp = me->thread.usersp;
	set_tsk_thread_flag(p, TIF_FORK);
	p->fpu_counter = 0;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
		return 0;
	}
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}
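
/*
 * Illustrative sketch (not part of this file): how the CLONE_SETTLS
 * path above is reached from 64-bit userspace. The tls argument of
 * the glibc clone() wrapper is the fifth kernel argument, i.e. %r8
 * at syscall entry, which is exactly what
 * do_arch_prctl(p, ARCH_SET_FS, childregs->r8) consumes here. Stack
 * size and function names below are only for the example.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdlib.h>
 *
 *	static int child_fn(void *arg)	// hypothetical thread body
 *	{
 *		return 0;
 *	}
 *
 *	int spawn_with_tls(void *tls_block)
 *	{
 *		char *stack = malloc(64 * 1024);
 *
 *		return clone(child_fn, stack + 64 * 1024,
 *			     CLONE_VM | CLONE_SETTLS | SIGCHLD,
 *			     NULL, NULL, tls_block, NULL);
 *	}
 */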

static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	current->thread.usersp	= new_sp;
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	this_cpu_write(old_rsp, new_sp);
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload. Also
	 * reload when it has changed. When the previous process used
	 * a 64-bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes
		 * clear the 64-bit base, since an overloaded base is
		 * always mapped to the null selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* When the next process has a 64-bit base, use it. */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = this_cpu_read(old_rsp);
	this_cpu_write(old_rsp, next->usersp);
	this_cpu_write(current_task, next_p);

	this_cpu_write(kernel_stack,
		  (unsigned long)task_stack_page(next_p) +
		  THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	return prev_p;
}
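
/*
 * Illustrative userspace sketch (not part of this file) of the rule
 * the FS/GS logic above enforces: a task uses either a nonzero
 * selector (a GDT/TLS descriptor) or a nonzero 64-bit base MSR,
 * never both. Once a nonzero selector is in use, the saved 64-bit
 * base is dropped (prev->fs = 0 above) at the next context switch.
 * The helper and constants below are hypothetical.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	// Hypothetical demo: move FS from an MSR-backed base to a
 *	// TLS-selector-backed one; the old base is not preserved.
 *	void fs_demo(unsigned short tls_sel)
 *	{
 *		syscall(SYS_arch_prctl, ARCH_SET_FS, 0x7f0000000000UL);
 *		asm volatile("movw %0, %%fs" : : "rm" (tls_sel));
 *	}
 */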

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: this overwrites the user's setup; it should use two
	   bits. But 64-bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that 32-bit
	   children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (current->mm)
		current->mm->context.ia32_compat = 1;

	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		current->personality &= ~READ_IMPLIES_EXEC;
		/* is_compat_task() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
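
/*
 * Illustrative sketch (not part of this file): READ_IMPLIES_EXEC,
 * adjusted by the two functions above on exec, is the same bit
 * userspace can inspect or set through personality(2). Passing
 * 0xffffffff queries the current persona without changing it; the
 * helper name is hypothetical.
 *
 *	#include <stdio.h>
 *	#include <sys/personality.h>
 *
 *	void show_read_implies_exec(void)	// hypothetical helper
 *	{
 *		int persona = personality(0xffffffff);	// query only
 *
 *		printf("READ_IMPLIES_EXEC: %s\n",
 *		       (persona & READ_IMPLIES_EXEC) ? "set" : "clear");
 *	}
 */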

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack+THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp+8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}
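
/*
 * Illustrative userspace sketch (not part of this file): get_wchan()
 * above walks the classic x86-64 frame-pointer chain, where each
 * frame holds the saved caller %rbp at (fp) and the return address
 * at (fp + 8). The same walk for the calling program itself, assuming
 * it was built with -fno-omit-frame-pointer; the helper name is
 * hypothetical.
 *
 *	#include <stdio.h>
 *
 *	void backtrace_fp(void)	// hypothetical helper
 *	{
 *		unsigned long fp, ip;
 *		int count = 0;
 *
 *		asm("movq %%rbp, %0" : "=r" (fp));
 *		while (fp && count++ < 16) {
 *			ip = *(unsigned long *)(fp + 8);
 *			printf("frame %d: return ip %#lx\n", count, ip);
 *			fp = *(unsigned long *)fp;	// saved caller %rbp
 *		}
 *	}
 */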

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
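
/*
 * Illustrative userspace sketch (not part of this file): exercising
 * do_arch_prctl() through the arch_prctl(2) syscall. Per the code
 * above, a base <= 0xffffffff is installed as a GDT TLS descriptor,
 * while a larger base goes through the FS/GS base MSR. The helper
 * name is hypothetical.
 *
 *	#include <asm/prctl.h>
 *	#include <stdio.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int gs_roundtrip(unsigned long addr)	// hypothetical helper
 *	{
 *		unsigned long base = 0;
 *
 *		if (syscall(SYS_arch_prctl, ARCH_SET_GS, addr))
 *			return -1;
 *		syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
 *		printf("GS base now %#lx\n", base);
 *		return 0;
 *	}
 */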

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}