4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
51 #include "../include/cl_object.h"
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache.
 * GFP_NOFS keeps reclaim from re-entering the filesystem.
 * NOTE(review): the allocation-failure check and the return statement
 * appear elided in this extract — confirm against the full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache (counterpart of ll_file_data_get). */
74 static void ll_file_data_put(struct ll_file_data *fd)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Snapshot the inode's current attributes (fid, mode, a/m/ctime, size,
 * blocks, ext flags, ioepoch) plus the open handle @fh into @op_data,
 * for transmission to the MDS (e.g. on close).  Also takes an MDS
 * capability reference via ll_mdscapa_get() — ownership passes to
 * @op_data.  If the inode has locally-modified data, request the
 * MDS_DATA_MODIFIED bias so the server knows to update accordingly.
 */
80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper around struct iattr. */
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
94 op_data->op_handle = *fh;
95 op_data->op_capa1 = ll_mdscapa_get(inode);
97 if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
98 op_data->op_bias |= MDS_DATA_MODIFIED;
/*
 * Close the IO epoch and pack the attributes to send on close.
 * Timestamps are always sent; size/blocks are included only when safe
 * (no SOM support on the server, non-regular file, or per flags set by
 * ll_ioepoch_close()).  NOTE(review): several branch bodies appear
 * elided in this extract — confirm control flow against the original.
 */
102 * Closes the IO epoch and packs all the attributes into @op_data for
105 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
106 struct obd_client_handle *och)
108 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
109 ATTR_MTIME | ATTR_MTIME_SET |
110 ATTR_CTIME | ATTR_CTIME_SET;
112 if (!(och->och_flags & FMODE_WRITE))
115 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
116 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
118 ll_ioepoch_close(inode, op_data, &och, 0);
121 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
122 ll_prep_md_op_data(op_data, inode, NULL, NULL,
123 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS close RPC for the open handle @och on @inode.
 * If @data_version is non-NULL this is an HSM release-style close:
 * the data version and lease handle are packed and size/blocks sent.
 * On SOM servers a close may also require a Size-on-MDS update.
 * Afterwards the open replay data is cleared and @och is poisoned with
 * DEAD_HANDLE_MAGIC.  NOTE(review): many lines (error paths, local
 * declarations such as rc/epoch_close, closing braces) appear elided
 * in this extract — confirm the full logic against the original.
 */
126 static int ll_close_inode_openhandle(struct obd_export *md_exp,
128 struct obd_client_handle *och,
129 const __u64 *data_version)
131 struct obd_export *exp = ll_i2mdexp(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
134 struct obd_device *obd = class_exp2obd(exp);
140 * XXX: in case of LMV, is this correct to access
143 CERROR("Invalid MDC connection handle %#llx\n",
144 ll_i2mdexp(inode)->exp_handle.h_cookie);
149 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
151 /* XXX We leak openhandle and request here. */
156 ll_prepare_close(inode, op_data, och);
157 if (data_version != NULL) {
158 /* Pass in data_version implies release. */
159 op_data->op_bias |= MDS_HSM_RELEASE;
160 op_data->op_data_version = *data_version;
161 op_data->op_lease_handle = och->och_lease_handle;
162 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
164 epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
165 rc = md_close(md_exp, op_data, och->och_mod, &req);
167 /* This close must have the epoch closed. */
168 LASSERT(epoch_close);
169 /* MDS has instructed us to obtain Size-on-MDS attribute from
170 * OSTs and send setattr to back to MDS. */
171 rc = ll_som_update(inode, op_data);
173 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
178 CERROR("inode %lu mdc close failed: rc = %d\n",
182 /* DATA_MODIFIED flag was successfully sent on close, cancel data
183 * modification flag. */
184 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
185 struct ll_inode_info *lli = ll_i2info(inode);
187 spin_lock(&lli->lli_lock);
188 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
189 spin_unlock(&lli->lli_lock);
193 rc = ll_objects_destroy(req, inode);
195 CERROR("inode %lu ll_objects destroy: rc = %d\n",
198 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
199 struct mdt_body *body;
201 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
202 if (!(body->valid & OBD_MD_FLRELEASED))
206 ll_finish_md_op_data(op_data);
/* SOM + unclosed epoch + writable regular file: defer via DONE_WRITING. */
209 if (exp_connect_som(exp) && !epoch_close &&
210 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
211 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
213 md_clear_open_replay_data(md_exp, och);
214 /* Free @och if it is not waiting for DONE_WRITING. */
215 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
218 if (req) /* This is close request */
219 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle for the given open mode, unless
 * other local users of the same (write/exec/read) handle remain.
 * Selects the per-mode handle slot and use-count under lli_och_mutex;
 * if the use count is still positive the close is skipped.
 * NOTE(review): the handle-swap under the mutex and the tail of the
 * function appear elided in this extract — confirm against original.
 */
223 int ll_md_real_close(struct inode *inode, fmode_t fmode)
225 struct ll_inode_info *lli = ll_i2info(inode);
226 struct obd_client_handle **och_p;
227 struct obd_client_handle *och;
231 if (fmode & FMODE_WRITE) {
232 och_p = &lli->lli_mds_write_och;
233 och_usecount = &lli->lli_open_fd_write_count;
234 } else if (fmode & FMODE_EXEC) {
235 och_p = &lli->lli_mds_exec_och;
236 och_usecount = &lli->lli_open_fd_exec_count;
238 LASSERT(fmode & FMODE_READ);
239 och_p = &lli->lli_mds_read_och;
240 och_usecount = &lli->lli_open_fd_read_count;
243 mutex_lock(&lli->lli_och_mutex);
244 if (*och_usecount > 0) {
245 /* There are still users of this handle, so skip
247 mutex_unlock(&lli->lli_och_mutex);
253 mutex_unlock(&lli->lli_och_mutex);
256 /* There might be a race and this handle may already
258 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop group lock, clean up any leftover
 * lease (e.g. after an application crash), close a private open handle
 * if present, then decrement the per-mode open count.  If no matching
 * OPEN DLM lock remains (md_lock_match with LDLM_FL_TEST_LOCK), fall
 * through to ll_md_real_close() to close the MDS handle.  Finally
 * detach and free the ll_file_data and release the capability.
 * NOTE(review): 'lockmode' selection and some early-return paths are
 * elided in this extract.
 */
265 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
268 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
269 struct ll_inode_info *lli = ll_i2info(inode);
271 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
272 struct lustre_handle lockh;
273 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
276 /* clear group lock, if present */
277 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
278 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
280 if (fd->fd_lease_och != NULL) {
283 /* Usually the lease is not released when the
284 * application crashed, we need to release here. */
285 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
286 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
287 PFID(&lli->lli_fid), rc, lease_broken);
289 fd->fd_lease_och = NULL;
292 if (fd->fd_och != NULL) {
293 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
298 /* Let's see if we have good enough OPEN lock on the file and if
299 we can skip talking to MDS */
301 mutex_lock(&lli->lli_och_mutex);
302 if (fd->fd_omode & FMODE_WRITE) {
304 LASSERT(lli->lli_open_fd_write_count);
305 lli->lli_open_fd_write_count--;
306 } else if (fd->fd_omode & FMODE_EXEC) {
308 LASSERT(lli->lli_open_fd_exec_count);
309 lli->lli_open_fd_exec_count--;
312 LASSERT(lli->lli_open_fd_read_count);
313 lli->lli_open_fd_read_count--;
315 mutex_unlock(&lli->lli_och_mutex);
317 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
318 LDLM_IBITS, &policy, lockmode, &lockh))
319 rc = ll_md_real_close(inode, fd->fd_omode);
322 LUSTRE_FPRIVATE(file) = NULL;
323 ll_file_data_put(fd);
324 ll_capa_close(inode);
329 /* While this returns an error code, fput() the caller does not, so we need
330 * to make every effort to clean up all of our state here. Also, applications
331 * rarely check close errors and even if an error is returned they will not
332 * re-try the close call.
/*
 * VFS ->release() entry point: remote-ACL bookkeeping (root inode on
 * RMT_CLIENT mounts), stats tally, stop the statahead thread if this
 * fd owns it, short-circuit for the root inode, flush async write
 * errors into lli_async_rc for regular files, then ll_md_close().
 * NOTE(review): error-path returns and the ACL handling tail appear
 * elided in this extract.
 */
334 int ll_file_release(struct inode *inode, struct file *file)
336 struct ll_file_data *fd;
337 struct ll_sb_info *sbi = ll_i2sbi(inode);
338 struct ll_inode_info *lli = ll_i2info(inode);
341 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
342 inode->i_generation, inode);
344 #ifdef CONFIG_FS_POSIX_ACL
345 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
346 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
349 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
350 fd->fd_flags &= ~LL_FILE_RMTACL;
351 rct_del(&sbi->ll_rct, current_pid());
352 et_search_free(&sbi->ll_et, current_pid());
357 if (!is_root_inode(inode))
358 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
359 fd = LUSTRE_FPRIVATE(file);
362 /* The last ref on @file, maybe not the owner pid of statahead.
363 * Different processes can open the same dir, "ll_opendir_key" means:
364 * it is me that should stop the statahead thread. */
365 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
366 lli->lli_opendir_pid != 0)
367 ll_stop_statahead(inode, lli->lli_opendir_key);
369 if (is_root_inode(inode)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 lov_read_and_clear_async_rc(lli->lli_clob);
377 lli->lli_async_rc = 0;
380 rc = ll_md_close(sbi->ll_md_exp, inode, file);
382 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
383 libcfs_debug_dumplog();
/*
 * Perform an intent-based open against the MDS for @dentry.
 * When no striping data is being set (lmm == NULL, lmmsize == 0),
 * request an OPEN lock (and CREATE opcode for writes).  The result is
 * applied to the inode via ll_prep_inode()/ll_set_lock_data().
 * A -ESTALE-friendly exit path avoids flooding the log.
 * NOTE(review): several lines (op_data argument list continuation,
 * error labels, returns) appear elided in this extract.
 */
388 static int ll_intent_file_open(struct dentry *dentry, void *lmm,
389 int lmmsize, struct lookup_intent *itp)
391 struct inode *inode = d_inode(dentry);
392 struct ll_sb_info *sbi = ll_i2sbi(inode);
393 struct dentry *parent = dentry->d_parent;
394 const char *name = dentry->d_name.name;
395 const int len = dentry->d_name.len;
396 struct md_op_data *op_data;
397 struct ptlrpc_request *req;
398 __u32 opc = LUSTRE_OPC_ANY;
401 /* Usually we come here only for NFSD, and we want open lock.
402 But we can also get here with pre 2.6.15 patchless kernels, and in
403 that case that lock is also ok */
404 /* We can also get here if there was cached open handle in revalidate_it
405 * but it disappeared while we were getting from there to ll_file_open.
406 * But this means this file was closed and immediately opened which
407 * makes a good candidate for using OPEN lock */
408 /* If lmmsize & lmm are not 0, we are just setting stripe info
409 * parameters. No need for the open lock */
410 if (lmm == NULL && lmmsize == 0) {
411 itp->it_flags |= MDS_OPEN_LOCK;
412 if (itp->it_flags & FMODE_WRITE)
413 opc = LUSTRE_OPC_CREATE;
416 op_data = ll_prep_md_op_data(NULL, d_inode(parent),
420 return PTR_ERR(op_data);
422 itp->it_flags |= MDS_OPEN_BY_FID;
423 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
424 0 /*unused */, &req, ll_md_blocking_ast, 0);
425 ll_finish_md_op_data(op_data);
427 /* reason for keep own exit path - don`t flood log
428 * with messages with -ESTALE errors.
430 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
431 it_open_error(DISP_OPEN_OPEN, itp))
433 ll_release_openhandle(inode, itp);
437 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
442 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
443 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
444 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
448 rc = ll_prep_inode(&inode, req, NULL, itp);
449 if (!rc && itp->d.lustre.it_lock_mode)
450 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
453 ptlrpc_req_finished(req);
454 ll_intent_drop_lock(itp);
/*
 * Record a newly-granted IO epoch on the client inode (lockless:
 * the MDS discounts attributes while multiple epoch holders exist).
 */
460 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
461 * not believe attributes if a few ioepoch holders exist. Attributes for
462 * previous ioepoch if new one is opened are also skipped by MDS.
464 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
466 if (ioepoch && lli->lli_ioepoch != ioepoch) {
467 lli->lli_ioepoch = ioepoch;
468 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
469 ioepoch, PFID(&lli->lli_fid));
/*
 * Populate an obd_client_handle from the server reply carried in the
 * lookup intent (open handle, fid, lease lock cookie, flags), then
 * register it for open replay on recovery.
 * Returns the result of md_set_open_replay_data().
 */
473 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
474 struct obd_client_handle *och)
476 struct ptlrpc_request *req = it->d.lustre.it_data;
477 struct mdt_body *body;
479 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
480 och->och_fh = body->handle;
481 och->och_fid = body->fid1;
482 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
483 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
484 och->och_flags = it->it_flags;
486 return md_set_open_replay_data(md_exp, och, it);
/*
 * Complete the client-side part of an open: optionally fill @och from
 * the intent reply (recording the new ioepoch), then attach @fd to the
 * file, initialise readahead state, and remember the open mode.
 * NOTE(review): the conditional guarding the och-fill branch and the
 * return appear elided in this extract.
 */
489 static int ll_local_open(struct file *file, struct lookup_intent *it,
490 struct ll_file_data *fd, struct obd_client_handle *och)
492 struct inode *inode = file_inode(file);
493 struct ll_inode_info *lli = ll_i2info(inode);
495 LASSERT(!LUSTRE_FPRIVATE(file));
500 struct ptlrpc_request *req = it->d.lustre.it_data;
501 struct mdt_body *body;
504 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
508 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
509 ll_ioepoch_open(lli, body->ioepoch);
512 LUSTRE_FPRIVATE(file) = fd;
513 ll_readahead_init(inode, &fd->fd_ras);
514 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
518 /* Open a file, and (for the very first open) create objects on the OSTs at
519 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
520 * creation or open until ll_lov_setstripe() ioctl is called.
522 * If we already have the stripe MD locally then we don't request it in
523 * md_open(), by passing a lmm_size = 0.
525 * It is up to the application to ensure no other processes open this file
526 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
527 * used. We might be able to avoid races of that sort by getting lli_open_sem
528 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
529 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Flow (NOTE(review): many branch bodies,
 * error labels and the retry/restart loop appear elided in this
 * extract — confirm against the original):
 *  1. take the intent stashed in file->private_data (dcache open path);
 *  2. claim statahead ownership for directories; fast-path root inode;
 *  3. without a pre-existing intent, synthesize an IT_OPEN intent from
 *     f_flags (FMODE conversion, OWNEROVERRIDE, drop O_EXCL, IT_CREAT);
 *  4. reuse an existing per-mode MDS open handle when present, else do
 *     the MDS intent open outside lli_och_mutex (deadlock avoidance)
 *     and record the new handle;
 *  5. ll_local_open() attaches fd; delayed object creation is honoured
 *     for O_LOV_DELAY_CREATE / read-only opens.
 */
531 int ll_file_open(struct inode *inode, struct file *file)
533 struct ll_inode_info *lli = ll_i2info(inode);
534 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
535 .it_flags = file->f_flags };
536 struct obd_client_handle **och_p = NULL;
537 __u64 *och_usecount = NULL;
538 struct ll_file_data *fd;
539 int rc = 0, opendir_set = 0;
541 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
542 inode->i_generation, inode, file->f_flags);
544 it = file->private_data; /* XXX: compat macro */
545 file->private_data = NULL; /* prevent ll_local_open assertion */
547 fd = ll_file_data_get();
554 if (S_ISDIR(inode->i_mode)) {
555 spin_lock(&lli->lli_sa_lock);
556 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
557 lli->lli_opendir_pid == 0) {
558 lli->lli_opendir_key = fd;
559 lli->lli_opendir_pid = current_pid();
562 spin_unlock(&lli->lli_sa_lock);
565 if (is_root_inode(inode)) {
566 LUSTRE_FPRIVATE(file) = fd;
570 if (!it || !it->d.lustre.it_disposition) {
571 /* Convert f_flags into access mode. We cannot use file->f_mode,
572 * because everything but O_ACCMODE mask was stripped from
574 if ((oit.it_flags + 1) & O_ACCMODE)
576 if (file->f_flags & O_TRUNC)
577 oit.it_flags |= FMODE_WRITE;
579 /* kernel only call f_op->open in dentry_open. filp_open calls
580 * dentry_open after call to open_namei that checks permissions.
581 * Only nfsd_open call dentry_open directly without checking
582 * permissions and because of that this code below is safe. */
583 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
584 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
586 /* We do not want O_EXCL here, presumably we opened the file
587 * already? XXX - NFS implications? */
588 oit.it_flags &= ~O_EXCL;
590 /* bug20584, if "it_flags" contains O_CREAT, the file will be
591 * created if necessary, then "IT_CREAT" should be set to keep
592 * consistent with it */
593 if (oit.it_flags & O_CREAT)
594 oit.it_op |= IT_CREAT;
600 /* Let's see if we have file open on MDS already. */
601 if (it->it_flags & FMODE_WRITE) {
602 och_p = &lli->lli_mds_write_och;
603 och_usecount = &lli->lli_open_fd_write_count;
604 } else if (it->it_flags & FMODE_EXEC) {
605 och_p = &lli->lli_mds_exec_och;
606 och_usecount = &lli->lli_open_fd_exec_count;
608 och_p = &lli->lli_mds_read_och;
609 och_usecount = &lli->lli_open_fd_read_count;
612 mutex_lock(&lli->lli_och_mutex);
613 if (*och_p) { /* Open handle is present */
614 if (it_disposition(it, DISP_OPEN_OPEN)) {
615 /* Well, there's extra open request that we do not need,
616 let's close it somehow. This will decref request. */
617 rc = it_open_error(DISP_OPEN_OPEN, it);
619 mutex_unlock(&lli->lli_och_mutex);
623 ll_release_openhandle(inode, it);
627 rc = ll_local_open(file, it, fd, NULL);
630 mutex_unlock(&lli->lli_och_mutex);
634 LASSERT(*och_usecount == 0);
635 if (!it->d.lustre.it_disposition) {
636 /* We cannot just request lock handle now, new ELC code
637 means that one of other OPEN locks for this file
638 could be cancelled, and since blocking ast handler
639 would attempt to grab och_mutex as well, that would
640 result in a deadlock */
641 mutex_unlock(&lli->lli_och_mutex);
642 it->it_create_mode |= M_CHECK_STALE;
643 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
644 it->it_create_mode &= ~M_CHECK_STALE;
650 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
658 /* md_intent_lock() didn't get a request ref if there was an
659 * open error, so don't do cleanup on the request here
661 /* XXX (green): Should not we bail out on any error here, not
662 * just open error? */
663 rc = it_open_error(DISP_OPEN_OPEN, it);
667 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
669 rc = ll_local_open(file, it, fd, *och_p);
673 mutex_unlock(&lli->lli_och_mutex);
676 /* Must do this outside lli_och_mutex lock to prevent deadlock where
677 different kind of OPEN lock for this same inode gets cancelled
678 by ldlm_cancel_lru */
679 if (!S_ISREG(inode->i_mode))
684 if (!lli->lli_has_smd &&
685 (cl_is_lov_delay_create(file->f_flags) ||
686 (file->f_mode & FMODE_WRITE) == 0)) {
687 CDEBUG(D_INODE, "object creation was delayed\n");
690 cl_lov_delay_create_clear(&file->f_flags);
695 if (och_p && *och_p) {
700 mutex_unlock(&lli->lli_och_mutex);
703 if (opendir_set != 0)
704 ll_stop_statahead(inode, lli->lli_opendir_key);
705 ll_file_data_put(fd);
707 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
710 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
711 ptlrpc_req_finished(it->d.lustre.it_data);
712 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * LDLM blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the
 * lease lock asynchronously (the lease is considered broken).
 * NOTE(review): the switch statement wrapper, LDLM_CB_CANCELING body
 * and return appear elided in this extract.
 */
718 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
719 struct ldlm_lock_desc *desc, void *data, int flag)
722 struct lustre_handle lockh;
725 case LDLM_CB_BLOCKING:
726 ldlm_lock2handle(lock, &lockh);
727 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
729 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
733 case LDLM_CB_CANCELING:
/*
 * Acquire an open lease on @inode for mode @fmode (read or write only).
 * When called with an existing @file, the current open handle is handed
 * over to the MDT (op_handle = old handle) so the lease is recognised as
 * coming from the same owner; this requires the caller to be the sole
 * opener.  The lease lock is enqueued with LDLM_FL_NO_LRU | LDLM_FL_EXCL
 * and ll_md_blocking_lease_ast as its blocking callback.
 * Returns the new obd_client_handle, or ERR_PTR on failure (the error
 * path closes the open handle and cancels the open lock).
 * NOTE(review): multiple branch bodies, gotos and local declarations
 * (rc, rc2, och_usecount) appear elided in this extract.
 */
741 * Acquire a lease and open the file.
743 static struct obd_client_handle *
744 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
747 struct lookup_intent it = { .it_op = IT_OPEN };
748 struct ll_sb_info *sbi = ll_i2sbi(inode);
749 struct md_op_data *op_data;
750 struct ptlrpc_request *req;
751 struct lustre_handle old_handle = { 0 };
752 struct obd_client_handle *och = NULL;
756 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
757 return ERR_PTR(-EINVAL);
760 struct ll_inode_info *lli = ll_i2info(inode);
761 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
762 struct obd_client_handle **och_p;
765 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
766 return ERR_PTR(-EPERM);
768 /* Get the openhandle of the file */
770 mutex_lock(&lli->lli_och_mutex);
771 if (fd->fd_lease_och != NULL) {
772 mutex_unlock(&lli->lli_och_mutex);
776 if (fd->fd_och == NULL) {
777 if (file->f_mode & FMODE_WRITE) {
778 LASSERT(lli->lli_mds_write_och != NULL);
779 och_p = &lli->lli_mds_write_och;
780 och_usecount = &lli->lli_open_fd_write_count;
782 LASSERT(lli->lli_mds_read_och != NULL);
783 och_p = &lli->lli_mds_read_och;
784 och_usecount = &lli->lli_open_fd_read_count;
786 if (*och_usecount == 1) {
793 mutex_unlock(&lli->lli_och_mutex);
794 if (rc < 0) /* more than 1 opener */
797 LASSERT(fd->fd_och != NULL);
798 old_handle = fd->fd_och->och_fh;
801 och = kzalloc(sizeof(*och), GFP_NOFS);
803 return ERR_PTR(-ENOMEM);
805 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
806 LUSTRE_OPC_ANY, NULL);
807 if (IS_ERR(op_data)) {
808 rc = PTR_ERR(op_data);
812 /* To tell the MDT this openhandle is from the same owner */
813 op_data->op_handle = old_handle;
815 it.it_flags = fmode | open_flags;
816 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
817 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
818 ll_md_blocking_lease_ast,
819 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
820 * it can be cancelled which may mislead applications that the lease is
822 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
823 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
824 * doesn't deal with openhandle, so normal openhandle will be leaked. */
825 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
826 ll_finish_md_op_data(op_data);
827 ptlrpc_req_finished(req);
831 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
836 rc = it_open_error(DISP_OPEN_OPEN, &it);
840 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
841 ll_och_fill(sbi->ll_md_exp, &it, och);
843 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
848 /* already get lease, handle lease lock */
849 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
850 if (it.d.lustre.it_lock_mode == 0 ||
851 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
852 /* open lock must return for lease */
853 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
854 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
855 it.d.lustre.it_lock_bits);
860 ll_intent_release(&it);
/* Error path: close the openhandle obtained above, then drop the lock. */
864 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
866 CERROR("Close openhandle returned %d\n", rc2);
868 /* cancel open lock */
869 if (it.d.lustre.it_lock_mode != 0) {
870 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
871 it.d.lustre.it_lock_mode);
872 it.d.lustre.it_lock_mode = 0;
875 ll_intent_release(&it);
/*
 * Release a lease: determine whether the lease lock was already
 * cancelled (lease broken), cancel it ourselves otherwise, report the
 * broken state through @lease_broken, and close the open handle.
 * NOTE(review): the lock NULL-check and reference drop appear elided
 * in this extract.
 */
882 * Release lease and close the file.
883 * It will check if the lease has ever broken.
885 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
888 struct ldlm_lock *lock;
889 bool cancelled = true;
892 lock = ldlm_handle2lock(&och->och_lease_handle);
894 lock_res_and_lock(lock);
895 cancelled = ldlm_is_cancel(lock);
896 unlock_res_and_lock(lock);
900 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
901 PFID(&ll_i2info(inode)->lli_fid), cancelled);
904 ldlm_cli_cancel(&och->och_lease_handle, 0);
905 if (lease_broken != NULL)
906 *lease_broken = cancelled;
908 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
913 /* Fills the obdo with the attributes for the lsm */
/*
 * Issue an async OST getattr for @lsm and wait for completion, filling
 * @obdo with size/blocks/times/epoch/data-version.  When @sync is set,
 * OBD_FL_SRVLOCK requests a server-side lock for a coherent answer.
 * On success o_valid is masked down to the OST-authoritative fields.
 * NOTE(review): oi_oa allocation/assignment, the sync conditional and
 * error returns appear elided in this extract.
 */
914 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
915 struct obd_capa *capa, struct obdo *obdo,
916 __u64 ioepoch, int sync)
918 struct ptlrpc_request_set *set;
919 struct obd_info oinfo = { };
922 LASSERT(lsm != NULL);
926 oinfo.oi_oa->o_oi = lsm->lsm_oi;
927 oinfo.oi_oa->o_mode = S_IFREG;
928 oinfo.oi_oa->o_ioepoch = ioepoch;
929 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
930 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
931 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
932 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
933 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
934 OBD_MD_FLDATAVERSION;
935 oinfo.oi_capa = capa;
937 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
938 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
941 set = ptlrpc_prep_set();
943 CERROR("can't allocate ptlrpc set\n");
946 rc = obd_getattr_async(exp, &oinfo, set);
948 rc = ptlrpc_set_wait(set);
949 ptlrpc_set_destroy(set);
952 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
953 OBD_MD_FLATIME | OBD_MD_FLMTIME |
954 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
955 OBD_MD_FLDATAVERSION);
/*
 * Refresh @inode's attributes from the OSTs via ll_lsm_getattr() and
 * apply them with obdo_refresh_inode().  Holds LSM and capability
 * references for the duration.  NOTE(review): the success conditional
 * around the refresh, capa release and return appear elided here.
 */
960 * Performs the getattr on the inode and updates its fields.
961 * If @sync != 0, perform the getattr under the server-side lock.
963 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
964 __u64 ioepoch, int sync)
966 struct obd_capa *capa = ll_mdscapa_get(inode);
967 struct lov_stripe_md *lsm;
970 lsm = ccc_inode_lsm_get(inode);
971 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
972 capa, obdo, ioepoch, sync);
975 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
977 obdo_refresh_inode(inode, obdo, obdo->o_valid);
978 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
979 POSTID(oi), i_size_read(inode),
980 (unsigned long long)inode->i_blocks,
981 1UL << inode->i_blkbits);
983 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-sourced timestamps (cached in lli_lvb) with the attributes
 * held by the cl_object (OST-derived), taking the newest of each
 * timestamp, and write the merged size/blocks/times back to the inode
 * under the inode size lock.  NOTE(review): the lvb declaration and
 * the success conditional around the merge appear elided here.
 */
987 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
989 struct ll_inode_info *lli = ll_i2info(inode);
990 struct cl_object *obj = lli->lli_clob;
991 struct cl_attr *attr = ccc_env_thread_attr(env);
995 ll_inode_size_lock(inode);
996 /* merge timestamps the most recently obtained from mds with
997 timestamps obtained from osts */
998 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
999 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1002 lvb.lvb_size = i_size_read(inode);
1003 lvb.lvb_blocks = inode->i_blocks;
1004 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005 lvb.lvb_atime = LTIME_S(inode->i_atime);
1006 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1008 cl_object_attr_lock(obj);
1009 rc = cl_object_attr_get(env, obj, attr);
1010 cl_object_attr_unlock(obj);
/* Keep the newest of each timestamp from either source. */
1013 if (lvb.lvb_atime < attr->cat_atime)
1014 lvb.lvb_atime = attr->cat_atime;
1015 if (lvb.lvb_ctime < attr->cat_ctime)
1016 lvb.lvb_ctime = attr->cat_ctime;
1017 if (lvb.lvb_mtime < attr->cat_mtime)
1018 lvb.lvb_mtime = attr->cat_mtime;
1020 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1021 PFID(&lli->lli_fid), attr->cat_size);
1022 cl_isize_write_nolock(inode, attr->cat_size);
1024 inode->i_blocks = attr->cat_blocks;
1026 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1030 ll_inode_size_unlock(inode);
/*
 * Fetch current OST attributes for @lsm (no server-side lock) and copy
 * size/blocks/times into the caller-supplied stat structure.
 * NOTE(review): the st parameter declaration and the success
 * conditional appear elided in this extract.
 */
1035 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1038 struct obdo obdo = { 0 };
1041 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1043 st->st_size = obdo.o_size;
1044 st->st_blocks = obdo.o_blocks;
1045 st->st_mtime = obdo.o_mtime;
1046 st->st_atime = obdo.o_atime;
1047 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for this open,
 * mirroring the checks done by the VFS in file_accessed()/touch_atime()
 * (O_NOATIME, S_NOATIME, mount/superblock noatime and nodiratime
 * flags, read-only mounts).  NOTE(review): the return statements for
 * each branch appear elided in this extract.
 */
1052 static bool file_is_noatime(const struct file *file)
1054 const struct vfsmount *mnt = file->f_path.mnt;
1055 const struct inode *inode = file_inode(file);
1057 /* Adapted from file_accessed() and touch_atime().*/
1058 if (file->f_flags & O_NOATIME)
1061 if (inode->i_flags & S_NOATIME)
1064 if (IS_NOATIME(inode))
1067 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1070 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1073 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialise a cl_io from the file's open flags: non-blocking, append
 * and sync-write behaviour, lock-request policy (never for nolock
 * files, mandatory for O_APPEND, otherwise "maybe"), and noatime.
 * NOTE(review): part of the wr_sync expression and the write-branch
 * conditional appear elided in this extract.
 */
1079 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1081 struct inode *inode = file_inode(file);
1083 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1085 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087 file->f_flags & O_DIRECT ||
1090 io->ci_obj = ll_i2info(inode)->lli_clob;
1091 io->ci_lockreq = CILR_MAYBE;
1092 if (ll_file_nolock(file)) {
1093 io->ci_lockreq = CILR_NEVER;
1094 io->ci_no_srvlock = 1;
1095 } else if (file->f_flags & O_APPEND) {
1096 io->ci_lockreq = CILR_MANDATORY;
1099 io->ci_noatime = file_is_noatime(file);
/*
 * Common driver for read/write/splice IO through the cl_io machinery.
 * Sets up the cl_io, wires in the iter/iocb or splice arguments per
 * IO subtype, takes lli_write_mutex for non-grouplock writes (or
 * lli_trunc_sem read-side for normal reads), runs cl_io_loop(), and
 * converts the outcome into a byte count / errno while updating *ppos
 * and per-mount read/write statistics.  A zero-progress restartable IO
 * is retried rather than returned short.  NOTE(review): the io/result
 * declarations, restart loop structure, switch case labels and several
 * closing braces appear elided in this extract.
 */
1103 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104 struct file *file, enum cl_io_type iot,
1105 loff_t *ppos, size_t count)
1107 struct ll_inode_info *lli = ll_i2info(file_inode(file));
1108 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1113 io = ccc_env_thread_io(env);
1114 ll_io_init(io, file, iot == CIT_WRITE);
1116 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117 struct vvp_io *vio = vvp_env_io(env);
1118 struct ccc_io *cio = ccc_env_io(env);
1119 int write_mutex_locked = 0;
1121 cio->cui_fd = LUSTRE_FPRIVATE(file);
1122 vio->cui_io_subtype = args->via_io_subtype;
1124 switch (vio->cui_io_subtype) {
1126 cio->cui_iter = args->u.normal.via_iter;
1127 cio->cui_iocb = args->u.normal.via_iocb;
1128 if ((iot == CIT_WRITE) &&
1129 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130 if (mutex_lock_interruptible(&lli->
1132 result = -ERESTARTSYS;
1135 write_mutex_locked = 1;
1136 } else if (iot == CIT_READ) {
1137 down_read(&lli->lli_trunc_sem);
1141 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142 vio->u.splice.cui_flags = args->u.splice.via_flags;
1145 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1148 result = cl_io_loop(env, io);
1149 if (write_mutex_locked)
1150 mutex_unlock(&lli->lli_write_mutex);
1151 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152 up_read(&lli->lli_trunc_sem);
1154 /* cl_io_rw_init() handled IO */
1155 result = io->ci_result;
1158 if (io->ci_nob > 0) {
1159 result = io->ci_nob;
1160 *ppos = io->u.ci_wr.wr.crw_pos;
1164 cl_io_fini(env, io);
1165 /* If any bit been read/written (result != 0), we just return
1166 * short read/write instead of restart io. */
1167 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1168 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1169 iot == CIT_READ ? "read" : "write",
1170 file, *ppos, count);
1171 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1175 if (iot == CIT_READ) {
1177 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1178 LPROC_LL_READ_BYTES, result);
1179 } else if (iot == CIT_WRITE) {
1181 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1182 LPROC_LL_WRITE_BYTES, result);
1183 fd->fd_write_failed = false;
1184 } else if (result != -ERESTARTSYS) {
1185 fd->fd_write_failed = true;
/*
 * ->read_iter(): stage normal-IO args in the per-thread lu_env and
 * delegate to ll_file_io_generic() with CIT_READ.
 * NOTE(review): env/result/refcheck declarations and the final return
 * appear elided in this extract.
 */
1192 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1195 struct vvp_io_args *args;
1199 env = cl_env_get(&refcheck);
1201 return PTR_ERR(env);
1203 args = vvp_env_args(env, IO_NORMAL);
1204 args->u.normal.via_iter = to;
1205 args->u.normal.via_iocb = iocb;
1207 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1208 &iocb->ki_pos, iov_iter_count(to));
1209 cl_env_put(env, &refcheck);
/*
 * ->write_iter(): same staging as ll_file_read_iter() but with
 * CIT_WRITE.  NOTE(review): env/result/refcheck declarations and the
 * final return appear elided in this extract.
 */
1214 * Write to a file (through the page cache).
1216 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1219 struct vvp_io_args *args;
1223 env = cl_env_get(&refcheck);
1225 return PTR_ERR(env);
1227 args = vvp_env_args(env, IO_NORMAL);
1228 args->u.normal.via_iter = from;
1229 args->u.normal.via_iocb = iocb;
1231 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232 &iocb->ki_pos, iov_iter_count(from));
1233 cl_env_put(env, &refcheck);
/*
 * ->splice_read(): stage splice-IO args (pipe + flags) and delegate to
 * ll_file_io_generic() with CIT_READ.  NOTE(review): env/result/
 * refcheck declarations and the final return appear elided here.
 */
1238 * Send file content (through pagecache) somewhere with helper
1240 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241 struct pipe_inode_info *pipe, size_t count,
1245 struct vvp_io_args *args;
1249 env = cl_env_get(&refcheck);
1251 return PTR_ERR(env);
1253 args = vvp_env_args(env, IO_SPLICE);
1254 args->u.splice.via_pipe = pipe;
1255 args->u.splice.via_flags = flags;
1257 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258 cl_env_put(env, &refcheck);
/*
 * Ask the OSTs to re-create the objects backing this inode's stripes.
 * Builds a duplicate of the current lov_stripe_md, marks the obdo with
 * OBD_FL_RECREATE_OBJS and the target OST index (carried in o_nlink),
 * and issues obd_create() under the inode size lock.
 * NOTE(review): allocation-failure and error paths are elided from this
 * view.
 */
1262 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1264 struct obd_export *exp = ll_i2dtexp(inode);
1265 struct obd_trans_info oti = { 0 };
1266 struct obdo *oa = NULL;
1269 struct lov_stripe_md *lsm = NULL, *lsm2;
1275 lsm = ccc_inode_lsm_get(inode);
1276 if (!lsm_has_objects(lsm)) {
/* size of lsm plus one lov_oinfo per stripe for the duplicate */
1281 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282 (lsm->lsm_stripe_count));
1284 lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
/* o_nlink is reused here to carry the target OST index */
1291 oa->o_nlink = ost_idx;
1292 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297 memcpy(lsm2, lsm, lsm_size);
1298 ll_inode_size_lock(inode);
1299 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300 ll_inode_size_unlock(inode);
1305 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ handler: copy a ll_recreate_obj request from
 * userspace (admin-only) and recreate the object with the given id on
 * the given OST index via ll_lov_recreate().
 */
1310 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1312 struct ll_recreate_obj ucreat;
1315 if (!capable(CFS_CAP_SYS_ADMIN))
1318 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1322 ostid_set_seq_mdt0(&oi);
1323 ostid_set_id(&oi, ucreat.lrc_id);
1324 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID handler: like ll_lov_recreate_obj() but the
 * request is a lu_fid; the OST index is extracted from bits 16..31 of
 * the FID sequence.  Admin-only.
 */
1327 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1333 if (!capable(CFS_CAP_SYS_ADMIN))
1336 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1339 fid_to_ostid(&fid, &oi);
1340 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1341 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Apply striping information (a lov_user_md EA) to an inode by
 * replaying an IT_OPEN intent with the layout attached.  Fails if the
 * inode already has a layout (stripe can only be set once).  The open
 * handle obtained by the intent is released immediately; the intent
 * request is freed at the end.
 */
1344 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1345 int flags, struct lov_user_md *lum, int lum_size)
1347 struct lov_stripe_md *lsm = NULL;
1348 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1351 lsm = ccc_inode_lsm_get(inode);
1353 ccc_inode_lsm_put(inode, lsm);
1354 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1360 ll_inode_size_lock(inode);
1361 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1364 rc = oit.d.lustre.it_status;
/* the open handle created by the intent is not needed — close it */
1368 ll_release_openhandle(inode, &oit);
1371 ll_inode_size_unlock(inode);
1372 ll_intent_release(&oit);
1373 ccc_inode_lsm_put(inode, lsm);
1377 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) of @filename, a child of
 * @inode, from the MDS via md_getattr_name().  On success *lmmp points
 * into the reply buffer (caller keeps *request alive and frees it),
 * and *lmm_size is the EA size.  If the host is little-endian-mismatched
 * with the wire format (LOV_MAGIC), the EA is byte-swapped in place to
 * host endianness for userspace.
 */
1381 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382 struct lov_mds_md **lmmp, int *lmm_size,
1383 struct ptlrpc_request **request)
1385 struct ll_sb_info *sbi = ll_i2sbi(inode);
1386 struct mdt_body *body;
1387 struct lov_mds_md *lmm = NULL;
1388 struct ptlrpc_request *req = NULL;
1389 struct md_op_data *op_data;
1392 rc = ll_get_default_mdsize(sbi, &lmmsize);
1396 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397 strlen(filename), lmmsize,
1398 LUSTRE_OPC_ANY, NULL);
1399 if (IS_ERR(op_data))
1400 return PTR_ERR(op_data);
1402 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404 ll_finish_md_op_data(op_data);
1406 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1411 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1414 lmmsize = body->eadatasize;
1416 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1422 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423 LASSERT(lmm != NULL);
/* only v1 and v3 layouts are understood here */
1425 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1432 * This is coming from the MDS, so is probably in
1433 * little endian. We convert it to host endian before
1434 * passing it to userspace.
/* swab needed only when host LOV_MAGIC differs from LE encoding */
1436 if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
1439 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1443 /* if function called for directory - we should
1444 * avoid swab not existent lsm objects */
1445 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447 if (S_ISREG(body->mode))
1448 lustre_swab_lov_user_md_objects(
1449 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1451 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453 if (S_ISREG(body->mode))
1454 lustre_swab_lov_user_md_objects(
1455 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1462 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler (admin-only): copy a lov_user_md with one
 * trailing lov_user_ost_data from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.  Clears the
 * delay-create flag on the file afterwards.
 */
1467 static int ll_lov_setea(struct inode *inode, struct file *file,
1470 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471 struct lov_user_md *lump;
1472 int lum_size = sizeof(struct lov_user_md) +
1473 sizeof(struct lov_user_ost_data);
1476 if (!capable(CFS_CAP_SYS_ADMIN))
1479 lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1483 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1488 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1490 cl_lov_delay_create_clear(&file->f_flags);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout (first as v1, and
 * re-copy as v3 if the magic says so), apply it with
 * ll_lov_setstripe_ea_info(), then refresh the layout generation and
 * echo the resulting stripe info back to userspace via the
 * LL_IOC_LOV_GETSTRIPE obd_iocontrol path.
 */
1496 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1499 struct lov_user_md_v3 lumv3;
1500 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1502 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1504 int flags = FMODE_WRITE;
1506 /* first try with v1 which is smaller than v3 */
1507 lum_size = sizeof(struct lov_user_md_v1);
1508 if (copy_from_user(lumv1, lumv1p, lum_size))
1511 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512 lum_size = sizeof(struct lov_user_md_v3);
1513 if (copy_from_user(&lumv3, lumv3p, lum_size))
1517 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1519 cl_lov_delay_create_clear(&file->f_flags);
1521 struct lov_stripe_md *lsm;
/* in this path userspace's stripe_count is reset before refresh */
1524 put_user(0, &lumv1p->lmm_stripe_count);
1526 ll_layout_refresh(inode, &gen);
1527 lsm = ccc_inode_lsm_get(inode);
1528 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529 0, lsm, (void *)arg);
1530 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE handler: hand the inode's current stripe_md to
 * the data export's obd_iocontrol, which copies the layout out to the
 * userspace buffer at @arg.
 */
1535 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1537 struct lov_stripe_md *lsm;
1540 lsm = ccc_inode_lsm_get(inode);
1542 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1544 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK handler: acquire a cluster-wide group lock with
 * group id @arg on behalf of this open file.  Rejects gid 0 and nolock
 * files, refuses if this fd already holds a group lock, takes the lock
 * via cl_get_grouplock() outside lli_lock, then re-checks under the
 * lock for a racing acquirer before recording it in the fd.
 */
1549 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1551 struct ll_inode_info *lli = ll_i2info(inode);
1552 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1553 struct ccc_grouplock grouplock;
1557 CWARN("group id for group lock must not be 0\n");
1561 if (ll_file_nolock(file))
1564 spin_lock(&lli->lli_lock);
1565 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1566 CWARN("group lock already existed with gid %lu\n",
1567 fd->fd_grouplock.cg_gid);
1568 spin_unlock(&lli->lli_lock);
1571 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1572 spin_unlock(&lli->lli_lock);
/* may block unless O_NONBLOCK was requested on the file */
1574 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1575 arg, (file->f_flags & O_NONBLOCK), &grouplock);
1579 spin_lock(&lli->lli_lock);
1580 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1581 spin_unlock(&lli->lli_lock);
1582 CERROR("another thread just won the race\n");
1583 cl_put_grouplock(&grouplock);
1587 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1588 fd->fd_grouplock = grouplock;
1589 spin_unlock(&lli->lli_lock);
1591 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock held by this fd.
 * Verifies under lli_lock that a lock is held and that its gid matches
 * @arg, detaches it from the fd, then drops it with cl_put_grouplock()
 * outside the spinlock.
 */
1595 static int ll_put_grouplock(struct inode *inode, struct file *file,
1598 struct ll_inode_info *lli = ll_i2info(inode);
1599 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1600 struct ccc_grouplock grouplock;
1602 spin_lock(&lli->lli_lock);
1603 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1604 spin_unlock(&lli->lli_lock);
1605 CWARN("no group lock held\n");
1608 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1610 if (fd->fd_grouplock.cg_gid != arg) {
1611 CWARN("group lock %lu doesn't match current id %lu\n",
1612 arg, fd->fd_grouplock.cg_gid);
1613 spin_unlock(&lli->lli_lock);
/* take a local copy so the lock can be dropped outside lli_lock */
1617 grouplock = fd->fd_grouplock;
1618 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1619 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1620 spin_unlock(&lli->lli_lock);
1622 cl_put_grouplock(&grouplock);
1623 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1628 * Close inode open handle
1630 * \param inode [in] inode in question
1631 * \param it [in,out] intent which contains open info and result
1634 * \retval <0 failure
/*
 * Releases the MDS open handle carried by an open intent: no-op for the
 * root inode or when the intent has no DISP_OPEN_OPEN disposition;
 * otherwise fills an obd_client_handle from the intent and closes it,
 * dropping the DISP_ENQ_OPEN_REF request reference if it was taken in
 * place of ll_file_open.
 */
1636 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1638 struct obd_client_handle *och;
1643 /* Root ? Do nothing. */
1644 if (is_root_inode(inode))
1647 /* No open handle to close? Move away */
1648 if (!it_disposition(it, DISP_OPEN_OPEN))
1651 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1653 och = kzalloc(sizeof(*och), GFP_NOFS);
1659 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1661 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1664 /* this one is in place of ll_file_open */
1665 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1666 ptlrpc_req_finished(it->d.lustre.it_data);
1667 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1673 * Get size for inode for which FIEMAP mapping is requested.
1674 * Make the FIEMAP get_info call and returns the result.
/*
 * Validates the fiemap flags (writing the compat mask back to the
 * caller when unsupported flags are present), honours FIEMAP_FLAG_SYNC
 * by flushing dirty pages, requires FIEMAP_FLAG_DEVICE_ORDER for
 * multi-stripe files, then forwards the request to the OSTs with
 * obd_get_info(KEY_FIEMAP).  A zero-size file short-circuits with zero
 * mapped extents.
 */
1676 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1679 struct obd_export *exp = ll_i2dtexp(inode);
1680 struct lov_stripe_md *lsm = NULL;
1681 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1682 __u32 vallen = num_bytes;
1685 /* Checks for fiemap flags */
1686 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1687 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1691 /* Check for FIEMAP_FLAG_SYNC */
1692 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1693 rc = filemap_fdatawrite(inode->i_mapping);
1698 lsm = ccc_inode_lsm_get(inode);
1702 /* If the stripe_count > 1 and the application does not understand
1703 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1705 if (lsm->lsm_stripe_count > 1 &&
1706 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1711 fm_key.oa.o_oi = lsm->lsm_oi;
1712 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
/* glimpse to get an up-to-date size if we appear to be empty */
1714 if (i_size_read(inode) == 0) {
1715 rc = ll_glimpse_size(inode);
1720 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1721 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1722 /* If filesize is 0, then there would be no objects for mapping */
1723 if (fm_key.oa.o_size == 0) {
1724 fiemap->fm_mapped_extents = 0;
1729 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1731 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1734 CERROR("obd_get_info failed: rc = %d\n", rc);
1737 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH handler: translate a FID into a path.  Permitted for
 * CAP_DAC_READ_SEARCH or when the mount has LL_SBI_USER_FID2PATH.
 * Reads gf_pathlen first to size the output buffer (capped at
 * PATH_MAX), copies the full request in, forwards it to the MDC via
 * obd_iocontrol(), and copies the filled getinfo_fid2path back out.
 */
1741 int ll_fid2path(struct inode *inode, void __user *arg)
1743 struct obd_export *exp = ll_i2mdexp(inode);
1744 const struct getinfo_fid2path __user *gfin = arg;
1745 struct getinfo_fid2path *gfout;
1750 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1751 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1754 /* Only need to get the buflen */
1755 if (get_user(pathlen, &gfin->gf_pathlen))
1758 if (pathlen > PATH_MAX)
1761 outsize = sizeof(*gfout) + pathlen;
1763 gfout = kzalloc(outsize, GFP_NOFS);
1767 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1772 /* Call mdc_iocontrol */
1773 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1777 if (copy_to_user(arg, gfout, outsize))
/*
 * FSFILT_IOC_FIEMAP handler: read fm_extent_count from the user buffer,
 * size-check it against SIZE_MAX overflow, allocate a kernel fiemap of
 * header + extents, copy the header (and the first extent, used for
 * continuation offset/device), run ll_do_fiemap(), and copy the header
 * plus mapped extents back to userspace.
 */
1785 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1787 struct ll_user_fiemap *fiemap_s;
1788 size_t num_bytes, ret_bytes;
1789 unsigned int extent_count;
1792 /* Get the extent count so we can calculate the size of
1793 * required fiemap buffer */
1794 if (get_user(extent_count,
1795 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* guard against size_t overflow in the allocation below */
1799 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1801 num_bytes = sizeof(*fiemap_s) + (extent_count *
1802 sizeof(struct ll_fiemap_extent));
1804 fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1805 if (fiemap_s == NULL)
1808 /* get the fiemap value */
1809 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1810 sizeof(*fiemap_s))) {
1815 /* If fm_extent_count is non-zero, read the first extent since
1816 * it is used to calculate end_offset and device from previous
1819 if (copy_from_user(&fiemap_s->fm_extents[0],
1820 (char __user *)arg + sizeof(*fiemap_s),
1821 sizeof(struct ll_fiemap_extent))) {
1827 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1831 ret_bytes = sizeof(struct ll_user_fiemap);
1833 if (extent_count != 0)
1834 ret_bytes += (fiemap_s->fm_mapped_extents *
1835 sizeof(struct ll_fiemap_extent));
1837 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1846 * Read the data_version for inode.
1848 * This value is computed using stripe object version on OST.
1849 * Version is computed using server side locking.
1851 * @param extent_lock Take extent lock. Not needed if a process is already
1852 * holding the OST object group locks.
/*
 * A file with no stripe objects reports version 0.  Otherwise queries
 * the OSTs via ll_lsm_getattr() and returns obdo->o_data_version when
 * OBD_MD_FLDATAVERSION is set in the reply.
 */
1854 int ll_data_version(struct inode *inode, __u64 *data_version,
1857 struct lov_stripe_md *lsm = NULL;
1858 struct ll_sb_info *sbi = ll_i2sbi(inode);
1859 struct obdo *obdo = NULL;
1862 /* If no stripe, we consider version is 0. */
1863 lsm = ccc_inode_lsm_get(inode);
1864 if (!lsm_has_objects(lsm)) {
1866 CDEBUG(D_INODE, "No object for inode\n");
1871 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1877 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1879 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1882 *data_version = obdo->o_data_version;
1887 ccc_inode_lsm_put(inode, lsm);
1892 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease with MDS_OPEN_RELEASE, grabs the latest
 * data_version and merges [am]time via ll_merge_lvb() in a nested
 * cl_env, then closes the open handle to perform the release on the
 * MDT.  The lease lock handle is intentionally kept until
 * mdc_hsm_release_pack() (see comment below).  On the cleanup path any
 * still-valid lease is closed.
 */
1894 int ll_hsm_release(struct inode *inode)
1896 struct cl_env_nest nest;
1898 struct obd_client_handle *och = NULL;
1899 __u64 data_version = 0;
1903 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1904 ll_get_fsname(inode->i_sb, NULL, 0),
1905 PFID(&ll_i2info(inode)->lli_fid));
1907 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1913 /* Grab latest data_version and [am]time values */
1914 rc = ll_data_version(inode, &data_version, 1);
1918 env = cl_env_nested_get(&nest);
1924 ll_merge_lvb(env, inode);
1925 cl_env_nested_put(&nest, env);
1927 /* Release the file.
1928 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1929 * we still need it to pack l_remote_handle to MDT. */
1930 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1936 if (och != NULL && !IS_ERR(och)) /* close the file */
1937 ll_lease_close(och, inode, NULL);
/*
 * Per-call state for ll_swap_layouts(): the two inodes being swapped,
 * saved iattrs used to restore mtime/atime afterwards, and the
 * data-version check flags/values (kept as bools so they can be
 * swap()ed alongside the inodes).
 */
1942 struct ll_swap_stack {
1943 struct iattr ia1, ia2;
1945 struct inode *inode1, *inode2;
1946 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the
 * layouts of two regular files on the same filesystem.  Both files must
 * be writable by the caller.  The pair is ordered by FID comparison so
 * concurrent swaps serialize consistently; an optional group lock (gid)
 * flushes dirty cache on both files; optional data-version checks abort
 * the swap if either file changed; mtime/atime are saved and restored
 * when the KEEP_* flags request it.  The actual swap is sent to the MDT
 * as an LL_IOC_LOV_SWAP_LAYOUTS obd_iocontrol carrying md_op_data.
 */
1949 static int ll_swap_layouts(struct file *file1, struct file *file2,
1950 struct lustre_swap_layouts *lsl)
1952 struct mdc_swap_layouts msl;
1953 struct md_op_data *op_data;
1956 struct ll_swap_stack *llss = NULL;
1959 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1963 llss->inode1 = file_inode(file1);
1964 llss->inode2 = file_inode(file2);
1966 if (!S_ISREG(llss->inode2->i_mode)) {
1971 if (inode_permission(llss->inode1, MAY_WRITE) ||
1972 inode_permission(llss->inode2, MAY_WRITE)) {
1977 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1982 /* we use 2 bool because it is easier to swap than 2 bits */
1983 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1984 llss->check_dv1 = true;
1986 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1987 llss->check_dv2 = true;
1989 /* we cannot use lsl->sl_dvX directly because we may swap them */
1990 llss->dv1 = lsl->sl_dv1;
1991 llss->dv2 = lsl->sl_dv2;
1993 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1994 if (rc == 0) /* same file, done! */ {
/* order by FID so two concurrent swaps cannot deadlock/diverge */
1999 if (rc < 0) { /* sequentialize it */
2000 swap(llss->inode1, llss->inode2);
2002 swap(llss->dv1, llss->dv2);
2003 swap(llss->check_dv1, llss->check_dv2);
2007 if (gid != 0) { /* application asks to flush dirty cache */
2008 rc = ll_get_grouplock(llss->inode1, file1, gid);
2012 rc = ll_get_grouplock(llss->inode2, file2, gid);
2014 ll_put_grouplock(llss->inode1, file1, gid);
2019 /* to be able to restore mtime and atime after swap
2020 * we need to first save them */
2022 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2023 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2024 llss->ia1.ia_atime = llss->inode1->i_atime;
2025 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2026 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2027 llss->ia2.ia_atime = llss->inode2->i_atime;
2028 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2031 /* ultimate check, before swapping the layouts we check if
2032 * dataversion has changed (if requested) */
2033 if (llss->check_dv1) {
2034 rc = ll_data_version(llss->inode1, &dv, 0);
2037 if (dv != llss->dv1) {
2043 if (llss->check_dv2) {
2044 rc = ll_data_version(llss->inode2, &dv, 0);
2047 if (dv != llss->dv2) {
2053 /* struct md_op_data is used to send the swap args to the mdt
2054 * only flags is missing, so we use struct mdc_swap_layouts
2055 * through the md_op_data->op_data */
2056 /* flags from user space have to be converted before they are send to
2057 * server, no flag is sent today, they are only used on the client */
2060 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2061 0, LUSTRE_OPC_ANY, &msl);
2062 if (IS_ERR(op_data)) {
2063 rc = PTR_ERR(op_data);
2067 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2068 sizeof(*op_data), op_data, NULL);
2069 ll_finish_md_op_data(op_data);
2073 ll_put_grouplock(llss->inode2, file2, gid);
2074 ll_put_grouplock(llss->inode1, file1, gid);
2077 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2081 /* clear useless flags */
2082 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2083 llss->ia1.ia_valid &= ~ATTR_MTIME;
2084 llss->ia2.ia_valid &= ~ATTR_MTIME;
2087 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2088 llss->ia1.ia_valid &= ~ATTR_ATIME;
2089 llss->ia2.ia_valid &= ~ATTR_ATIME;
2092 /* update time if requested */
/* note: ia2 (saved from inode2) is applied to file1 and vice versa,
 * because the layouts — and their times — have been exchanged */
2094 if (llss->ia2.ia_valid != 0) {
2095 mutex_lock(&llss->inode1->i_mutex);
2096 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2097 mutex_unlock(&llss->inode1->i_mutex);
2100 if (llss->ia1.ia_valid != 0) {
2103 mutex_lock(&llss->inode2->i_mutex);
2104 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2105 mutex_unlock(&llss->inode2->i_mutex);
/*
 * LL_IOC_HSM_STATE_SET backend: validate the requested HSM flag masks
 * (reject bits outside HSM_FLAGS_MASK; bits outside HSM_USER_MASK need
 * CAP_SYS_ADMIN) and the archive id range, then forward the
 * hsm_state_set to the MDT through obd_iocontrol().
 */
2116 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2118 struct md_op_data *op_data;
2121 /* Detect out-of range masks */
2122 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2125 /* Non-root users are forbidden to set or clear flags which are
2126 * NOT defined in HSM_USER_MASK. */
2127 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2128 !capable(CFS_CAP_SYS_ADMIN))
2131 /* Detect out-of range archive id */
2132 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2133 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2136 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2137 LUSTRE_OPC_ANY, hss);
2138 if (IS_ERR(op_data))
2139 return PTR_ERR(op_data);
2141 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2142 sizeof(*op_data), op_data, NULL);
2144 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT backend: register an existing archived copy as the
 * content of @inode.  Only regular files are accepted.  First marks the
 * file ARCHIVED|EXISTS|RELEASED with the given archive id via
 * ll_hsm_state_set(), then forces mode/uid/gid/size/mtime/atime from
 * the hsm_user_import through ll_setattr_raw() under i_mutex.
 */
2149 static int ll_hsm_import(struct inode *inode, struct file *file,
2150 struct hsm_user_import *hui)
2152 struct hsm_state_set *hss = NULL;
2153 struct iattr *attr = NULL;
2157 if (!S_ISREG(inode->i_mode))
2161 hss = kzalloc(sizeof(*hss), GFP_NOFS);
2165 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2166 hss->hss_archive_id = hui->hui_archive_id;
2167 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2168 rc = ll_hsm_state_set(inode, hss);
2172 attr = kzalloc(sizeof(*attr), GFP_NOFS);
/* only permission bits are honoured; the type is forced to S_IFREG */
2178 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2179 attr->ia_mode |= S_IFREG;
2180 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2181 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2182 attr->ia_size = hui->hui_size;
2183 attr->ia_mtime.tv_sec = hui->hui_mtime;
2184 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2185 attr->ia_atime.tv_sec = hui->hui_atime;
2186 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2188 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2189 ATTR_UID | ATTR_GID |
2190 ATTR_MTIME | ATTR_MTIME_SET |
2191 ATTR_ATIME | ATTR_ATIME_SET;
2193 mutex_lock(&inode->i_mutex);
2195 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2199 mutex_unlock(&inode->i_mutex);
/*
 * Main ->unlocked_ioctl dispatcher for regular files.  Handles the
 * Lustre-specific LL_IOC_* / FSFILT_IOC_* / OBD_IOC_* commands and, by
 * default, tries dynamically-registered handlers (ll_iocontrol_call)
 * before falling through to the data export's obd_iocontrol().
 * tty ioctls (type 'T'/'t') are explicitly rejected early.
 */
2208 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2210 struct inode *inode = file_inode(file);
2211 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2214 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2215 inode->i_generation, inode, cmd);
2216 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2218 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2219 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2223 case LL_IOC_GETFLAGS:
2224 /* Get the current value of the file flags */
2225 return put_user(fd->fd_flags, (int *)arg);
2226 case LL_IOC_SETFLAGS:
2227 case LL_IOC_CLRFLAGS:
2228 /* Set or clear specific file flags */
2229 /* XXX This probably needs checks to ensure the flags are
2230 * not abused, and to handle any flag side effects.
2232 if (get_user(flags, (int *) arg))
2235 if (cmd == LL_IOC_SETFLAGS) {
2236 if ((flags & LL_FILE_IGNORE_LOCK) &&
2237 !(file->f_flags & O_DIRECT)) {
2238 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2243 fd->fd_flags |= flags;
2245 fd->fd_flags &= ~flags;
2248 case LL_IOC_LOV_SETSTRIPE:
2249 return ll_lov_setstripe(inode, file, arg);
2250 case LL_IOC_LOV_SETEA:
2251 return ll_lov_setea(inode, file, arg);
2252 case LL_IOC_LOV_SWAP_LAYOUTS: {
2254 struct lustre_swap_layouts lsl;
2256 if (copy_from_user(&lsl, (char *)arg,
2257 sizeof(struct lustre_swap_layouts)))
/* both fds must be writable for a layout swap */
2260 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2263 file2 = fget(lsl.sl_fd);
2268 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2269 rc = ll_swap_layouts(file, file2, &lsl);
2273 case LL_IOC_LOV_GETSTRIPE:
2274 return ll_lov_getstripe(inode, arg);
2275 case LL_IOC_RECREATE_OBJ:
2276 return ll_lov_recreate_obj(inode, arg);
2277 case LL_IOC_RECREATE_FID:
2278 return ll_lov_recreate_fid(inode, arg);
2279 case FSFILT_IOC_FIEMAP:
2280 return ll_ioctl_fiemap(inode, arg);
2281 case FSFILT_IOC_GETFLAGS:
2282 case FSFILT_IOC_SETFLAGS:
2283 return ll_iocontrol(inode, file, cmd, arg);
2284 case FSFILT_IOC_GETVERSION_OLD:
2285 case FSFILT_IOC_GETVERSION:
2286 return put_user(inode->i_generation, (int *)arg);
2287 case LL_IOC_GROUP_LOCK:
2288 return ll_get_grouplock(inode, file, arg);
2289 case LL_IOC_GROUP_UNLOCK:
2290 return ll_put_grouplock(inode, file, arg);
2291 case IOC_OBD_STATFS:
2292 return ll_obd_statfs(inode, (void *)arg);
2294 /* We need to special case any other ioctls we want to handle,
2295 * to send them to the MDS/OST as appropriate and to properly
2296 * network encode the arg field.
2297 case FSFILT_IOC_SETVERSION_OLD:
2298 case FSFILT_IOC_SETVERSION:
2300 case LL_IOC_FLUSHCTX:
2301 return ll_flush_ctx(inode);
2302 case LL_IOC_PATH2FID: {
2303 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2304 sizeof(struct lu_fid)))
2309 case OBD_IOC_FID2PATH:
2310 return ll_fid2path(inode, (void *)arg);
2311 case LL_IOC_DATA_VERSION: {
2312 struct ioc_data_version idv;
2315 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2318 rc = ll_data_version(inode, &idv.idv_version,
2319 !(idv.idv_flags & LL_DV_NOFLUSH));
2321 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2327 case LL_IOC_GET_MDTIDX: {
2330 mdtidx = ll_get_mdt_idx(inode);
2334 if (put_user((int)mdtidx, (int *)arg))
2339 case OBD_IOC_GETDTNAME:
2340 case OBD_IOC_GETMDNAME:
2341 return ll_get_obd_name(inode, cmd, arg);
2342 case LL_IOC_HSM_STATE_GET: {
2343 struct md_op_data *op_data;
2344 struct hsm_user_state *hus;
2347 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2351 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2352 LUSTRE_OPC_ANY, hus);
2353 if (IS_ERR(op_data)) {
2355 return PTR_ERR(op_data);
2358 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2361 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2364 ll_finish_md_op_data(op_data);
2368 case LL_IOC_HSM_STATE_SET: {
2369 struct hsm_state_set *hss;
2372 hss = memdup_user((char *)arg, sizeof(*hss));
2374 return PTR_ERR(hss);
2376 rc = ll_hsm_state_set(inode, hss);
2381 case LL_IOC_HSM_ACTION: {
2382 struct md_op_data *op_data;
2383 struct hsm_current_action *hca;
2386 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2390 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2391 LUSTRE_OPC_ANY, hca);
2392 if (IS_ERR(op_data)) {
2394 return PTR_ERR(op_data);
2397 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2400 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2403 ll_finish_md_op_data(op_data);
2407 case LL_IOC_SET_LEASE: {
2408 struct ll_inode_info *lli = ll_i2info(inode);
2409 struct obd_client_handle *och = NULL;
/* lease mode must match the file's own open mode */
2415 if (!(file->f_mode & FMODE_WRITE))
2420 if (!(file->f_mode & FMODE_READ))
2425 mutex_lock(&lli->lli_och_mutex);
2426 if (fd->fd_lease_och != NULL) {
2427 och = fd->fd_lease_och;
2428 fd->fd_lease_och = NULL;
2430 mutex_unlock(&lli->lli_och_mutex);
2433 mode = och->och_flags &
2434 (FMODE_READ|FMODE_WRITE);
2435 rc = ll_lease_close(och, inode, &lease_broken);
2436 if (rc == 0 && lease_broken)
2442 /* return the type of lease or error */
2443 return rc < 0 ? rc : (int)mode;
2448 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2450 /* apply for lease */
2451 och = ll_lease_open(inode, file, mode, 0);
2453 return PTR_ERR(och);
2456 mutex_lock(&lli->lli_och_mutex);
2457 if (fd->fd_lease_och == NULL) {
2458 fd->fd_lease_och = och;
2461 mutex_unlock(&lli->lli_och_mutex);
2463 /* impossible now that only excl is supported for now */
2464 ll_lease_close(och, inode, &lease_broken);
2469 case LL_IOC_GET_LEASE: {
2470 struct ll_inode_info *lli = ll_i2info(inode);
2471 struct ldlm_lock *lock = NULL;
2474 mutex_lock(&lli->lli_och_mutex);
2475 if (fd->fd_lease_och != NULL) {
2476 struct obd_client_handle *och = fd->fd_lease_och;
2478 lock = ldlm_handle2lock(&och->och_lease_handle);
2480 lock_res_and_lock(lock);
2481 if (!ldlm_is_cancel(lock))
2482 rc = och->och_flags &
2483 (FMODE_READ | FMODE_WRITE);
2484 unlock_res_and_lock(lock);
2485 ldlm_lock_put(lock);
2488 mutex_unlock(&lli->lli_och_mutex);
2491 case LL_IOC_HSM_IMPORT: {
2492 struct hsm_user_import *hui;
2494 hui = memdup_user((void *)arg, sizeof(*hui));
2496 return PTR_ERR(hui);
2498 rc = ll_hsm_import(inode, file, hui);
/* default: try dynamic handlers, then forward to the OST export */
2506 if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
2510 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * ->llseek() handler.  For SEEK_END/SEEK_HOLE/SEEK_DATA the file size
 * must be current cluster-wide, so ll_glimpse_size() is called first;
 * the actual repositioning is done by generic_file_llseek_size() with
 * the Lustre per-file maximum offset and the glimpsed EOF.
 */
2517 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2519 struct inode *inode = file_inode(file);
2520 loff_t retval, eof = 0;
2522 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2523 (origin == SEEK_CUR) ? file->f_pos : 0);
2524 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2525 inode->i_ino, inode->i_generation, inode, retval, retval,
2527 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2529 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2530 retval = ll_glimpse_size(inode);
2533 eof = i_size_read(inode);
2536 retval = generic_file_llseek_size(file, offset, origin,
2537 ll_file_maxbytes(inode), eof);
/*
 * ->flush() handler (called on every close of a file descriptor).
 * Collects previously-recorded async writeback errors from the inode
 * and the cl-object, clearing them as they are read.  If the
 * application was already told about a write failure on this fd
 * (fd_write_failed), the error is not reported a second time.
 */
2541 static int ll_flush(struct file *file, fl_owner_t id)
2543 struct inode *inode = file_inode(file);
2544 struct ll_inode_info *lli = ll_i2info(inode);
2545 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2548 LASSERT(!S_ISDIR(inode->i_mode));
2550 /* catch async errors that were recorded back when async writeback
2551 * failed for pages in this mapping. */
2552 rc = lli->lli_async_rc;
2553 lli->lli_async_rc = 0;
2554 err = lov_read_and_clear_async_rc(lli->lli_clob);
2558 /* The application has been told write failure already.
2559 * Do not report failure again. */
2560 if (fd->fd_write_failed)
2562 return rc ? -EIO : 0;
2566 * Called to make sure a portion of file has been written out.
2567 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2569 * Return how many pages have been written.
/*
 * Builds a CIT_FSYNC cl_io over [start, end] with the given fsync mode
 * and runs it in a nested cl_env.  On success the result is the number
 * of pages written (fi_nr_written).  @ignore_layout skips the layout
 * validity check for internal callers.
 */
2571 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2572 enum cl_fsync_mode mode, int ignore_layout)
2574 struct cl_env_nest nest;
2577 struct obd_capa *capa = NULL;
2578 struct cl_fsync_io *fio;
2581 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2582 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2585 env = cl_env_nested_get(&nest);
2587 return PTR_ERR(env);
2589 capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2591 io = ccc_env_thread_io(env);
2592 io->ci_obj = cl_i2info(inode)->lli_clob;
2593 io->ci_ignore_layout = ignore_layout;
2595 /* initialize parameters for sync */
2596 fio = &io->u.ci_fsync;
2597 fio->fi_capa = capa;
2598 fio->fi_start = start;
2600 fio->fi_fid = ll_inode2fid(inode);
2601 fio->fi_mode = mode;
2602 fio->fi_nr_written = 0;
2604 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2605 result = cl_io_loop(env, io);
2607 result = io->ci_result;
2609 result = fio->fi_nr_written;
2610 cl_io_fini(env, io);
2611 cl_env_nested_put(&nest, env);
/*
 * ->fsync() handler.  Flushes the page-cache range, collects saved
 * async writeback errors, syncs metadata via md_sync() to the MDT
 * (with a capability if present), and for regular files performs a
 * full CL_FSYNC_ALL data sync on the OSTs, updating fd_write_failed
 * according to the outcome.  Runs under i_mutex.
 */
2618 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2620 struct inode *inode = file_inode(file);
2621 struct ll_inode_info *lli = ll_i2info(inode);
2622 struct ptlrpc_request *req;
2623 struct obd_capa *oc;
2626 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2627 inode->i_generation, inode);
2628 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2630 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2631 mutex_lock(&inode->i_mutex);
2633 /* catch async errors that were recorded back when async writeback
2634 * failed for pages in this mapping. */
2635 if (!S_ISDIR(inode->i_mode)) {
2636 err = lli->lli_async_rc;
2637 lli->lli_async_rc = 0;
2640 err = lov_read_and_clear_async_rc(lli->lli_clob);
2645 oc = ll_mdscapa_get(inode);
2646 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2652 ptlrpc_req_finished(req);
2654 if (S_ISREG(inode->i_mode)) {
2655 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2657 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2658 if (rc == 0 && err < 0)
2661 fd->fd_write_failed = true;
2663 fd->fd_write_failed = false;
2666 mutex_unlock(&inode->i_mutex);
/*
 * ->lock()/->flock() handler: maps a VFS file_lock onto an LDLM_FLOCK
 * enqueue to the MDT.  fl_type selects the ldlm mode (PR for read, PW
 * for write, NL to express unlock — see the long comment below); the
 * cmd selects blocking/non-blocking/test flags.  After a successful
 * server-side enqueue the local lock state is updated with
 * flock_lock_file_wait()/posix_lock_file_wait(); if the local update
 * fails, the server lock is rolled back with an NL enqueue.
 */
2671 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2673 struct inode *inode = file_inode(file);
2674 struct ll_sb_info *sbi = ll_i2sbi(inode);
2675 struct ldlm_enqueue_info einfo = {
2676 .ei_type = LDLM_FLOCK,
2677 .ei_cb_cp = ldlm_flock_completion_ast,
2678 .ei_cbdata = file_lock,
2680 struct md_op_data *op_data;
2681 struct lustre_handle lockh = {0};
2682 ldlm_policy_data_t flock = { {0} };
2687 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2688 inode->i_ino, file_lock);
2690 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2692 if (file_lock->fl_flags & FL_FLOCK)
2693 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2694 else if (!(file_lock->fl_flags & FL_POSIX))
2697 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2698 flock.l_flock.pid = file_lock->fl_pid;
2699 flock.l_flock.start = file_lock->fl_start;
2700 flock.l_flock.end = file_lock->fl_end;
2702 /* Somewhat ugly workaround for svc lockd.
2703 * lockd installs custom fl_lmops->lm_compare_owner that checks
2704 * for the fl_owner to be the same (which it always is on local node
2705 * I guess between lockd processes) and then compares pid.
2706 * As such we assign pid to the owner field to make it all work,
2707 * conflict with normal locks is unlikely since pid space and
2708 * pointer space for current->files are not intersecting */
2709 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2710 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2712 switch (file_lock->fl_type) {
2714 einfo.ei_mode = LCK_PR;
2717 /* An unlock request may or may not have any relation to
2718 * existing locks so we may not be able to pass a lock handle
2719 * via a normal ldlm_lock_cancel() request. The request may even
2720 * unlock a byte range in the middle of an existing lock. In
2721 * order to process an unlock request we need all of the same
2722 * information that is given with a normal read or write record
2723 * lock request. To avoid creating another ldlm unlock (cancel)
2724 * message we'll treat a LCK_NL flock request as an unlock. */
2725 einfo.ei_mode = LCK_NL;
2728 einfo.ei_mode = LCK_PW;
2731 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2732 file_lock->fl_type);
2747 flags = LDLM_FL_BLOCK_NOWAIT;
2753 flags = LDLM_FL_TEST_LOCK;
2754 /* Save the old mode so that if the mode in the lock changes we
2755 * can decrement the appropriate reader or writer refcount. */
2756 file_lock->fl_type = einfo.ei_mode;
2759 CERROR("unknown fcntl lock command: %d\n", cmd);
2763 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2764 LUSTRE_OPC_ANY, NULL);
2765 if (IS_ERR(op_data))
2766 return PTR_ERR(op_data);
2768 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2769 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2770 flock.l_flock.start, flock.l_flock.end);
2772 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2773 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* mirror the server-granted lock into the local VFS lock lists */
2775 if ((file_lock->fl_flags & FL_FLOCK) &&
2776 (rc == 0 || file_lock->fl_type == F_UNLCK))
2777 rc2 = flock_lock_file_wait(file, file_lock);
2778 if ((file_lock->fl_flags & FL_POSIX) &&
2779 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2780 !(flags & LDLM_FL_TEST_LOCK))
2781 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server lock with an NL enqueue */
2783 if (rc2 && file_lock->fl_type != F_UNLCK) {
2784 einfo.ei_mode = LCK_NL;
2785 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2786 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2790 ll_finish_md_op_data(op_data);
2796 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2802 * test if some locks matching bits and l_req_mode are acquired
2803 * - bits can be in different locks
2804 * - if found clear the common lock bits in *bits
2805 * - the bits not found, are kept in *bits
2807 * \param bits [IN,OUT] searched lock bits; bits found are cleared on return
2808 * \param l_req_mode [IN] searched lock mode
2809 * \retval boolean, true iff all bits are found
/*
 * Test whether already-granted MDS inodebits locks cover *bits in mode
 * l_req_mode (LCK_MINMODE accepts any of CR/CW/PR/PW).  Bits that are
 * matched get cleared from *bits; unmatched bits remain set.
 */
2811 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2813 struct lustre_handle lockh;
2814 ldlm_policy_data_t policy;
2815 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2816 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2824 fid = &ll_i2info(inode)->lli_fid;
2825 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2826 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not take a reference on the found lock. */
2828 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebit per iteration until all requested bits are found
 * or every bit position has been tried. */
2829 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2830 policy.l_inodebits.bits = *bits & (1 << i);
2831 if (policy.l_inodebits.bits == 0)
2834 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2835 &policy, mode, &lockh)) {
2836 struct ldlm_lock *lock;
2838 lock = ldlm_handle2lock(&lockh);
/* A matched lock may cover several bits at once; clear all of
 * the lock's bits from the caller's mask (expression split across
 * an elided line). */
2841 ~(lock->l_policy_data.l_inodebits.bits);
2842 LDLM_LOCK_PUT(lock);
2844 *bits &= ~policy.l_inodebits.bits;
/*
 * Match an already-granted MDS inodebits lock covering @bits and take a
 * reference on it (unlike ll_have_md_lock(), no LDLM_FL_TEST_LOCK is
 * passed).  Returns the matched mode with the handle in *lockh, or 0.
 * NOTE(review): the mode parameter line is elided in this view.
 */
2851 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2852 struct lustre_handle *lockh, __u64 flags,
2855 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2859 fid = &ll_i2info(inode)->lli_fid;
2860 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2862 rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
2863 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of a revalidate RPC: translate -ENOENT for an
 * already-unlinked inode into success, and log other failures (downgraded
 * to D_INFO for expected -EACCES/-EIDRM permission/capability errors).
 */
2868 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2870 /* Already unlinked. Just update nlink and return success */
2871 if (rc == -ENOENT) {
2873 /* This path cannot be hit for regular files unless in
2874 * case of obscure races, so no need to validate size.
2876 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2878 } else if (rc != 0) {
2879 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2880 "%s: revalidate FID "DFID" error: rc = %d\n",
2881 ll_get_fsname(inode->i_sb, NULL, 0),
2882 PFID(ll_inode2fid(inode)), rc);
/*
 * Refresh inode attributes from the MDS unless a local lock already
 * guarantees they are valid.  Two paths:
 *  - OBD_CONNECT_ATTRFID servers: intent getattr/lookup by FID
 *    (md_intent_lock), then finish revalidation and release locks;
 *  - otherwise: plain md_getattr(), but only when no cached inodebits
 *    lock covers @ibits.
 */
2888 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2890 struct inode *inode = d_inode(dentry);
2891 struct ptlrpc_request *req = NULL;
2892 struct obd_export *exp;
2895 LASSERT(inode != NULL);
2897 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2898 inode->i_ino, inode->i_generation, inode, dentry);
2900 exp = ll_i2mdexp(inode);
2902 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2903 * But under CMD case, it caused some lock issues, should be fixed
2904 * with new CMD ibits lock. See bug 12718 */
2905 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2906 struct lookup_intent oit = { .it_op = IT_GETATTR };
2907 struct md_op_data *op_data;
/* A pure LOOKUP-bit request only needs name validity, not attrs. */
2909 if (ibits == MDS_INODELOCK_LOOKUP)
2910 oit.it_op = IT_LOOKUP;
2912 /* Call getattr by fid, so do not provide name at all. */
2913 op_data = ll_prep_md_op_data(NULL, inode,
2915 LUSTRE_OPC_ANY, NULL);
2916 if (IS_ERR(op_data))
2917 return PTR_ERR(op_data);
2919 oit.it_create_mode |= M_CHECK_STALE;
2920 rc = md_intent_lock(exp, op_data, NULL, 0,
2921 /* we are not interested in name
2924 ll_md_blocking_ast, 0);
2925 ll_finish_md_op_data(op_data);
2926 oit.it_create_mode &= ~M_CHECK_STALE;
2928 rc = ll_inode_revalidate_fini(inode, rc);
2932 rc = ll_revalidate_it_finish(req, &oit, inode);
2934 ll_intent_release(&oit);
2938 /* Unlinked? Unhash dentry, so it is not picked up later by
2939 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2940 here to preserve get_cwd functionality on 2.6.
2942 if (!d_inode(dentry)->i_nlink)
2943 d_lustre_invalidate(dentry, 0);
2945 ll_lookup_finish_locks(&oit, inode);
/* No ATTRFID support: issue a getattr RPC, but only if we do not
 * already hold locks covering the requested ibits. */
2946 } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2947 struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
2948 u64 valid = OBD_MD_FLGETATTR;
2949 struct md_op_data *op_data;
/* Regular files also need striping (EA) data sized to the default
 * MDS EA size. */
2952 if (S_ISREG(inode->i_mode)) {
2953 rc = ll_get_default_mdsize(sbi, &ealen);
2956 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2959 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2960 0, ealen, LUSTRE_OPC_ANY,
2962 if (IS_ERR(op_data))
2963 return PTR_ERR(op_data);
2965 op_data->op_valid = valid;
2966 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2967 * capa for this inode. Because we only keep capas of dirs
2969 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2970 ll_finish_md_op_data(op_data);
2972 rc = ll_inode_revalidate_fini(inode, rc);
/* Apply the attributes from the getattr reply to the inode. */
2976 rc = ll_prep_inode(&inode, req, NULL, NULL);
2979 ptlrpc_req_finished(req);
/*
 * Revalidate attributes and then fix up size/times: non-regular files
 * take times straight from the cached lvb; regular files glimpse the
 * size from the OSTs unless an HSM restore is in progress.
 */
2983 static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2985 struct inode *inode = d_inode(dentry);
2988 rc = __ll_inode_revalidate(dentry, ibits);
2992 /* if object isn't regular file, don't validate size */
2993 if (!S_ISREG(inode->i_mode)) {
2994 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2995 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2996 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2998 /* In case of restore, the MDT has the right size and has
2999 * already send it back without granting the layout lock,
3000 * inode is up-to-date so glimpse is useless.
3001 * Also to glimpse we need the layout, in case of a running
3002 * restore the MDT holds the layout lock so the glimpse will
3003 * block up to the end of restore (getattr will block)
3005 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3006 rc = ll_glimpse_size(inode);
/*
 * VFS ->getattr: revalidate UPDATE|LOOKUP ibits from the MDS, then copy
 * the (now fresh) inode fields into *stat.
 */
3011 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3013 struct inode *inode = d_inode(de);
3014 struct ll_sb_info *sbi = ll_i2sbi(inode);
3015 struct ll_inode_info *lli = ll_i2info(inode);
3018 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3019 MDS_INODELOCK_LOOKUP);
3020 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3025 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace needs an inode number squeezed into 32 bits,
 * built from the FID; otherwise export the kernel inode number. */
3026 if (ll_need_32bit_api(sbi))
3027 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3029 stat->ino = inode->i_ino;
3030 stat->mode = inode->i_mode;
3031 stat->nlink = inode->i_nlink;
3032 stat->uid = inode->i_uid;
3033 stat->gid = inode->i_gid;
3034 stat->rdev = inode->i_rdev;
3035 stat->atime = inode->i_atime;
3036 stat->mtime = inode->i_mtime;
3037 stat->ctime = inode->i_ctime;
3038 stat->blksize = 1 << inode->i_blkbits;
3040 stat->size = i_size_read(inode);
3041 stat->blocks = inode->i_blocks;
/*
 * VFS ->fiemap: marshal the kernel fiemap_extent_info into a Lustre
 * ll_user_fiemap buffer, run ll_do_fiemap(), and copy flags/extents back.
 */
3046 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3047 __u64 start, __u64 len)
3051 struct ll_user_fiemap *fiemap;
3052 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): extent_count originates from userspace via the fiemap
 * ioctl; confirm the multiplication below cannot overflow (the VFS caps
 * fi_extents_max, but that is not visible from here). */
3054 num_bytes = sizeof(*fiemap) + (extent_count *
3055 sizeof(struct ll_fiemap_extent));
3056 fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
3061 fiemap->fm_flags = fieinfo->fi_flags;
3062 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3063 fiemap->fm_start = start;
3064 fiemap->fm_length = len;
/* Seed only the first extent from the caller (one extent's worth). */
3065 if (extent_count > 0)
3066 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3067 sizeof(struct ll_fiemap_extent));
3069 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy results back: flags, mapped-extent count, then the extents. */
3071 fieinfo->fi_flags = fiemap->fm_flags;
3072 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3073 if (extent_count > 0)
3074 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3075 fiemap->fm_mapped_extents *
3076 sizeof(struct ll_fiemap_extent));
/*
 * VFS ->get_acl: return a referenced copy of the cached POSIX ACL.
 * lli_lock protects lli_posix_acl while the refcount is taken.
 */
3082 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3084 struct ll_inode_info *lli = ll_i2info(inode);
3085 struct posix_acl *acl = NULL;
3087 spin_lock(&lli->lli_lock);
3088 /* VFS' acl_permission_check->check_acl will release the refcount */
3089 acl = posix_acl_dup(lli->lli_posix_acl);
3090 spin_unlock(&lli->lli_lock);
/*
 * VFS ->permission: refuse RCU-walk (MAY_NOT_BLOCK) since revalidation
 * may block; revalidate the root inode first; delegate remote-client
 * mounts to the server-side permission check, otherwise fall through to
 * generic_permission() on the cached mode bits.
 */
3096 int ll_inode_permission(struct inode *inode, int mask)
3100 #ifdef MAY_NOT_BLOCK
/* Called under RCU path-walk: we may sleep below, so bail out and let
 * the VFS retry in ref-walk mode (return line elided). */
3101 if (mask & MAY_NOT_BLOCK)
3105 /* as root inode are NOT getting validated in lookup operation,
3106 * need to do it before permission check. */
3108 if (is_root_inode(inode)) {
3109 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3110 MDS_INODELOCK_LOOKUP);
3115 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3116 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
/* Remote (cross-realm) clients must ask the MDS for permission. */
3118 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3119 return lustre_check_remote_perm(inode, mask);
3121 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3122 rc = generic_permission(inode, mask);
/* Default file_operations: no .flock/.lock methods are set, so the VFS
 * handles advisory locks purely locally (cluster-wide locking is only
 * wired up in ll_file_operations_flock below). */
3127 /* -o localflock - only provides locally consistent flock locks */
3128 struct file_operations ll_file_operations = {
3129 .read_iter = ll_file_read_iter,
3130 .write_iter = ll_file_write_iter,
3131 .unlocked_ioctl = ll_file_ioctl,
3132 .open = ll_file_open,
3133 .release = ll_file_release,
3134 .mmap = ll_file_mmap,
3135 .llseek = ll_file_seek,
3136 .splice_read = ll_file_splice_read,
/* file_operations for mounts with cluster-coherent locking: identical to
 * ll_file_operations except .flock/.lock route through ll_file_flock(),
 * which enqueues LDLM_FLOCK locks on the MDS. */
3141 struct file_operations ll_file_operations_flock = {
3142 .read_iter = ll_file_read_iter,
3143 .write_iter = ll_file_write_iter,
3144 .unlocked_ioctl = ll_file_ioctl,
3145 .open = ll_file_open,
3146 .release = ll_file_release,
3147 .mmap = ll_file_mmap,
3148 .llseek = ll_file_seek,
3149 .splice_read = ll_file_splice_read,
3152 .flock = ll_file_flock,
3153 .lock = ll_file_flock
3156 /* These are for -o noflock - to return ENOSYS on flock calls */
3157 struct file_operations ll_file_operations_noflock = {
3158 .read_iter = ll_file_read_iter,
3159 .write_iter = ll_file_write_iter,
3160 .unlocked_ioctl = ll_file_ioctl,
3161 .open = ll_file_open,
3162 .release = ll_file_release,
3163 .mmap = ll_file_mmap,
3164 .llseek = ll_file_seek,
3165 .splice_read = ll_file_splice_read,
/* Both lock entry points reject the request (see comment above). */
3168 .flock = ll_file_noflock,
3169 .lock = ll_file_noflock
/* inode_operations for regular files: attribute, xattr, ACL, permission
 * and fiemap handlers implemented in this file / llite. */
3172 struct inode_operations ll_file_inode_operations = {
3173 .setattr = ll_setattr,
3174 .getattr = ll_getattr,
3175 .permission = ll_inode_permission,
3176 .setxattr = ll_setxattr,
3177 .getxattr = ll_getxattr,
3178 .listxattr = ll_listxattr,
3179 .removexattr = ll_removexattr,
3180 .fiemap = ll_fiemap,
3181 .get_acl = ll_get_acl,
3184 /* dynamic ioctl number support routines */
/* Global registry of dynamically-registered ioctl handlers; ioc_sem
 * guards ioc_head (writers register/unregister, readers dispatch). */
3185 static struct llioc_ctl_data {
3186 struct rw_semaphore ioc_sem;
3187 struct list_head ioc_head;
3189 __RWSEM_INITIALIZER(llioc.ioc_sem),
3190 LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl numbers it serves.
 * (struct llioc_data opening line elided in this view.) */
3195 struct list_head iocd_list;
3196 unsigned int iocd_size;
3197 llioc_callback_t iocd_cb;
3198 unsigned int iocd_count;
/* Trailing variable-length command array; pre-C99 `[0]` idiom, a C99
 * flexible array member `[]` would be the modern spelling. */
3199 unsigned int iocd_cmd[0];
3202 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3205 struct llioc_data *in_data = NULL;
3207 if (cb == NULL || cmd == NULL ||
3208 count > LLIOC_MAX_CMD || count < 0)
3211 size = sizeof(*in_data) + count * sizeof(unsigned int);
3212 in_data = kzalloc(size, GFP_NOFS);
3216 memset(in_data, 0, sizeof(*in_data));
3217 in_data->iocd_size = size;
3218 in_data->iocd_cb = cb;
3219 in_data->iocd_count = count;
3220 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3222 down_write(&llioc.ioc_sem);
3223 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3224 up_write(&llioc.ioc_sem);
3228 EXPORT_SYMBOL(ll_iocontrol_register);
/*
 * Remove a handler previously registered with ll_iocontrol_register().
 * @magic is the cookie that function returned; if no matching entry is
 * found a warning is logged.  The list walk and removal happen under
 * the registry's write semaphore.
 */
3230 void ll_iocontrol_unregister(void *magic)
3232 struct llioc_data *tmp;
3237 down_write(&llioc.ioc_sem);
3238 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Matching-entry check elided in this view; on match, unlink and
 * return (freeing presumably happens on the elided lines -- confirm). */
3240 list_del(&tmp->iocd_list);
3241 up_write(&llioc.ioc_sem);
3247 up_write(&llioc.ioc_sem);
3249 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3251 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers.
 * Scans every registered entry (under the read semaphore) for a matching
 * cmd; the first callback returning LLIOC_STOP ends the scan.  The
 * callback's status is passed back through *rcp (elided line), with a
 * default of -EINVAL when no handler claims the cmd.
 */
3253 static enum llioc_iter
3254 ll_iocontrol_call(struct inode *inode, struct file *file,
3255 unsigned int cmd, unsigned long arg, int *rcp)
3257 enum llioc_iter ret = LLIOC_CONT;
3258 struct llioc_data *data;
3259 int rc = -EINVAL, i;
3261 down_read(&llioc.ioc_sem);
3262 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3263 for (i = 0; i < data->iocd_count; i++) {
3264 if (cmd != data->iocd_cmd[i])
3267 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3271 if (ret == LLIOC_STOP)
3274 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the cl_object stack via cl_conf_set()
 * inside a nested cl environment.  For OBJECT_CONF_SET the layout lock is
 * only allowed to match after the layout has been applied, to prevent
 * other threads from seeing a stale layout.
 */
3281 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3283 struct ll_inode_info *lli = ll_i2info(inode);
3284 struct cl_env_nest nest;
/* No cl_object yet (e.g. special files) -- nothing to configure. */
3288 if (lli->lli_clob == NULL)
3291 env = cl_env_nested_get(&nest);
3293 return PTR_ERR(env);
3295 result = cl_conf_set(env, lli->lli_clob, conf);
3296 cl_env_nested_put(&nest, env);
3298 if (conf->coc_opc == OBJECT_CONF_SET) {
3299 struct ldlm_lock *lock = conf->coc_lock;
3301 LASSERT(lock != NULL);
3302 LASSERT(ldlm_has_layout(lock));
3304 /* it can only be allowed to match after layout is
3305 * applied to inode otherwise false layout would be
3306 * seen. Applying layout should happen before dropping
3307 * the intent lock. */
3308 ldlm_lock_allow_match(lock);
3314 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3315 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3318 struct ll_sb_info *sbi = ll_i2sbi(inode);
3319 struct obd_capa *oc;
3320 struct ptlrpc_request *req;
3321 struct mdt_body *body;
3327 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3328 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3329 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already delivered in the lock's LVB -- nothing to fetch. */
3331 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3334 /* if layout lock was granted right away, the layout is returned
3335 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3336 * blocked and then granted via completion ast, we have to fetch
3337 * layout here. Please note that we can't use the LVB buffer in
3338 * completion AST because it doesn't have a large enough buffer */
3339 oc = ll_mdscapa_get(inode);
3340 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Fetch the LOV EA (striping/layout) by getxattr on the file's FID. */
3342 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3343 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3349 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3355 lmmsize = body->eadatasize;
3356 if (lmmsize == 0) /* empty layout */ {
3361 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
/* Copy the layout into a fresh buffer and install it as the lock's
 * LVB, replacing any previous (too small / stale) LVB under res lock. */
3367 lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
3368 if (lvbdata == NULL) {
3373 memcpy(lvbdata, lmm, lmmsize);
3374 lock_res_and_lock(lock);
3375 if (lock->l_lvb_data != NULL)
3376 kvfree(lock->l_lvb_data);
3378 lock->l_lvb_data = lvbdata;
3379 lock->l_lvb_len = lmmsize;
3380 unlock_res_and_lock(lock);
3383 ptlrpc_req_finished(req);
/* (Continuation of the function's header comment.) */
3388 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Ensure the layout carried by the (held) layout lock is applied to the
 * inode's cl_object; fills *gen with the resulting layout generation.
 * Fetches the layout via ll_layout_fetch() if the LVB is not ready,
 * unpacks it, configures the cl_object, then drops the lock reference.
 * If the object is still busy (-EBUSY) it waits for in-flight IO via an
 * OBJECT_CONF_WAIT configuration before retrying (retry lines elided).
 */
3391 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3392 struct inode *inode, __u32 *gen, bool reconf)
3394 struct ll_inode_info *lli = ll_i2info(inode);
3395 struct ll_sb_info *sbi = ll_i2sbi(inode);
3396 struct ldlm_lock *lock;
3397 struct lustre_md md = { NULL };
3398 struct cl_object_conf conf;
3401 bool wait_layout = false;
3403 LASSERT(lustre_handle_is_used(lockh));
3405 lock = ldlm_handle2lock(lockh);
3406 LASSERT(lock != NULL);
3407 LASSERT(ldlm_has_layout(lock));
3409 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3410 inode, PFID(&lli->lli_fid), reconf);
3412 /* in case this is a caching lock and reinstate with new inode */
3413 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3415 lock_res_and_lock(lock);
3416 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3417 unlock_res_and_lock(lock);
3418 /* checking lvb_ready is racy but this is okay. The worst case is
3419 * that multi processes may configure the file on the same time. */
3420 if (lvb_ready || !reconf) {
3423 /* layout_gen must be valid if layout lock is not
3424 * cancelled and stripe has already set */
3425 *gen = ll_layout_version_get(lli);
3431 rc = ll_layout_fetch(inode, lock);
3435 /* for layout lock, lmm is returned in lock's lvb.
3436 * lvb_data is immutable if the lock is held so it's safe to access it
3437 * without res lock. See the description in ldlm_lock_decref_internal()
3438 * for the condition to free lvb_data of layout lock */
3439 if (lock->l_lvb_data != NULL) {
3440 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3441 lock->l_lvb_data, lock->l_lvb_len);
/* rc==0 presumably means "no stripes": report the empty layout
 * generation; otherwise take the generation from the unpacked lsm
 * (branch lines elided -- confirm against full source). */
3443 *gen = LL_LAYOUT_GEN_EMPTY;
3445 *gen = md.lsm->lsm_layout_gen;
3448 CERROR("%s: file "DFID" unpackmd error: %d\n",
3449 ll_get_fsname(inode->i_sb, NULL, 0),
3450 PFID(&lli->lli_fid), rc);
3456 /* set layout to file. Unlikely this will fail as old layout was
3457 * surely eliminated */
3458 memset(&conf, 0, sizeof(conf));
3459 conf.coc_opc = OBJECT_CONF_SET;
3460 conf.coc_inode = inode;
3461 conf.coc_lock = lock;
3462 conf.u.coc_md = &md;
3463 rc = ll_layout_conf(inode, &conf);
3466 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3468 /* refresh layout failed, need to wait */
3469 wait_layout = rc == -EBUSY;
3472 LDLM_LOCK_PUT(lock);
3473 ldlm_lock_decref(lockh, mode);
3475 /* wait for IO to complete if it's still being used. */
3477 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3478 ll_get_fsname(inode->i_sb, NULL, 0),
3479 inode, PFID(&lli->lli_fid));
/* OBJECT_CONF_WAIT blocks until outstanding users of the old layout
 * are gone, then the caller can retry the configuration. */
3481 memset(&conf, 0, sizeof(conf));
3482 conf.coc_opc = OBJECT_CONF_WAIT;
3483 conf.coc_inode = inode;
3484 rc = ll_layout_conf(inode, &conf);
3488 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3489 PFID(&lli->lli_fid), rc);
3495 * This function checks if there exists a LAYOUT lock on the client side,
3496 * or enqueues it if it doesn't have one in cache.
3498 * This function will not hold layout lock so it may be revoked any time after
3499 this function returns. Any operation that depends on the layout should be redone
3502 * This function should be called before lov_io_init() to get an uptodate
3503 * layout version, the caller should save the version number and after IO
3504 * is finished, this function should be called again to verify that layout
3505 * is not changed during IO time.
/*
 * Return the current layout generation in *gen, refreshing the layout
 * lock if needed: first try to match a cached MDS layout lock; if none,
 * enqueue an IT_LAYOUT intent under lli_layout_mutex and apply the
 * resulting layout with ll_layout_lock_set().
 */
3507 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3509 struct ll_inode_info *lli = ll_i2info(inode);
3510 struct ll_sb_info *sbi = ll_i2sbi(inode);
3511 struct md_op_data *op_data;
3512 struct lookup_intent it;
3513 struct lustre_handle lockh;
3515 struct ldlm_enqueue_info einfo = {
3516 .ei_type = LDLM_IBITS,
3518 .ei_cb_bl = ll_md_blocking_ast,
3519 .ei_cb_cp = ldlm_completion_ast,
/* Fast path: layout locking disabled, or we already hold a valid
 * generation -- no RPC needed. */
3523 *gen = ll_layout_version_get(lli);
3524 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3528 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3529 LASSERT(S_ISREG(inode->i_mode));
3531 /* take layout lock mutex to enqueue layout lock exclusively. */
3532 mutex_lock(&lli->lli_layout_mutex);
3535 /* mostly layout lock is caching on the local side, so try to match
3536 * it before grabbing layout lock mutex. */
3537 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3538 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3539 if (mode != 0) { /* hit cached lock */
3540 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3544 mutex_unlock(&lli->lli_layout_mutex);
3548 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3549 0, 0, LUSTRE_OPC_ANY, NULL);
3550 if (IS_ERR(op_data)) {
3551 mutex_unlock(&lli->lli_layout_mutex);
3552 return PTR_ERR(op_data);
3555 /* have to enqueue one */
3556 memset(&it, 0, sizeof(it));
3557 it.it_op = IT_LAYOUT;
3558 lockh.cookie = 0ULL;
3560 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3561 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3562 PFID(&lli->lli_fid));
3564 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* Drop the intent's request/lock state; we keep only the lock handle
 * and mode for ll_layout_lock_set() below. */
3566 if (it.d.lustre.it_data != NULL)
3567 ptlrpc_req_finished(it.d.lustre.it_data);
3568 it.d.lustre.it_data = NULL;
3570 ll_finish_md_op_data(op_data);
3572 mode = it.d.lustre.it_lock_mode;
3573 it.d.lustre.it_lock_mode = 0;
3574 ll_intent_drop_lock(&it);
3577 /* set lock data in case this is a new lock */
3578 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3579 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3583 mutex_unlock(&lli->lli_layout_mutex);
3589 * This function sends a restore request to the MDT
3591 int ll_layout_restore(struct inode *inode)
3593 struct hsm_user_request *hur;
3596 len = sizeof(struct hsm_user_request) +
3597 sizeof(struct hsm_user_item);
3598 hur = kzalloc(len, GFP_NOFS);
3602 hur->hur_request.hr_action = HUA_RESTORE;
3603 hur->hur_request.hr_archive_id = 0;
3604 hur->hur_request.hr_flags = 0;
3605 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3606 sizeof(hur->hur_user_item[0].hui_fid));
3607 hur->hur_user_item[0].hui_extent.length = -1;
3608 hur->hur_request.hr_itemcount = 1;
3609 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,