4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2011, 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include "../include/lustre_dlm.h"
45 #include "../include/lustre_lite.h"
46 #include <linux/pagemap.h>
47 #include <linux/file.h>
48 #include "llite_internal.h"
49 #include "../include/lustre/ll_fiemap.h"
51 #include "../include/cl_object.h"
/* Forward declarations for helpers defined later in this file. */
54 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
56 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
59 static enum llioc_iter
60 ll_iocontrol_call(struct inode *inode, struct file *file,
61 unsigned int cmd, unsigned long arg, int *rcp);
/*
 * Allocate a per-open ll_file_data from the dedicated slab cache.
 * GFP_NOFS avoids re-entering the filesystem under memory pressure.
 * NOTE(review): the allocation-failure check and return are not visible
 * in this excerpt — confirm against the full source.
 */
63 static struct ll_file_data *ll_file_data_get(void)
65 struct ll_file_data *fd;
67 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
70 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache. */
74 static void ll_file_data_put(struct ll_file_data *fd)
77 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the inode's current attributes (fid, mode, a/m/ctime, size,
 * blocks, flags, ioepoch) plus the open handle @fh into @op_data
 * for a subsequent MDS request.
 */
80 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
81 struct lustre_handle *fh)
83 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
84 op_data->op_attr.ia_mode = inode->i_mode;
85 op_data->op_attr.ia_atime = inode->i_atime;
86 op_data->op_attr.ia_mtime = inode->i_mtime;
87 op_data->op_attr.ia_ctime = inode->i_ctime;
88 op_data->op_attr.ia_size = i_size_read(inode);
89 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast */
90 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
91 ll_inode_to_ext_flags(inode->i_flags);
92 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
94 op_data->op_handle = *fh;
/* tell the MDS on close that data was modified locally */
96 if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
97 op_data->op_bias |= MDS_DATA_MODIFIED;
101 * Closes the IO epoch and packs all the attributes into @op_data for
/*
 * the close RPC. Timestamps are always sent; size/blocks are only valid
 * when the handle had write access and Size-on-MDS is not in use.
 */
104 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
105 struct obd_client_handle *och)
107 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
108 ATTR_MTIME | ATTR_MTIME_SET |
109 ATTR_CTIME | ATTR_CTIME_SET;
111 if (!(och->och_flags & FMODE_WRITE))
/* no SOM support (or not a regular file): client-side size is authoritative */
114 if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
115 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
117 ll_ioepoch_close(inode, op_data, &och, 0);
120 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
121 ll_prep_md_op_data(op_data, inode, NULL, NULL,
122 0, 0, LUSTRE_OPC_ANY, NULL);
/*
 * Send an MDS_CLOSE for the open handle @och on @inode.
 *
 * If @data_version is non-NULL this close is an HSM release: the data
 * version and lease handle are packed so the MDT can validate the release.
 * On success the DATA_MODIFIED flag is cleared, destroyed OST objects are
 * cleaned up, and the open-handle cookie is poisoned with
 * DEAD_HANDLE_MAGIC. The request is always finished before return.
 */
125 static int ll_close_inode_openhandle(struct obd_export *md_exp,
127 struct obd_client_handle *och,
128 const __u64 *data_version)
130 struct obd_export *exp = ll_i2mdexp(inode);
131 struct md_op_data *op_data;
132 struct ptlrpc_request *req = NULL;
133 struct obd_device *obd = class_exp2obd(exp);
139 * XXX: in case of LMV, is this correct to access
142 CERROR("Invalid MDC connection handle %#llx\n",
143 ll_i2mdexp(inode)->exp_handle.h_cookie);
148 op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
150 /* XXX We leak openhandle and request here. */
155 ll_prepare_close(inode, op_data, och);
156 if (data_version != NULL) {
157 /* Pass in data_version implies release. */
158 op_data->op_bias |= MDS_HSM_RELEASE;
159 op_data->op_data_version = *data_version;
160 op_data->op_lease_handle = och->och_lease_handle;
161 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
163 epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
164 rc = md_close(md_exp, op_data, och->och_mod, &req);
166 /* This close must have the epoch closed. */
167 LASSERT(epoch_close);
168 /* MDS has instructed us to obtain Size-on-MDS attribute from
169 * OSTs and send setattr to back to MDS. */
170 rc = ll_som_update(inode, op_data);
172 CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
177 CERROR("inode %lu mdc close failed: rc = %d\n",
181 /* DATA_MODIFIED flag was successfully sent on close, cancel data
182 * modification flag. */
183 if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
184 struct ll_inode_info *lli = ll_i2info(inode);
186 spin_lock(&lli->lli_lock);
187 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
188 spin_unlock(&lli->lli_lock);
192 rc = ll_objects_destroy(req, inode);
194 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* HSM release: verify the MDT actually released the file */
197 if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
198 struct mdt_body *body;
200 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
201 if (!(body->valid & OBD_MD_FLRELEASED))
205 ll_finish_md_op_data(op_data);
/* SOM write handle still open: defer DONE_WRITING until epoch closes */
208 if (exp_connect_som(exp) && !epoch_close &&
209 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
210 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
212 md_clear_open_replay_data(md_exp, och);
213 /* Free @och if it is not waiting for DONE_WRITING. */
214 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
217 if (req) /* This is close request */
218 ptlrpc_req_finished(req);
/*
 * Drop one MDS open handle of the given mode (write/exec/read) for
 * @inode. If other users still reference the handle the close is
 * skipped; otherwise the handle is taken under lli_och_mutex and
 * closed via ll_close_inode_openhandle().
 */
222 int ll_md_real_close(struct inode *inode, fmode_t fmode)
224 struct ll_inode_info *lli = ll_i2info(inode);
225 struct obd_client_handle **och_p;
226 struct obd_client_handle *och;
/* pick the handle slot and use counter matching the open mode */
230 if (fmode & FMODE_WRITE) {
231 och_p = &lli->lli_mds_write_och;
232 och_usecount = &lli->lli_open_fd_write_count;
233 } else if (fmode & FMODE_EXEC) {
234 och_p = &lli->lli_mds_exec_och;
235 och_usecount = &lli->lli_open_fd_exec_count;
237 LASSERT(fmode & FMODE_READ);
238 och_p = &lli->lli_mds_read_och;
239 och_usecount = &lli->lli_open_fd_read_count;
242 mutex_lock(&lli->lli_och_mutex);
243 if (*och_usecount > 0) {
244 /* There are still users of this handle, so skip
246 mutex_unlock(&lli->lli_och_mutex);
252 mutex_unlock(&lli->lli_och_mutex);
255 /* There might be a race and this handle may already
257 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close path: release any group lock and lease,
 * drop this fd's reference on the per-mode open count, and only talk
 * to the MDS (ll_md_real_close) when no matching OPEN DLM lock is
 * cached locally. Finally detaches and frees the ll_file_data.
 */
264 static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
267 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
268 struct ll_inode_info *lli = ll_i2info(inode);
270 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
271 struct lustre_handle lockh;
272 ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
275 /* clear group lock, if present */
276 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
277 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
279 if (fd->fd_lease_och != NULL) {
282 /* Usually the lease is not released when the
283 * application crashed, we need to release here. */
284 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
285 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
286 PFID(&lli->lli_fid), rc, lease_broken);
288 fd->fd_lease_och = NULL;
/* close the openhandle that was pinned for a lease, if any */
291 if (fd->fd_och != NULL) {
292 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
297 /* Let's see if we have good enough OPEN lock on the file and if
298 we can skip talking to MDS */
300 mutex_lock(&lli->lli_och_mutex);
301 if (fd->fd_omode & FMODE_WRITE) {
303 LASSERT(lli->lli_open_fd_write_count);
304 lli->lli_open_fd_write_count--;
305 } else if (fd->fd_omode & FMODE_EXEC) {
307 LASSERT(lli->lli_open_fd_exec_count);
308 lli->lli_open_fd_exec_count--;
311 LASSERT(lli->lli_open_fd_read_count);
312 lli->lli_open_fd_read_count--;
314 mutex_unlock(&lli->lli_och_mutex);
/* no cached OPEN ibits lock: must do the real MDS close now */
316 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
317 LDLM_IBITS, &policy, lockmode, &lockh))
318 rc = ll_md_real_close(inode, fd->fd_omode);
321 LUSTRE_FPRIVATE(file) = NULL;
322 ll_file_data_put(fd);
327 /* While this returns an error code, fput() the caller does not, so we need
328 * to make every effort to clean up all of our state here. Also, applications
329 * rarely check close errors and even if an error is returned they will not
330 * re-try the close call.
/* VFS ->release() entry point: tears down remote-ACL state, statahead,
 * and the per-fd MD state via ll_md_close(). The root inode gets a
 * short-circuit path that only frees the ll_file_data. */
332 int ll_file_release(struct inode *inode, struct file *file)
334 struct ll_file_data *fd;
335 struct ll_sb_info *sbi = ll_i2sbi(inode);
336 struct ll_inode_info *lli = ll_i2info(inode);
339 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
340 inode->i_generation, inode);
342 #ifdef CONFIG_FS_POSIX_ACL
/* remote-client ACL bookkeeping is only kept on the root inode */
343 if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
347 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
348 fd->fd_flags &= ~LL_FILE_RMTACL;
349 rct_del(&sbi->ll_rct, current_pid());
350 et_search_free(&sbi->ll_et, current_pid());
355 if (!is_root_inode(inode))
356 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
357 fd = LUSTRE_FPRIVATE(file);
360 /* The last ref on @file, maybe not the owner pid of statahead.
361 * Different processes can open the same dir, "ll_opendir_key" means:
362 * it is me that should stop the statahead thread. */
363 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
364 lli->lli_opendir_pid != 0)
365 ll_stop_statahead(inode, lli->lli_opendir_key);
367 if (is_root_inode(inode)) {
368 LUSTRE_FPRIVATE(file) = NULL;
369 ll_file_data_put(fd);
/* regular files: fold async write errors into the inode before close */
373 if (!S_ISDIR(inode->i_mode)) {
374 lov_read_and_clear_async_rc(lli->lli_clob);
375 lli->lli_async_rc = 0;
378 rc = ll_md_close(sbi->ll_md_exp, inode, file);
380 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 libcfs_debug_dumplog();
/*
 * Perform the MDS intent-open for @dentry: builds md_op_data from the
 * parent directory, issues md_intent_lock() with an IT_OPEN intent, and
 * on success initializes the inode from the reply and attaches the DLM
 * lock data. @lmm/@lmmsize non-zero means we are only setting stripe
 * info, in which case no OPEN lock is requested.
 */
386 static int ll_intent_file_open(struct dentry *dentry, void *lmm,
387 int lmmsize, struct lookup_intent *itp)
389 struct inode *inode = d_inode(dentry);
390 struct ll_sb_info *sbi = ll_i2sbi(inode);
391 struct dentry *parent = dentry->d_parent;
392 const char *name = dentry->d_name.name;
393 const int len = dentry->d_name.len;
394 struct md_op_data *op_data;
395 struct ptlrpc_request *req;
396 __u32 opc = LUSTRE_OPC_ANY;
399 /* Usually we come here only for NFSD, and we want open lock.
400 But we can also get here with pre 2.6.15 patchless kernels, and in
401 that case that lock is also ok */
402 /* We can also get here if there was cached open handle in revalidate_it
403 * but it disappeared while we were getting from there to ll_file_open.
404 * But this means this file was closed and immediately opened which
405 * makes a good candidate for using OPEN lock */
406 /* If lmmsize & lmm are not 0, we are just setting stripe info
407 * parameters. No need for the open lock */
408 if (lmm == NULL && lmmsize == 0) {
409 itp->it_flags |= MDS_OPEN_LOCK;
410 if (itp->it_flags & FMODE_WRITE)
411 opc = LUSTRE_OPC_CREATE;
414 op_data = ll_prep_md_op_data(NULL, d_inode(parent),
418 return PTR_ERR(op_data);
420 itp->it_flags |= MDS_OPEN_BY_FID;
421 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
422 0 /*unused */, &req, ll_md_blocking_ast, 0);
423 ll_finish_md_op_data(op_data);
425 /* reason for keep own exit path - don`t flood log
426 * with messages with -ESTALE errors.
428 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
429 it_open_error(DISP_OPEN_OPEN, itp))
431 ll_release_openhandle(inode, itp);
435 if (it_disposition(itp, DISP_LOOKUP_NEG)) {
440 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
441 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
442 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* populate/refresh the inode from the MDS reply, then attach lock data */
446 rc = ll_prep_inode(&inode, req, NULL, itp);
447 if (!rc && itp->d.lustre.it_lock_mode)
448 ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
451 ptlrpc_req_finished(req);
452 ll_intent_drop_lock(itp);
458 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
459 * not believe attributes if a few ioepoch holders exist. Attributes for
460 * previous ioepoch if new one is opened are also skipped by MDS.
462 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
/* only record a non-zero epoch that differs from the current one */
464 if (ioepoch && lli->lli_ioepoch != ioepoch) {
465 lli->lli_ioepoch = ioepoch;
466 CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
467 ioepoch, PFID(&lli->lli_fid));
/*
 * Fill an obd_client_handle from the MDT reply carried by the intent:
 * copies the open file handle, fid, lease lock cookie and open flags,
 * then registers the handle for open replay.
 */
471 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
472 struct obd_client_handle *och)
474 struct ptlrpc_request *req = it->d.lustre.it_data;
475 struct mdt_body *body;
477 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
478 och->och_fh = body->handle;
479 och->och_fid = body->fid1;
480 och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
481 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
482 och->och_flags = it->it_flags;
484 return md_set_open_replay_data(md_exp, och, it);
/*
 * Finish the client-side part of an open: optionally fill @och from the
 * intent reply and record the server's ioepoch, then attach @fd to the
 * file, initialize readahead state and remember the open mode.
 */
487 static int ll_local_open(struct file *file, struct lookup_intent *it,
488 struct ll_file_data *fd, struct obd_client_handle *och)
490 struct inode *inode = file_inode(file);
491 struct ll_inode_info *lli = ll_i2info(inode);
493 LASSERT(!LUSTRE_FPRIVATE(file));
498 struct ptlrpc_request *req = it->d.lustre.it_data;
499 struct mdt_body *body;
502 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
506 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
507 ll_ioepoch_open(lli, body->ioepoch);
510 LUSTRE_FPRIVATE(file) = fd;
511 ll_readahead_init(inode, &fd->fd_ras);
512 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
516 /* Open a file, and (for the very first open) create objects on the OSTs at
517 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
518 * creation or open until ll_lov_setstripe() ioctl is called.
520 * If we already have the stripe MD locally then we don't request it in
521 * md_open(), by passing a lmm_size = 0.
523 * It is up to the application to ensure no other processes open this file
524 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
525 * used. We might be able to avoid races of that sort by getting lli_open_sem
526 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
527 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point. Either reuses an existing per-mode MDS open
 * handle or performs a fresh intent open, then completes with
 * ll_local_open(). Directory opens also arm the statahead key. */
529 int ll_file_open(struct inode *inode, struct file *file)
531 struct ll_inode_info *lli = ll_i2info(inode);
532 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
533 .it_flags = file->f_flags };
534 struct obd_client_handle **och_p = NULL;
535 __u64 *och_usecount = NULL;
536 struct ll_file_data *fd;
537 int rc = 0, opendir_set = 0;
539 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
540 inode->i_generation, inode, file->f_flags);
542 it = file->private_data; /* XXX: compat macro */
543 file->private_data = NULL; /* prevent ll_local_open assertion */
545 fd = ll_file_data_get();
/* first opener of a directory becomes the statahead owner */
552 if (S_ISDIR(inode->i_mode)) {
553 spin_lock(&lli->lli_sa_lock);
554 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
555 lli->lli_opendir_pid == 0) {
556 lli->lli_opendir_key = fd;
557 lli->lli_opendir_pid = current_pid();
560 spin_unlock(&lli->lli_sa_lock);
563 if (is_root_inode(inode)) {
564 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from lookup: build our own IT_OPEN intent */
568 if (!it || !it->d.lustre.it_disposition) {
569 /* Convert f_flags into access mode. We cannot use file->f_mode,
570 * because everything but O_ACCMODE mask was stripped from
572 if ((oit.it_flags + 1) & O_ACCMODE)
574 if (file->f_flags & O_TRUNC)
575 oit.it_flags |= FMODE_WRITE;
577 /* kernel only call f_op->open in dentry_open. filp_open calls
578 * dentry_open after call to open_namei that checks permissions.
579 * Only nfsd_open call dentry_open directly without checking
580 * permissions and because of that this code below is safe. */
581 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
582 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
584 /* We do not want O_EXCL here, presumably we opened the file
585 * already? XXX - NFS implications? */
586 oit.it_flags &= ~O_EXCL;
588 /* bug20584, if "it_flags" contains O_CREAT, the file will be
589 * created if necessary, then "IT_CREAT" should be set to keep
590 * consistent with it */
591 if (oit.it_flags & O_CREAT)
592 oit.it_op |= IT_CREAT;
598 /* Let's see if we have file open on MDS already. */
599 if (it->it_flags & FMODE_WRITE) {
600 och_p = &lli->lli_mds_write_och;
601 och_usecount = &lli->lli_open_fd_write_count;
602 } else if (it->it_flags & FMODE_EXEC) {
603 och_p = &lli->lli_mds_exec_och;
604 och_usecount = &lli->lli_open_fd_exec_count;
606 och_p = &lli->lli_mds_read_och;
607 och_usecount = &lli->lli_open_fd_read_count;
610 mutex_lock(&lli->lli_och_mutex);
611 if (*och_p) { /* Open handle is present */
612 if (it_disposition(it, DISP_OPEN_OPEN)) {
613 /* Well, there's extra open request that we do not need,
614 let's close it somehow. This will decref request. */
615 rc = it_open_error(DISP_OPEN_OPEN, it);
617 mutex_unlock(&lli->lli_och_mutex);
621 ll_release_openhandle(inode, it);
625 rc = ll_local_open(file, it, fd, NULL);
628 mutex_unlock(&lli->lli_och_mutex);
632 LASSERT(*och_usecount == 0);
633 if (!it->d.lustre.it_disposition) {
634 /* We cannot just request lock handle now, new ELC code
635 means that one of other OPEN locks for this file
636 could be cancelled, and since blocking ast handler
637 would attempt to grab och_mutex as well, that would
638 result in a deadlock */
639 mutex_unlock(&lli->lli_och_mutex);
640 it->it_create_mode |= M_CHECK_STALE;
641 rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
642 it->it_create_mode &= ~M_CHECK_STALE;
648 *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
656 /* md_intent_lock() didn't get a request ref if there was an
657 * open error, so don't do cleanup on the request here
659 /* XXX (green): Should not we bail out on any error here, not
660 * just open error? */
661 rc = it_open_error(DISP_OPEN_OPEN, it);
665 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
667 rc = ll_local_open(file, it, fd, *och_p);
671 mutex_unlock(&lli->lli_och_mutex);
674 /* Must do this outside lli_och_mutex lock to prevent deadlock where
675 different kind of OPEN lock for this same inode gets cancelled
676 by ldlm_cancel_lru */
677 if (!S_ISREG(inode->i_mode))
680 if (!lli->lli_has_smd &&
681 (cl_is_lov_delay_create(file->f_flags) ||
682 (file->f_mode & FMODE_WRITE) == 0)) {
683 CDEBUG(D_INODE, "object creation was delayed\n");
686 cl_lov_delay_create_clear(&file->f_flags);
/* error path: undo handle allocation, statahead ownership and fd */
691 if (och_p && *och_p) {
696 mutex_unlock(&lli->lli_och_mutex);
699 if (opendir_set != 0)
700 ll_stop_statahead(inode, lli->lli_opendir_key);
701 ll_file_data_put(fd);
703 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
706 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
707 ptlrpc_req_finished(it->d.lustre.it_data);
708 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * Blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously (no openhandle handling here — see the
 * comment at the md_intent_lock() call in ll_lease_open()).
 */
714 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
715 struct ldlm_lock_desc *desc, void *data, int flag)
718 struct lustre_handle lockh;
721 case LDLM_CB_BLOCKING:
722 ldlm_lock2handle(lock, &lockh);
723 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
725 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
729 case LDLM_CB_CANCELING:
737 * Acquire a lease and open the file.
/*
 * @fmode must be exactly FMODE_READ or FMODE_WRITE. When @file is
 * given, the lease is requested against that descriptor's existing
 * openhandle (only valid if this fd is the sole opener). Returns the
 * new obd_client_handle or an ERR_PTR; on failure any openhandle and
 * lease lock obtained along the way are cleaned up.
 */
739 static struct obd_client_handle *
740 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
743 struct lookup_intent it = { .it_op = IT_OPEN };
744 struct ll_sb_info *sbi = ll_i2sbi(inode);
745 struct md_op_data *op_data;
746 struct ptlrpc_request *req;
747 struct lustre_handle old_handle = { 0 };
748 struct obd_client_handle *och = NULL;
752 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
753 return ERR_PTR(-EINVAL);
756 struct ll_inode_info *lli = ll_i2info(inode);
757 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
758 struct obd_client_handle **och_p;
/* requested mode must match the fd's mode; exec opens can't lease */
761 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
762 return ERR_PTR(-EPERM);
764 /* Get the openhandle of the file */
766 mutex_lock(&lli->lli_och_mutex);
767 if (fd->fd_lease_och != NULL) {
768 mutex_unlock(&lli->lli_och_mutex);
772 if (fd->fd_och == NULL) {
773 if (file->f_mode & FMODE_WRITE) {
774 LASSERT(lli->lli_mds_write_och != NULL);
775 och_p = &lli->lli_mds_write_och;
776 och_usecount = &lli->lli_open_fd_write_count;
778 LASSERT(lli->lli_mds_read_och != NULL);
779 och_p = &lli->lli_mds_read_och;
780 och_usecount = &lli->lli_open_fd_read_count;
782 if (*och_usecount == 1) {
789 mutex_unlock(&lli->lli_och_mutex);
790 if (rc < 0) /* more than 1 opener */
793 LASSERT(fd->fd_och != NULL);
794 old_handle = fd->fd_och->och_fh;
797 och = kzalloc(sizeof(*och), GFP_NOFS);
799 return ERR_PTR(-ENOMEM);
801 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
802 LUSTRE_OPC_ANY, NULL);
803 if (IS_ERR(op_data)) {
804 rc = PTR_ERR(op_data);
808 /* To tell the MDT this openhandle is from the same owner */
809 op_data->op_handle = old_handle;
811 it.it_flags = fmode | open_flags;
812 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
813 rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
814 ll_md_blocking_lease_ast,
815 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
816 * it can be cancelled which may mislead applications that the lease is
818 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
819 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
820 * doesn't deal with openhandle, so normal openhandle will be leaked. */
821 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
822 ll_finish_md_op_data(op_data);
823 ptlrpc_req_finished(req);
827 if (it_disposition(&it, DISP_LOOKUP_NEG)) {
832 rc = it_open_error(DISP_OPEN_OPEN, &it);
836 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
837 ll_och_fill(sbi->ll_md_exp, &it, och);
839 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
844 /* already get lease, handle lease lock */
845 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
846 if (it.d.lustre.it_lock_mode == 0 ||
847 it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
848 /* open lock must return for lease */
849 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
850 PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
851 it.d.lustre.it_lock_bits);
856 ll_intent_release(&it);
/* error unwinding: close the openhandle, drop the lease lock, free och */
860 rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
862 CERROR("Close openhandle returned %d\n", rc2);
864 /* cancel open lock */
865 if (it.d.lustre.it_lock_mode != 0) {
866 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
867 it.d.lustre.it_lock_mode);
868 it.d.lustre.it_lock_mode = 0;
871 ll_intent_release(&it);
878 * Release lease and close the file.
879 * It will check if the lease has ever broken.
881 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
884 struct ldlm_lock *lock;
885 bool cancelled = true;
/* determine whether the lease lock was already cancelled (broken) */
888 lock = ldlm_handle2lock(&och->och_lease_handle);
890 lock_res_and_lock(lock);
891 cancelled = ldlm_is_cancel(lock);
892 unlock_res_and_lock(lock);
896 CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
897 PFID(&ll_i2info(inode)->lli_fid), cancelled);
900 ldlm_cli_cancel(&och->och_lease_handle, 0);
901 if (lease_broken != NULL)
902 *lease_broken = cancelled;
904 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
909 /* Fills the obdo with the attributes for the lsm */
/* Issue an async OST getattr over a ptlrpc set and wait for it; on
 * return oi_oa->o_valid is masked down to the attributes the caller
 * may trust (blocks/blksz/times/size/dataversion). @sync requests the
 * getattr under a server-side lock (OBD_FL_SRVLOCK). */
910 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
911 struct obdo *obdo, __u64 ioepoch, int sync)
913 struct ptlrpc_request_set *set;
914 struct obd_info oinfo = { };
917 LASSERT(lsm != NULL);
921 oinfo.oi_oa->o_oi = lsm->lsm_oi;
922 oinfo.oi_oa->o_mode = S_IFREG;
923 oinfo.oi_oa->o_ioepoch = ioepoch;
924 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
925 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
926 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
927 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
928 OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
929 OBD_MD_FLDATAVERSION;
931 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
932 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
935 set = ptlrpc_prep_set();
937 CERROR("can't allocate ptlrpc set\n");
940 rc = obd_getattr_async(exp, &oinfo, set);
942 rc = ptlrpc_set_wait(set);
943 ptlrpc_set_destroy(set);
/* keep only attribute bits the OSTs are authoritative for */
946 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
947 OBD_MD_FLATIME | OBD_MD_FLMTIME |
948 OBD_MD_FLCTIME | OBD_MD_FLSIZE |
949 OBD_MD_FLDATAVERSION);
954 * Performs the getattr on the inode and updates its fields.
955 * If @sync != 0, perform the getattr under the server-side lock.
957 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
958 __u64 ioepoch, int sync)
960 struct lov_stripe_md *lsm;
/* pin the stripe metadata for the duration of the getattr */
963 lsm = ccc_inode_lsm_get(inode);
964 rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
965 obdo, ioepoch, sync);
967 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
969 obdo_refresh_inode(inode, obdo, obdo->o_valid);
970 CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
971 POSTID(oi), i_size_read(inode),
972 (unsigned long long)inode->i_blocks,
973 1UL << inode->i_blkbits);
975 ccc_inode_lsm_put(inode, lsm);
/*
 * Merge MDS-sourced timestamps with OST-sourced attributes (via the
 * cl_object attr) under the inode size lock, taking the most recent
 * of each timestamp, and update the inode's size/blocks accordingly.
 */
979 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
981 struct ll_inode_info *lli = ll_i2info(inode);
982 struct cl_object *obj = lli->lli_clob;
983 struct cl_attr *attr = ccc_env_thread_attr(env);
987 ll_inode_size_lock(inode);
988 /* merge timestamps the most recently obtained from mds with
989 timestamps obtained from osts */
990 LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
991 LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
992 LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
994 lvb.lvb_size = i_size_read(inode);
995 lvb.lvb_blocks = inode->i_blocks;
996 lvb.lvb_mtime = LTIME_S(inode->i_mtime);
997 lvb.lvb_atime = LTIME_S(inode->i_atime);
998 lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1000 cl_object_attr_lock(obj);
1001 rc = cl_object_attr_get(env, obj, attr);
1002 cl_object_attr_unlock(obj);
/* prefer the newer timestamp from either source */
1005 if (lvb.lvb_atime < attr->cat_atime)
1006 lvb.lvb_atime = attr->cat_atime;
1007 if (lvb.lvb_ctime < attr->cat_ctime)
1008 lvb.lvb_ctime = attr->cat_ctime;
1009 if (lvb.lvb_mtime < attr->cat_mtime)
1010 lvb.lvb_mtime = attr->cat_mtime;
1012 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1013 PFID(&lli->lli_fid), attr->cat_size);
1014 cl_isize_write_nolock(inode, attr->cat_size);
1016 inode->i_blocks = attr->cat_blocks;
1018 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1019 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1020 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1022 ll_inode_size_unlock(inode);
/*
 * Fetch up-to-date OST attributes for @lsm and copy size/blocks/times
 * into the caller-supplied stat structure @st.
 */
1027 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1030 struct obdo obdo = { 0 };
1033 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, &obdo, 0, 0);
1035 st->st_size = obdo.o_size;
1036 st->st_blocks = obdo.o_blocks;
1037 st->st_mtime = obdo.o_mtime;
1038 st->st_atime = obdo.o_atime;
1039 st->st_ctime = obdo.o_ctime;
/*
 * Decide whether atime updates should be suppressed for @file,
 * mirroring the checks in the VFS file_accessed()/touch_atime() path:
 * O_NOATIME, S_NOATIME, mount/superblock noatime flags, and the
 * nodiratime variants for directories.
 */
1044 static bool file_is_noatime(const struct file *file)
1046 const struct vfsmount *mnt = file->f_path.mnt;
1047 const struct inode *inode = file_inode(file);
1049 /* Adapted from file_accessed() and touch_atime().*/
1050 if (file->f_flags & O_NOATIME)
1053 if (inode->i_flags & S_NOATIME)
1056 if (IS_NOATIME(inode))
1059 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1062 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1065 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io from the file's open flags: nonblock/append/sync
 * behavior, lock-request policy (never for nolock files, mandatory for
 * append, otherwise "maybe"), and noatime handling.
 */
1071 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1073 struct inode *inode = file_inode(file);
1075 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1077 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1078 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1079 file->f_flags & O_DIRECT ||
1082 io->ci_obj = ll_i2info(inode)->lli_clob;
1083 io->ci_lockreq = CILR_MAYBE;
1084 if (ll_file_nolock(file)) {
1085 io->ci_lockreq = CILR_NEVER;
1086 io->ci_no_srvlock = 1;
1087 } else if (file->f_flags & O_APPEND) {
1088 io->ci_lockreq = CILR_MANDATORY;
1091 io->ci_noatime = file_is_noatime(file);
/*
 * Common engine for read/write/splice IO: sets up a cl_io, takes the
 * lli_write_mutex (writes) or lli_trunc_sem (reads) for normal IO,
 * runs the cl_io loop, updates *ppos and per-mount IO statistics, and
 * tracks write failure state in the ll_file_data. A restartable IO
 * that transferred nothing is retried rather than returned short.
 */
1095 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1096 struct file *file, enum cl_io_type iot,
1097 loff_t *ppos, size_t count)
1099 struct ll_inode_info *lli = ll_i2info(file_inode(file));
1100 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1105 io = ccc_env_thread_io(env);
1106 ll_io_init(io, file, iot == CIT_WRITE);
1108 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1109 struct vvp_io *vio = vvp_env_io(env);
1110 struct ccc_io *cio = ccc_env_io(env);
1111 int write_mutex_locked = 0;
1113 cio->cui_fd = LUSTRE_FPRIVATE(file);
1114 vio->cui_io_subtype = args->via_io_subtype;
1116 switch (vio->cui_io_subtype) {
1118 cio->cui_iter = args->u.normal.via_iter;
1119 cio->cui_iocb = args->u.normal.via_iocb;
/* serialize writes unless a group lock covers them */
1120 if ((iot == CIT_WRITE) &&
1121 !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1122 if (mutex_lock_interruptible(&lli->
1124 result = -ERESTARTSYS;
1127 write_mutex_locked = 1;
1128 } else if (iot == CIT_READ) {
1129 down_read(&lli->lli_trunc_sem);
1133 vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1134 vio->u.splice.cui_flags = args->u.splice.via_flags;
1137 CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1140 result = cl_io_loop(env, io);
1141 if (write_mutex_locked)
1142 mutex_unlock(&lli->lli_write_mutex);
1143 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1144 up_read(&lli->lli_trunc_sem);
1146 /* cl_io_rw_init() handled IO */
1147 result = io->ci_result;
1150 if (io->ci_nob > 0) {
1151 result = io->ci_nob;
1152 *ppos = io->u.ci_wr.wr.crw_pos;
1156 cl_io_fini(env, io);
1157 /* If any bit been read/written (result != 0), we just return
1158 * short read/write instead of restart io. */
1159 if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1160 CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1161 iot == CIT_READ ? "read" : "write",
1162 file, *ppos, count);
1163 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1167 if (iot == CIT_READ) {
1169 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1170 LPROC_LL_READ_BYTES, result);
1171 } else if (iot == CIT_WRITE) {
1173 ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1174 LPROC_LL_WRITE_BYTES, result);
1175 fd->fd_write_failed = false;
1176 } else if (result != -ERESTARTSYS) {
1177 fd->fd_write_failed = true;
/*
 * VFS ->read_iter(): set up vvp_io_args for a normal (iter-based) read
 * and delegate to ll_file_io_generic() in a cl_env.
 */
1184 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1187 struct vvp_io_args *args;
1191 env = cl_env_get(&refcheck);
1193 return PTR_ERR(env);
1195 args = vvp_env_args(env, IO_NORMAL);
1196 args->u.normal.via_iter = to;
1197 args->u.normal.via_iocb = iocb;
1199 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1200 &iocb->ki_pos, iov_iter_count(to));
1201 cl_env_put(env, &refcheck);
1206 * Write to a file (through the page cache).
/* VFS ->write_iter(): mirror of ll_file_read_iter() for CIT_WRITE. */
1208 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1211 struct vvp_io_args *args;
1215 env = cl_env_get(&refcheck);
1217 return PTR_ERR(env);
1219 args = vvp_env_args(env, IO_NORMAL);
1220 args->u.normal.via_iter = from;
1221 args->u.normal.via_iocb = iocb;
1223 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1224 &iocb->ki_pos, iov_iter_count(from));
1225 cl_env_put(env, &refcheck);
1230 * Send file content (through pagecache) somewhere with helper
/* VFS ->splice_read(): CIT_READ via the IO_SPLICE vvp args variant. */
1232 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1233 struct pipe_inode_info *pipe, size_t count,
1237 struct vvp_io_args *args;
1241 env = cl_env_get(&refcheck);
1243 return PTR_ERR(env);
1245 args = vvp_env_args(env, IO_SPLICE);
1246 args->u.splice.via_pipe = pipe;
1247 args->u.splice.via_flags = flags;
1249 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1250 cl_env_put(env, &refcheck);
/*
 * Re-create the OST object identified by @oi on OST index @ost_idx,
 * using a duplicate of the inode's stripe metadata. Used by the
 * lov-recreate ioctls below to repair lost objects.
 */
1254 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1256 struct obd_export *exp = ll_i2dtexp(inode);
1257 struct obd_trans_info oti = { 0 };
1258 struct obdo *oa = NULL;
1261 struct lov_stripe_md *lsm = NULL, *lsm2;
1267 lsm = ccc_inode_lsm_get(inode);
1268 if (!lsm_has_objects(lsm)) {
/* size of lsm plus one lov_oinfo per stripe */
1273 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1274 (lsm->lsm_stripe_count));
1276 lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
/* o_nlink carries the target OST index for OBD_FL_RECREATE_OBJS */
1283 oa->o_nlink = ost_idx;
1284 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1285 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1286 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1287 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1288 obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1289 memcpy(lsm2, lsm, lsm_size);
1290 ll_inode_size_lock(inode);
1291 rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1292 ll_inode_size_unlock(inode);
1297 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_RECREATE_OBJ ioctl: copy a struct ll_recreate_obj from userspace,
 * build an MDT0 ost_id from its object id and recreate via ll_lov_recreate().
 * Admin-only (CFS_CAP_SYS_ADMIN).  NOTE(review): interior lines elided.
 */
1302 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1304 struct ll_recreate_obj ucreat;
1307 if (!capable(CFS_CAP_SYS_ADMIN))
1310 if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1314 ostid_set_seq_mdt0(&oi);
1315 ostid_set_id(&oi, ucreat.lrc_id);
1316 return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
/*
 * LL_IOC_RECREATE_FID ioctl: like ll_lov_recreate_obj() but takes a lu_fid;
 * the OST index is encoded in bits 16..31 of the FID sequence.
 * Admin-only.  NOTE(review): interior lines elided in this excerpt.
 */
1319 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1325 if (!capable(CFS_CAP_SYS_ADMIN))
1328 if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1331 fid_to_ostid(&fid, &oi);
1332 ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1333 return ll_lov_recreate(inode, &oi, ost_idx);
/*
 * Set the striping EA for a file by replaying an IT_OPEN intent carrying the
 * lov_user_md.  Fails with a debug message if a stripe already exists.
 * NOTE(review): interior lines elided; error-path labels are not visible.
 */
1336 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1337 int flags, struct lov_user_md *lum, int lum_size)
1339 struct lov_stripe_md *lsm = NULL;
1340 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1343 lsm = ccc_inode_lsm_get(inode);
/* layout is write-once: refuse if the file is already striped */
1345 ccc_inode_lsm_put(inode, lsm);
1346 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1352 ll_inode_size_lock(inode);
1353 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1356 rc = oit.d.lustre.it_status;
/* drop the open handle created as a side effect of the intent */
1360 ll_release_openhandle(inode, &oit);
1363 ll_inode_size_unlock(inode);
1364 ll_intent_release(&oit);
1365 ccc_inode_lsm_put(inode, lsm);
1369 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping info) for @filename from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host endian
 * for userspace when needed.  On success *lmmp/*lmm_size point into the
 * reply held by *request (caller releases the request).
 * NOTE(review): interior lines elided in this excerpt.
 */
1373 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1374 struct lov_mds_md **lmmp, int *lmm_size,
1375 struct ptlrpc_request **request)
1377 struct ll_sb_info *sbi = ll_i2sbi(inode);
1378 struct mdt_body *body;
1379 struct lov_mds_md *lmm = NULL;
1380 struct ptlrpc_request *req = NULL;
1381 struct md_op_data *op_data;
1384 rc = ll_get_default_mdsize(sbi, &lmmsize);
1388 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1389 strlen(filename), lmmsize,
1390 LUSTRE_OPC_ANY, NULL);
1391 if (IS_ERR(op_data))
1392 return PTR_ERR(op_data);
1394 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1395 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1396 ll_finish_md_op_data(op_data);
1398 CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1403 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1404 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1406 lmmsize = body->eadatasize;
/* no EA in the reply means the file/dir has no striping to return */
1408 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1414 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1415 LASSERT(lmm != NULL);
/* only plain V1/V3 layouts are understood here */
1417 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1418 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1424 * This is coming from the MDS, so is probably in
1425 * little endian. We convert it to host endian before
1426 * passing it to userspace.
/* swab only needed on big-endian hosts */
1428 if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
1431 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1432 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1435 /* if function called for directory - we should
1436 * avoid swab not existent lsm objects */
1437 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1438 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1439 if (S_ISREG(body->mode))
1440 lustre_swab_lov_user_md_objects(
1441 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1443 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1444 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1445 if (S_ISREG(body->mode))
1446 lustre_swab_lov_user_md_objects(
1447 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1454 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl: admin-only path to install a raw striping EA
 * (MDS_OPEN_HAS_OBJS), buffer copied from userspace and passed to
 * ll_lov_setstripe_ea_info().  NOTE(review): interior lines elided.
 */
1459 static int ll_lov_setea(struct inode *inode, struct file *file,
1462 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1463 struct lov_user_md *lump;
/* room for the header plus one lov_user_ost_data entry */
1464 int lum_size = sizeof(struct lov_user_md) +
1465 sizeof(struct lov_user_ost_data);
1468 if (!capable(CFS_CAP_SYS_ADMIN))
1471 lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1475 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1480 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1482 cl_lov_delay_create_clear(&file->f_flags);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl: copy a v1 lov_user_md first (smaller), then
 * re-copy as v3 if the magic says so, set the stripe EA, and on success
 * report the resulting layout back to userspace via LL_IOC_LOV_GETSTRIPE.
 * NOTE(review): interior lines elided in this excerpt.
 */
1488 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1491 struct lov_user_md_v3 lumv3;
1492 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1493 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1494 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1496 int flags = FMODE_WRITE;
1498 /* first try with v1 which is smaller than v3 */
1499 lum_size = sizeof(struct lov_user_md_v1);
1500 if (copy_from_user(lumv1, lumv1p, lum_size))
1503 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1504 lum_size = sizeof(struct lov_user_md_v3);
1505 if (copy_from_user(&lumv3, lumv3p, lum_size))
1509 rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1511 cl_lov_delay_create_clear(&file->f_flags);
1513 struct lov_stripe_md *lsm;
/* in case of failure, tell userspace the file has 0 stripes */
1516 put_user(0, &lumv1p->lmm_stripe_count);
1518 ll_layout_refresh(inode, &gen);
1519 lsm = ccc_inode_lsm_get(inode);
1520 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521 0, lsm, (void *)arg);
1522 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl: hand the inode's stripe md to the data export
 * which fills the userspace buffer at @arg.
 * NOTE(review): interior lines elided in this excerpt.
 */
1527 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1529 struct lov_stripe_md *lsm;
1532 lsm = ccc_inode_lsm_get(inode);
1534 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1536 ccc_inode_lsm_put(inode, lsm);
/*
 * LL_IOC_GROUP_LOCK ioctl: take a group (GID) extent lock on the file.
 * Only one group lock per file descriptor; the re-check after
 * cl_get_grouplock() handles the race with another thread taking the lock
 * while lli_lock was dropped.  NOTE(review): interior lines elided.
 */
1541 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1543 struct ll_inode_info *lli = ll_i2info(inode);
1544 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1545 struct ccc_grouplock grouplock;
1549 CWARN("group id for group lock must not be 0\n");
1553 if (ll_file_nolock(file))
1556 spin_lock(&lli->lli_lock);
1557 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1558 CWARN("group lock already existed with gid %lu\n",
1559 fd->fd_grouplock.cg_gid);
1560 spin_unlock(&lli->lli_lock);
1563 LASSERT(fd->fd_grouplock.cg_lock == NULL);
1564 spin_unlock(&lli->lli_lock);
/* may block unless the file was opened O_NONBLOCK */
1566 rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1567 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* lli_lock was dropped above: re-check for a racing locker */
1571 spin_lock(&lli->lli_lock);
1572 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1573 spin_unlock(&lli->lli_lock);
1574 CERROR("another thread just won the race\n");
1575 cl_put_grouplock(&grouplock);
1579 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1580 fd->fd_grouplock = grouplock;
1581 spin_unlock(&lli->lli_lock);
1583 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK ioctl: release the group lock held on @file, after
 * verifying one is held and that the caller's GID matches.  The grouplock
 * is copied out under lli_lock and released outside it.
 * NOTE(review): interior lines elided in this excerpt.
 */
1587 static int ll_put_grouplock(struct inode *inode, struct file *file,
1590 struct ll_inode_info *lli = ll_i2info(inode);
1591 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1592 struct ccc_grouplock grouplock;
1594 spin_lock(&lli->lli_lock);
1595 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1596 spin_unlock(&lli->lli_lock);
1597 CWARN("no group lock held\n");
1600 LASSERT(fd->fd_grouplock.cg_lock != NULL);
1602 if (fd->fd_grouplock.cg_gid != arg) {
1603 CWARN("group lock %lu doesn't match current id %lu\n",
1604 arg, fd->fd_grouplock.cg_gid);
1605 spin_unlock(&lli->lli_lock);
/* detach under the spinlock, drop the DLM reference outside it */
1609 grouplock = fd->fd_grouplock;
1610 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1611 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1612 spin_unlock(&lli->lli_lock);
1614 cl_put_grouplock(&grouplock);
1615 CDEBUG(D_INFO, "group lock %lu released\n", arg);
1620 * Close inode open handle
1622 * \param inode [in] inode in question
1623 * \param it [in,out] intent which contains open info and result
1626 * \retval <0 failure
/*
 * Closes the MDS open handle carried by an open intent (e.g. one created
 * as a side effect of ll_lov_setstripe_ea_info()).  Root and intents with
 * no open disposition are no-ops.  NOTE(review): interior lines elided.
 */
1628 int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1630 struct obd_client_handle *och;
1635 /* Root ? Do nothing. */
1636 if (is_root_inode(inode))
1639 /* No open handle to close? Move away */
1640 if (!it_disposition(it, DISP_OPEN_OPEN))
1643 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1645 och = kzalloc(sizeof(*och), GFP_NOFS);
1651 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1653 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1656 /* this one is in place of ll_file_open */
1657 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1658 ptlrpc_req_finished(it->d.lustre.it_data);
1659 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1665 * Get size for inode for which FIEMAP mapping is requested.
1666 * Make the FIEMAP get_info call and returns the result.
/*
 * Core of the FIEMAP ioctl: validate flags, optionally flush dirty pages
 * (FIEMAP_FLAG_SYNC), build an ll_fiemap_info_key and query the data export
 * with obd_get_info(KEY_FIEMAP).  NOTE(review): interior lines elided.
 */
1668 static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1671 struct obd_export *exp = ll_i2dtexp(inode);
1672 struct lov_stripe_md *lsm = NULL;
1673 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1674 __u32 vallen = num_bytes;
1677 /* Checks for fiemap flags */
1678 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report the unsupported flags back to the caller */
1679 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1683 /* Check for FIEMAP_FLAG_SYNC */
1684 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1685 rc = filemap_fdatawrite(inode->i_mapping);
1690 lsm = ccc_inode_lsm_get(inode);
1694 /* If the stripe_count > 1 and the application does not understand
1695 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1697 if (lsm->lsm_stripe_count > 1 &&
1698 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1703 fm_key.oa.o_oi = lsm->lsm_oi;
1704 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
/* cached size may be stale; glimpse to fetch it from the OSTs */
1706 if (i_size_read(inode) == 0) {
1707 rc = ll_glimpse_size(inode);
1712 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1713 obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1714 /* If filesize is 0, then there would be no objects for mapping */
1715 if (fm_key.oa.o_size == 0) {
1716 fiemap->fm_mapped_extents = 0;
1721 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1723 rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1726 CERROR("obd_get_info failed: rc = %d\n", rc);
1729 ccc_inode_lsm_put(inode, lsm);
/*
 * OBD_IOC_FID2PATH ioctl: resolve a FID to a path via the MD export.
 * The output buffer size is user-controlled (gf_pathlen, capped at
 * PATH_MAX) and the filled getinfo_fid2path is copied back to userspace.
 * NOTE(review): interior lines elided in this excerpt.
 */
1733 int ll_fid2path(struct inode *inode, void __user *arg)
1735 struct obd_export *exp = ll_i2mdexp(inode);
1736 const struct getinfo_fid2path __user *gfin = arg;
1737 struct getinfo_fid2path *gfout;
/* permitted for CAP_DAC_READ_SEARCH or if the fs allows user fid2path */
1742 if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1743 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1746 /* Only need to get the buflen */
1747 if (get_user(pathlen, &gfin->gf_pathlen))
1750 if (pathlen > PATH_MAX)
1753 outsize = sizeof(*gfout) + pathlen;
1755 gfout = kzalloc(outsize, GFP_NOFS);
1759 if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1764 /* Call mdc_iocontrol */
1765 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1769 if (copy_to_user(arg, gfout, outsize))
/*
 * FSFILT_IOC_FIEMAP ioctl wrapper: size the kernel fiemap buffer from the
 * user-supplied extent count (with overflow check), copy the request in,
 * run ll_do_fiemap() and copy the mapped extents back out.
 * NOTE(review): interior lines elided in this excerpt.
 */
1777 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1779 struct ll_user_fiemap *fiemap_s;
1780 size_t num_bytes, ret_bytes;
1781 unsigned int extent_count;
1784 /* Get the extent count so we can calculate the size of
1785 * required fiemap buffer */
1786 if (get_user(extent_count,
1787 &((struct ll_user_fiemap __user *)arg)->fm_extent_count)
/* reject counts that would overflow num_bytes below */
1791 (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1793 num_bytes = sizeof(*fiemap_s) + (extent_count *
1794 sizeof(struct ll_fiemap_extent));
1796 fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1797 if (fiemap_s == NULL)
1800 /* get the fiemap value */
1801 if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1802 sizeof(*fiemap_s))) {
1807 /* If fm_extent_count is non-zero, read the first extent since
1808 * it is used to calculate end_offset and device from previous
1811 if (copy_from_user(&fiemap_s->fm_extents[0],
1812 (char __user *)arg + sizeof(*fiemap_s),
1813 sizeof(struct ll_fiemap_extent))) {
1819 rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1823 ret_bytes = sizeof(struct ll_user_fiemap);
1825 if (extent_count != 0)
1826 ret_bytes += (fiemap_s->fm_mapped_extents *
1827 sizeof(struct ll_fiemap_extent));
1829 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1838 * Read the data_version for inode.
1840 * This value is computed using stripe object version on OST.
1841 * Version is computed using server side locking.
1843 * @param extent_lock Take extent lock. Not needed if a process is already
1844 * holding the OST object group locks.
/*
 * Returns 0 as the version for stripe-less files; otherwise queries the
 * OSTs through ll_lsm_getattr() and reports o_data_version if the server
 * filled it.  NOTE(review): interior lines elided in this excerpt.
 */
1846 int ll_data_version(struct inode *inode, __u64 *data_version,
1849 struct lov_stripe_md *lsm = NULL;
1850 struct ll_sb_info *sbi = ll_i2sbi(inode);
1851 struct obdo *obdo = NULL;
1854 /* If no stripe, we consider version is 0. */
1855 lsm = ccc_inode_lsm_get(inode);
1856 if (!lsm_has_objects(lsm)) {
1858 CDEBUG(D_INODE, "No object for inode\n");
1863 obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1869 rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, obdo, 0, extent_lock);
/* server must confirm the version field is valid before we trust it */
1871 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1874 *data_version = obdo->o_data_version;
1879 ccc_inode_lsm_put(inode, lsm);
1884 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease on the file, records the latest data_version and
 * merged [am]times, then closes the handle with the HSM-release close so
 * the MDT can drop the OST objects.  NOTE(review): interior lines elided.
 */
1886 int ll_hsm_release(struct inode *inode)
1888 struct cl_env_nest nest;
1890 struct obd_client_handle *och = NULL;
1891 __u64 data_version = 0;
1895 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1896 ll_get_fsname(inode->i_sb, NULL, 0),
1897 PFID(&ll_i2info(inode)->lli_fid));
/* exclusive write lease guards against concurrent modification */
1899 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1905 /* Grab latest data_version and [am]time values */
1906 rc = ll_data_version(inode, &data_version, 1);
1910 env = cl_env_nested_get(&nest);
1916 ll_merge_lvb(env, inode);
1917 cl_env_nested_put(&nest, env);
1919 /* Release the file.
1920 * NB: lease lock handle is released in mdc_hsm_release_pack() because
1921 * we still need it to pack l_remote_handle to MDT. */
1922 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1928 if (och != NULL && !IS_ERR(och)) /* close the file */
1929 ll_lease_close(och, inode, NULL);
/*
 * Scratch state for ll_swap_layouts(): the two inodes being swapped, saved
 * [am]time attrs to restore afterwards, and per-inode data-version check
 * flags (bools so they can be swap()ed along with the inodes).
 */
1934 struct ll_swap_stack {
1935 struct iattr ia1, ia2;
1937 struct inode *inode1, *inode2;
1938 bool check_dv1, check_dv2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS implementation: atomically exchange the layouts
 * of two regular files on the same fs.  Orders the pair by FID to avoid
 * deadlock, optionally takes group locks to flush dirty cache, optionally
 * verifies data versions, sends the swap to the MDT via obd_iocontrol, and
 * restores mtime/atime if requested.  NOTE(review): interior lines elided.
 */
1941 static int ll_swap_layouts(struct file *file1, struct file *file2,
1942 struct lustre_swap_layouts *lsl)
1944 struct mdc_swap_layouts msl;
1945 struct md_op_data *op_data;
1948 struct ll_swap_stack *llss = NULL;
1951 llss = kzalloc(sizeof(*llss), GFP_NOFS);
1955 llss->inode1 = file_inode(file1);
1956 llss->inode2 = file_inode(file2);
1958 if (!S_ISREG(llss->inode2->i_mode)) {
/* caller must be allowed to write both files */
1963 if (inode_permission(llss->inode1, MAY_WRITE) ||
1964 inode_permission(llss->inode2, MAY_WRITE)) {
/* cross-filesystem swaps are not supported */
1969 if (llss->inode2->i_sb != llss->inode1->i_sb) {
1974 /* we use 2 bool because it is easier to swap than 2 bits */
1975 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1976 llss->check_dv1 = true;
1978 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1979 llss->check_dv2 = true;
1981 /* we cannot use lsl->sl_dvX directly because we may swap them */
1982 llss->dv1 = lsl->sl_dv1;
1983 llss->dv2 = lsl->sl_dv2;
1985 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1986 if (rc == 0) /* same file, done! */ {
/* canonical FID order prevents lock-ordering deadlocks */
1991 if (rc < 0) { /* sequentialize it */
1992 swap(llss->inode1, llss->inode2);
1994 swap(llss->dv1, llss->dv2);
1995 swap(llss->check_dv1, llss->check_dv2);
1999 if (gid != 0) { /* application asks to flush dirty cache */
2000 rc = ll_get_grouplock(llss->inode1, file1, gid);
2004 rc = ll_get_grouplock(llss->inode2, file2, gid);
2006 ll_put_grouplock(llss->inode1, file1, gid);
2011 /* to be able to restore mtime and atime after swap
2012 * we need to first save them */
2014 (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2015 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2016 llss->ia1.ia_atime = llss->inode1->i_atime;
2017 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2018 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2019 llss->ia2.ia_atime = llss->inode2->i_atime;
2020 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2023 /* ultimate check, before swapping the layouts we check if
2024 * dataversion has changed (if requested) */
2025 if (llss->check_dv1) {
2026 rc = ll_data_version(llss->inode1, &dv, 0);
2029 if (dv != llss->dv1) {
2035 if (llss->check_dv2) {
2036 rc = ll_data_version(llss->inode2, &dv, 0);
2039 if (dv != llss->dv2) {
2045 /* struct md_op_data is used to send the swap args to the mdt
2046 * only flags is missing, so we use struct mdc_swap_layouts
2047 * through the md_op_data->op_data */
2048 /* flags from user space have to be converted before they are send to
2049 * server, no flag is sent today, they are only used on the client */
2052 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2053 0, LUSTRE_OPC_ANY, &msl);
2054 if (IS_ERR(op_data)) {
2055 rc = PTR_ERR(op_data);
2059 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2060 sizeof(*op_data), op_data, NULL);
2061 ll_finish_md_op_data(op_data);
2065 ll_put_grouplock(llss->inode2, file2, gid);
2066 ll_put_grouplock(llss->inode1, file1, gid);
2069 /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2073 /* clear useless flags */
2074 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2075 llss->ia1.ia_valid &= ~ATTR_MTIME;
2076 llss->ia2.ia_valid &= ~ATTR_MTIME;
2079 if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2080 llss->ia1.ia_valid &= ~ATTR_ATIME;
2081 llss->ia2.ia_valid &= ~ATTR_ATIME;
2084 /* update time if requested */
/* note: ia2 (saved from inode2) goes to inode1 — layouts were swapped */
2086 if (llss->ia2.ia_valid != 0) {
2087 mutex_lock(&llss->inode1->i_mutex);
2088 rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2089 mutex_unlock(&llss->inode1->i_mutex);
2092 if (llss->ia1.ia_valid != 0) {
2095 mutex_lock(&llss->inode2->i_mutex);
2096 rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2097 mutex_unlock(&llss->inode2->i_mutex);
/*
 * LL_IOC_HSM_STATE_SET backend: validate the set/clear masks and archive id,
 * then forward the request to the MDT through obd_iocontrol().
 * NOTE(review): interior lines elided in this excerpt.
 */
2108 static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2110 struct md_op_data *op_data;
2113 /* Detect out-of range masks */
2114 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2117 /* Non-root users are forbidden to set or clear flags which are
2118 * NOT defined in HSM_USER_MASK. */
2119 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2120 !capable(CFS_CAP_SYS_ADMIN))
2123 /* Detect out-of range archive id */
2124 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2125 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2128 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2129 LUSTRE_OPC_ANY, hss);
2130 if (IS_ERR(op_data))
2131 return PTR_ERR(op_data);
2133 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2134 sizeof(*op_data), op_data, NULL);
2136 ll_finish_md_op_data(op_data);
/*
 * LL_IOC_HSM_IMPORT backend: mark a regular file as archived/exists/released
 * in HSM state, then restore its mode, ownership, size and timestamps from
 * the user-supplied hsm_user_import via ll_setattr_raw().
 * NOTE(review): interior lines elided in this excerpt.
 */
2141 static int ll_hsm_import(struct inode *inode, struct file *file,
2142 struct hsm_user_import *hui)
2144 struct hsm_state_set *hss = NULL;
2145 struct iattr *attr = NULL;
2149 if (!S_ISREG(inode->i_mode))
2153 hss = kzalloc(sizeof(*hss), GFP_NOFS);
/* imported file starts out released: data lives only in the archive */
2157 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2158 hss->hss_archive_id = hui->hui_archive_id;
2159 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2160 rc = ll_hsm_state_set(inode, hss);
2164 attr = kzalloc(sizeof(*attr), GFP_NOFS);
2170 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2171 attr->ia_mode |= S_IFREG;
2172 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2173 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2174 attr->ia_size = hui->hui_size;
2175 attr->ia_mtime.tv_sec = hui->hui_mtime;
2176 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2177 attr->ia_atime.tv_sec = hui->hui_atime;
2178 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2180 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2181 ATTR_UID | ATTR_GID |
2182 ATTR_MTIME | ATTR_MTIME_SET |
2183 ATTR_ATIME | ATTR_ATIME_SET;
2185 mutex_lock(&inode->i_mutex);
2187 rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2191 mutex_unlock(&inode->i_mutex);
/*
 * Main ioctl dispatcher for regular files: one switch over the Lustre
 * ioctl command space, mostly delegating to the helpers above.  Unknown
 * commands fall through to the registered llioc handlers and finally to
 * obd_iocontrol() on the data export.
 * NOTE(review): interior lines are elided in this excerpt.
 */
2200 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2202 struct inode *inode = file_inode(file);
2203 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2206 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2207 inode->i_generation, inode, cmd);
2208 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2210 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2211 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2215 case LL_IOC_GETFLAGS:
2216 /* Get the current value of the file flags */
2217 return put_user(fd->fd_flags, (int *)arg);
2218 case LL_IOC_SETFLAGS:
2219 case LL_IOC_CLRFLAGS:
2220 /* Set or clear specific file flags */
2221 /* XXX This probably needs checks to ensure the flags are
2222 * not abused, and to handle any flag side effects.
2224 if (get_user(flags, (int *) arg))
2227 if (cmd == LL_IOC_SETFLAGS) {
2228 if ((flags & LL_FILE_IGNORE_LOCK) &&
2229 !(file->f_flags & O_DIRECT)) {
2230 CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2235 fd->fd_flags |= flags;
2237 fd->fd_flags &= ~flags;
2240 case LL_IOC_LOV_SETSTRIPE:
2241 return ll_lov_setstripe(inode, file, arg);
2242 case LL_IOC_LOV_SETEA:
2243 return ll_lov_setea(inode, file, arg);
2244 case LL_IOC_LOV_SWAP_LAYOUTS: {
2246 struct lustre_swap_layouts lsl;
2248 if (copy_from_user(&lsl, (char *)arg,
2249 sizeof(struct lustre_swap_layouts)))
/* both files must be writable for a layout swap */
2252 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2255 file2 = fget(lsl.sl_fd);
2260 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2261 rc = ll_swap_layouts(file, file2, &lsl);
2265 case LL_IOC_LOV_GETSTRIPE:
2266 return ll_lov_getstripe(inode, arg);
2267 case LL_IOC_RECREATE_OBJ:
2268 return ll_lov_recreate_obj(inode, arg);
2269 case LL_IOC_RECREATE_FID:
2270 return ll_lov_recreate_fid(inode, arg);
2271 case FSFILT_IOC_FIEMAP:
2272 return ll_ioctl_fiemap(inode, arg);
2273 case FSFILT_IOC_GETFLAGS:
2274 case FSFILT_IOC_SETFLAGS:
2275 return ll_iocontrol(inode, file, cmd, arg);
2276 case FSFILT_IOC_GETVERSION_OLD:
2277 case FSFILT_IOC_GETVERSION:
2278 return put_user(inode->i_generation, (int *)arg);
2279 case LL_IOC_GROUP_LOCK:
2280 return ll_get_grouplock(inode, file, arg);
2281 case LL_IOC_GROUP_UNLOCK:
2282 return ll_put_grouplock(inode, file, arg);
2283 case IOC_OBD_STATFS:
2284 return ll_obd_statfs(inode, (void *)arg);
2286 /* We need to special case any other ioctls we want to handle,
2287 * to send them to the MDS/OST as appropriate and to properly
2288 * network encode the arg field.
2289 case FSFILT_IOC_SETVERSION_OLD:
2290 case FSFILT_IOC_SETVERSION:
2292 case LL_IOC_FLUSHCTX:
2293 return ll_flush_ctx(inode);
2294 case LL_IOC_PATH2FID: {
2295 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2296 sizeof(struct lu_fid)))
2301 case OBD_IOC_FID2PATH:
2302 return ll_fid2path(inode, (void *)arg);
2303 case LL_IOC_DATA_VERSION: {
2304 struct ioc_data_version idv;
2307 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2310 rc = ll_data_version(inode, &idv.idv_version,
2311 !(idv.idv_flags & LL_DV_NOFLUSH));
2313 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2319 case LL_IOC_GET_MDTIDX: {
2322 mdtidx = ll_get_mdt_idx(inode);
2326 if (put_user((int)mdtidx, (int *)arg))
2331 case OBD_IOC_GETDTNAME:
2332 case OBD_IOC_GETMDNAME:
2333 return ll_get_obd_name(inode, cmd, arg);
2334 case LL_IOC_HSM_STATE_GET: {
2335 struct md_op_data *op_data;
2336 struct hsm_user_state *hus;
2339 hus = kzalloc(sizeof(*hus), GFP_NOFS);
2343 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2344 LUSTRE_OPC_ANY, hus);
2345 if (IS_ERR(op_data)) {
2347 return PTR_ERR(op_data);
2350 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2353 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2356 ll_finish_md_op_data(op_data);
2360 case LL_IOC_HSM_STATE_SET: {
2361 struct hsm_state_set *hss;
2364 hss = memdup_user((char *)arg, sizeof(*hss));
2366 return PTR_ERR(hss);
2368 rc = ll_hsm_state_set(inode, hss);
2373 case LL_IOC_HSM_ACTION: {
2374 struct md_op_data *op_data;
2375 struct hsm_current_action *hca;
2378 hca = kzalloc(sizeof(*hca), GFP_NOFS);
2382 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2383 LUSTRE_OPC_ANY, hca);
2384 if (IS_ERR(op_data)) {
2386 return PTR_ERR(op_data);
2389 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2392 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2395 ll_finish_md_op_data(op_data);
2399 case LL_IOC_SET_LEASE: {
2400 struct ll_inode_info *lli = ll_i2info(inode);
2401 struct obd_client_handle *och = NULL;
2407 if (!(file->f_mode & FMODE_WRITE))
2412 if (!(file->f_mode & FMODE_READ))
2417 mutex_lock(&lli->lli_och_mutex);
/* detach any lease already held on this fd before closing it */
2418 if (fd->fd_lease_och != NULL) {
2419 och = fd->fd_lease_och;
2420 fd->fd_lease_och = NULL;
2422 mutex_unlock(&lli->lli_och_mutex);
2425 mode = och->och_flags &
2426 (FMODE_READ|FMODE_WRITE);
2427 rc = ll_lease_close(och, inode, &lease_broken);
2428 if (rc == 0 && lease_broken)
2434 /* return the type of lease or error */
2435 return rc < 0 ? rc : (int)mode;
2440 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2442 /* apply for lease */
2443 och = ll_lease_open(inode, file, mode, 0);
2445 return PTR_ERR(och);
2448 mutex_lock(&lli->lli_och_mutex);
2449 if (fd->fd_lease_och == NULL) {
2450 fd->fd_lease_och = och;
2453 mutex_unlock(&lli->lli_och_mutex);
2455 /* impossible now that only excl is supported for now */
2456 ll_lease_close(och, inode, &lease_broken);
2461 case LL_IOC_GET_LEASE: {
2462 struct ll_inode_info *lli = ll_i2info(inode);
2463 struct ldlm_lock *lock = NULL;
2466 mutex_lock(&lli->lli_och_mutex);
2467 if (fd->fd_lease_och != NULL) {
2468 struct obd_client_handle *och = fd->fd_lease_och;
2470 lock = ldlm_handle2lock(&och->och_lease_handle);
2472 lock_res_and_lock(lock);
/* a cancelled lease lock no longer counts as a held lease */
2473 if (!ldlm_is_cancel(lock))
2474 rc = och->och_flags &
2475 (FMODE_READ | FMODE_WRITE);
2476 unlock_res_and_lock(lock);
2477 ldlm_lock_put(lock);
2480 mutex_unlock(&lli->lli_och_mutex);
2483 case LL_IOC_HSM_IMPORT: {
2484 struct hsm_user_import *hui;
2486 hui = memdup_user((void *)arg, sizeof(*hui));
2488 return PTR_ERR(hui);
2490 rc = ll_hsm_import(inode, file, hui);
/* not handled here: try registered llioc handlers, then the OSC */
2498 if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
2502 return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * ->llseek handler: for SEEK_END/SEEK_HOLE/SEEK_DATA the cached size may be
 * stale, so glimpse the OSTs first, then defer to
 * generic_file_llseek_size() with the Lustre max-bytes limit.
 * NOTE(review): interior lines elided in this excerpt.
 */
2509 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2511 struct inode *inode = file_inode(file);
2512 loff_t retval, eof = 0;
2514 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2515 (origin == SEEK_CUR) ? file->f_pos : 0);
2516 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2517 inode->i_ino, inode->i_generation, inode, retval, retval,
2519 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2521 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2522 retval = ll_glimpse_size(inode);
2525 eof = i_size_read(inode);
2528 retval = generic_file_llseek_size(file, offset, origin,
2529 ll_file_maxbytes(inode), eof);
/*
 * ->flush handler (called on close(2)): report asynchronous writeback
 * errors recorded against this inode, unless this fd was already told of a
 * write failure.  NOTE(review): interior lines elided in this excerpt.
 */
2533 static int ll_flush(struct file *file, fl_owner_t id)
2535 struct inode *inode = file_inode(file);
2536 struct ll_inode_info *lli = ll_i2info(inode);
2537 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2540 LASSERT(!S_ISDIR(inode->i_mode));
2542 /* catch async errors that were recorded back when async writeback
2543 * failed for pages in this mapping. */
/* read-and-clear: each error is reported to at most one closer */
2544 rc = lli->lli_async_rc;
2545 lli->lli_async_rc = 0;
2546 err = lov_read_and_clear_async_rc(lli->lli_clob);
2550 /* The application has been told write failure already.
2551 * Do not report failure again. */
2552 if (fd->fd_write_failed)
2554 return rc ? -EIO : 0;
2558 * Called to make sure a portion of file has been written out.
2559 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2561 * Return how many pages have been written.
/*
 * Builds and runs a CIT_FSYNC cl_io over [start, end]; on success the
 * return value is fi_nr_written (pages written).
 * NOTE(review): interior lines elided in this excerpt.
 */
2563 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2564 enum cl_fsync_mode mode, int ignore_layout)
2566 struct cl_env_nest nest;
2569 struct cl_fsync_io *fio;
/* reject any mode outside the known fsync mode set */
2572 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2573 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2576 env = cl_env_nested_get(&nest);
2578 return PTR_ERR(env);
2580 io = ccc_env_thread_io(env);
2581 io->ci_obj = cl_i2info(inode)->lli_clob;
2582 io->ci_ignore_layout = ignore_layout;
2584 /* initialize parameters for sync */
2585 fio = &io->u.ci_fsync;
2586 fio->fi_start = start;
2588 fio->fi_fid = ll_inode2fid(inode);
2589 fio->fi_mode = mode;
2590 fio->fi_nr_written = 0;
2592 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2593 result = cl_io_loop(env, io);
2595 result = io->ci_result;
2597 result = fio->fi_nr_written;
2598 cl_io_fini(env, io);
2599 cl_env_nested_put(&nest, env);
/*
 * ->fsync handler: flush the page cache range, sync metadata with the MDS
 * (md_sync), then sync data to the OSTs via cl_sync_file_range(), updating
 * fd_write_failed accordingly.  NOTE(review): interior lines elided.
 */
2604 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2606 struct inode *inode = file_inode(file);
2607 struct ll_inode_info *lli = ll_i2info(inode);
2608 struct ptlrpc_request *req;
2611 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2612 inode->i_generation, inode);
2613 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2615 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2616 mutex_lock(&inode->i_mutex);
2618 /* catch async errors that were recorded back when async writeback
2619 * failed for pages in this mapping. */
2620 if (!S_ISDIR(inode->i_mode)) {
2621 err = lli->lli_async_rc;
2622 lli->lli_async_rc = 0;
2625 err = lov_read_and_clear_async_rc(lli->lli_clob);
2630 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2634 ptlrpc_req_finished(req);
2636 if (S_ISREG(inode->i_mode)) {
2637 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2639 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2640 if (rc == 0 && err < 0)
/* remember the failure so close()/flush() can report it once */
2643 fd->fd_write_failed = true;
2645 fd->fd_write_failed = false;
2648 mutex_unlock(&inode->i_mutex);
/*
 * ->lock/->flock handler: translate a POSIX/BSD file_lock into an LDLM
 * flock enqueue on the MDS, then mirror the result into the local VFS lock
 * state (posix_lock_file_wait/flock_lock_file_wait).  If the local step
 * fails, the remote lock is rolled back with an LCK_NL enqueue.
 * NOTE(review): interior lines elided in this excerpt.
 */
2653 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2655 struct inode *inode = file_inode(file);
2656 struct ll_sb_info *sbi = ll_i2sbi(inode);
2657 struct ldlm_enqueue_info einfo = {
2658 .ei_type = LDLM_FLOCK,
2659 .ei_cb_cp = ldlm_flock_completion_ast,
2660 .ei_cbdata = file_lock,
2662 struct md_op_data *op_data;
2663 struct lustre_handle lockh = {0};
2664 ldlm_policy_data_t flock = { {0} };
2669 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2670 inode->i_ino, file_lock);
2672 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2674 if (file_lock->fl_flags & FL_FLOCK)
2675 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2676 else if (!(file_lock->fl_flags & FL_POSIX))
2679 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2680 flock.l_flock.pid = file_lock->fl_pid;
2681 flock.l_flock.start = file_lock->fl_start;
2682 flock.l_flock.end = file_lock->fl_end;
2684 /* Somewhat ugly workaround for svc lockd.
2685 * lockd installs custom fl_lmops->lm_compare_owner that checks
2686 * for the fl_owner to be the same (which it always is on local node
2687 * I guess between lockd processes) and then compares pid.
2688 * As such we assign pid to the owner field to make it all work,
2689 * conflict with normal locks is unlikely since pid space and
2690 * pointer space for current->files are not intersecting */
2691 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2692 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2694 switch (file_lock->fl_type) {
2696 einfo.ei_mode = LCK_PR;
2699 /* An unlock request may or may not have any relation to
2700 * existing locks so we may not be able to pass a lock handle
2701 * via a normal ldlm_lock_cancel() request. The request may even
2702 * unlock a byte range in the middle of an existing lock. In
2703 * order to process an unlock request we need all of the same
2704 * information that is given with a normal read or write record
2705 * lock request. To avoid creating another ldlm unlock (cancel)
2706 * message we'll treat a LCK_NL flock request as an unlock. */
2707 einfo.ei_mode = LCK_NL;
2710 einfo.ei_mode = LCK_PW;
2713 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2714 file_lock->fl_type);
/* non-blocking set: tell ldlm not to wait on conflicts */
2729 flags = LDLM_FL_BLOCK_NOWAIT;
2735 flags = LDLM_FL_TEST_LOCK;
2736 /* Save the old mode so that if the mode in the lock changes we
2737 * can decrement the appropriate reader or writer refcount. */
2738 file_lock->fl_type = einfo.ei_mode;
2741 CERROR("unknown fcntl lock command: %d\n", cmd);
2745 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2746 LUSTRE_OPC_ANY, NULL);
2747 if (IS_ERR(op_data))
2748 return PTR_ERR(op_data);
2750 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2751 inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2752 flock.l_flock.start, flock.l_flock.end);
2754 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2755 op_data, &lockh, &flock, 0, NULL /* req */, flags);
/* record the lock locally so the VFS sees consistent state */
2757 if ((file_lock->fl_flags & FL_FLOCK) &&
2758 (rc == 0 || file_lock->fl_type == F_UNLCK))
2759 rc2 = flock_lock_file_wait(file, file_lock);
2760 if ((file_lock->fl_flags & FL_POSIX) &&
2761 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2762 !(flags & LDLM_FL_TEST_LOCK))
2763 rc2 = posix_lock_file_wait(file, file_lock);
/* local bookkeeping failed: undo the server-side lock */
2765 if (rc2 && file_lock->fl_type != F_UNLCK) {
2766 einfo.ei_mode = LCK_NL;
2767 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2768 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2772 ll_finish_md_op_data(op_data);
/* ->lock()/->flock() stub used by the -o noflock mount option; body
 * (presumably returning an error) is outside this chunk. */
2778 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2784 * test if some locks matching bits and l_req_mode are acquired
2785 * - bits can be in different locks
2786 * - if found clear the common lock bits in *bits
2787 * - the bits not found, are kept in *bits
2789 * \param bits [IN] searched lock bits [IN]
2790 * \param l_req_mode [IN] searched lock mode
2791 * \retval boolean, true iff all bits are found
2793 int ll_have_md_lock(struct inode *inode, __u64 *bits, ldlm_mode_t l_req_mode)
2795 struct lustre_handle lockh;
2796 ldlm_policy_data_t policy;
/* LCK_MINMODE means "any mode": match against all of CR/CW/PR/PW. */
2797 ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2798 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2806 fid = &ll_i2info(inode)->lli_fid;
2807 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2808 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a matching granted lock, take no reference. */
2810 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe one inodebits bit at a time until every requested bit has been
 * accounted for.
 * NOTE(review): `1 << i` is an int shift against a __u64 bitmask; this
 * is only safe while MDS_INODELOCK_MAXSHIFT < 31 — confirm. */
2811 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2812 policy.l_inodebits.bits = *bits & (1 << i);
2813 if (policy.l_inodebits.bits == 0)
2816 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2817 &policy, mode, &lockh)) {
2818 struct ldlm_lock *lock;
/* A matching lock may cover more bits than the one probed; clear
 * every bit that lock grants, not just bit i. */
2820 lock = ldlm_handle2lock(&lockh);
2823 ~(lock->l_policy_data.l_inodebits.bits);
2824 LDLM_LOCK_PUT(lock);
2826 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and take a reference on) a cached MD inodebits lock covering
 * \a bits in one of the modes in \a mode; the matched lock handle is
 * returned through \a lockh.  \retval the matched mode, 0 if none.
 */
2833 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2834 struct lustre_handle *lockh, __u64 flags,
2837 ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2841 fid = &ll_i2info(inode)->lli_fid;
2842 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2844 rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
2845 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of a revalidation RPC: -ENOENT on a
 * non-regular/non-directory inode is tolerated (object already
 * unlinked); other errors are logged. */
2850 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2852 /* Already unlinked. Just update nlink and return success */
2853 if (rc == -ENOENT) {
2855 /* This path cannot be hit for regular files unless in
2856 * case of obscure races, so no need to validate size.
2858 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2860 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected failures (permission/identity), so
 * demote them to D_INFO instead of D_ERROR. */
2861 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2862 "%s: revalidate FID "DFID" error: rc = %d\n",
2863 ll_get_fsname(inode->i_sb, NULL, 0),
2864 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate the dentry's inode metadata against the MDS.
 * Two paths: when the server supports OBD_CONNECT_ATTRFID, issue an
 * intent (IT_GETATTR/IT_LOOKUP) lock-and-getattr by FID; otherwise,
 * if no matching MD lock is cached, fall back to a plain md_getattr RPC.
 */
2870 static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2872 struct inode *inode = d_inode(dentry);
2873 struct ptlrpc_request *req = NULL;
2874 struct obd_export *exp;
2877 LASSERT(inode != NULL);
2879 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2880 inode->i_ino, inode->i_generation, inode, dentry);
2882 exp = ll_i2mdexp(inode);
2884 /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2885 * But under CMD case, it caused some lock issues, should be fixed
2886 * with new CMD ibits lock. See bug 12718 */
2887 if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2888 struct lookup_intent oit = { .it_op = IT_GETATTR };
2889 struct md_op_data *op_data;
/* Only LOOKUP bits requested: the cheaper IT_LOOKUP intent suffices. */
2891 if (ibits == MDS_INODELOCK_LOOKUP)
2892 oit.it_op = IT_LOOKUP;
2894 /* Call getattr by fid, so do not provide name at all. */
2895 op_data = ll_prep_md_op_data(NULL, inode,
2897 LUSTRE_OPC_ANY, NULL);
2898 if (IS_ERR(op_data))
2899 return PTR_ERR(op_data);
2901 oit.it_create_mode |= M_CHECK_STALE;
2902 rc = md_intent_lock(exp, op_data, NULL, 0,
2903 /* we are not interested in name
2906 ll_md_blocking_ast, 0);
2907 ll_finish_md_op_data(op_data);
2908 oit.it_create_mode &= ~M_CHECK_STALE;
2910 rc = ll_inode_revalidate_fini(inode, rc);
2914 rc = ll_revalidate_it_finish(req, &oit, inode);
2916 ll_intent_release(&oit);
2920 /* Unlinked? Unhash dentry, so it is not picked up later by
2921 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2922 here to preserve get_cwd functionality on 2.6.
2924 if (!d_inode(dentry)->i_nlink)
2925 d_lustre_invalidate(dentry, 0);
2927 ll_lookup_finish_locks(&oit, inode);
/* No ATTRFID support: only go to the wire if we do not already hold
 * a cached MD lock covering the requested ibits.  ll_have_md_lock()
 * clears the bits it finds cached. */
2928 } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2929 struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
2930 u64 valid = OBD_MD_FLGETATTR;
2931 struct md_op_data *op_data;
/* Regular files also need the striping EA; size the reply buffer
 * for the default MDS EA size. */
2934 if (S_ISREG(inode->i_mode)) {
2935 rc = ll_get_default_mdsize(sbi, &ealen);
2938 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2941 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2942 0, ealen, LUSTRE_OPC_ANY,
2944 if (IS_ERR(op_data))
2945 return PTR_ERR(op_data);
2947 op_data->op_valid = valid;
2948 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2949 ll_finish_md_op_data(op_data);
2951 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2955 rc = ll_prep_inode(&inode, req, NULL, NULL);
2958 ptlrpc_req_finished(req);
/*
 * Revalidate metadata, then bring the cached timestamps/size up to
 * date: non-regular inodes copy times from the saved LVB, regular
 * files glimpse the OSTs for the current size (unless a HSM restore
 * is running, in which case the MDT-provided size is authoritative).
 */
2962 static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2964 struct inode *inode = d_inode(dentry);
2967 rc = __ll_inode_revalidate(dentry, ibits);
2971 /* if object isn't regular file, don't validate size */
2972 if (!S_ISREG(inode->i_mode)) {
2973 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2974 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2975 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2977 /* In case of restore, the MDT has the right size and has
2978 * already send it back without granting the layout lock,
2979 * inode is up-to-date so glimpse is useless.
2980 * Also to glimpse we need the layout, in case of a running
2981 * restore the MDT holds the layout lock so the glimpse will
2982 * block up to the end of restore (getattr will block)
2984 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2985 rc = ll_glimpse_size(inode);
/*
 * ->getattr(): revalidate UPDATE|LOOKUP metadata from the MDS, then
 * fill *stat from the (now fresh) in-core inode.
 */
2990 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2992 struct inode *inode = d_inode(de);
2993 struct ll_sb_info *sbi = ll_i2sbi(inode);
2994 struct ll_inode_info *lli = ll_i2info(inode);
2997 res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
2998 MDS_INODELOCK_LOOKUP);
2999 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3004 stat->dev = inode->i_sb->s_dev;
/* 32-bit userspace cannot take a 64-bit inode number; squash the FID
 * into a 32-bit ino in that case. */
3005 if (ll_need_32bit_api(sbi))
3006 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3008 stat->ino = inode->i_ino;
3009 stat->mode = inode->i_mode;
3010 stat->nlink = inode->i_nlink;
3011 stat->uid = inode->i_uid;
3012 stat->gid = inode->i_gid;
3013 stat->rdev = inode->i_rdev;
3014 stat->atime = inode->i_atime;
3015 stat->mtime = inode->i_mtime;
3016 stat->ctime = inode->i_ctime;
3017 stat->blksize = 1 << inode->i_blkbits;
3019 stat->size = i_size_read(inode);
3020 stat->blocks = inode->i_blocks;
/*
 * ->fiemap(): marshal the VFS fiemap_extent_info into a contiguous
 * ll_user_fiemap buffer, run the mapping, and copy the results back.
 * NOTE(review): the kvfree() of `fiemap` is not visible in this chunk —
 * confirm it is released on all paths.
 */
3025 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3026 __u64 start, __u64 len)
3030 struct ll_user_fiemap *fiemap;
3031 unsigned int extent_count = fieinfo->fi_extents_max;
3033 num_bytes = sizeof(*fiemap) + (extent_count *
3034 sizeof(struct ll_fiemap_extent));
3035 fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
3040 fiemap->fm_flags = fieinfo->fi_flags;
3041 fiemap->fm_extent_count = fieinfo->fi_extents_max;
3042 fiemap->fm_start = start;
3043 fiemap->fm_length = len;
/* Seed with the caller's first extent (used to continue a mapping). */
3044 if (extent_count > 0)
3045 memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3046 sizeof(struct ll_fiemap_extent));
3048 rc = ll_do_fiemap(inode, fiemap, num_bytes);
/* Copy flags, mapped-extent count and extent array back to the VFS. */
3050 fieinfo->fi_flags = fiemap->fm_flags;
3051 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3052 if (extent_count > 0)
3053 memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3054 fiemap->fm_mapped_extents *
3055 sizeof(struct ll_fiemap_extent));
/* ->get_acl(): hand out a reference on the cached POSIX ACL, taken
 * under lli_lock so the cached pointer cannot change underneath us. */
3061 struct posix_acl *ll_get_acl(struct inode *inode, int type)
3063 struct ll_inode_info *lli = ll_i2info(inode);
3064 struct posix_acl *acl = NULL;
3066 spin_lock(&lli->lli_lock);
3067 /* VFS' acl_permission_check->check_acl will release the refcount */
3068 acl = posix_acl_dup(lli->lli_posix_acl);
3069 spin_unlock(&lli->lli_lock);
/*
 * ->permission(): refuse RCU-walk (MAY_NOT_BLOCK) since we may need to
 * sleep; revalidate the root inode on first touch (it is never looked
 * up, so it skips the usual lookup-time revalidation); remote clients
 * delegate the check to the MDS, everyone else uses generic_permission().
 */
3075 int ll_inode_permission(struct inode *inode, int mask)
3079 #ifdef MAY_NOT_BLOCK
3080 if (mask & MAY_NOT_BLOCK)
3084 /* as root inode are NOT getting validated in lookup operation,
3085 * need to do it before permission check. */
3087 if (is_root_inode(inode)) {
3088 rc = __ll_inode_revalidate(inode->i_sb->s_root,
3089 MDS_INODELOCK_LOOKUP);
3094 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3095 inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3097 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3098 return lustre_check_remote_perm(inode, mask);
3100 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3101 rc = generic_permission(inode, mask);
3106 /* -o localflock - only provides locally consistent flock locks */
3107 struct file_operations ll_file_operations = {
3108 .read_iter = ll_file_read_iter,
3109 .write_iter = ll_file_write_iter,
3110 .unlocked_ioctl = ll_file_ioctl,
3111 .open = ll_file_open,
3112 .release = ll_file_release,
3113 .mmap = ll_file_mmap,
3114 .llseek = ll_file_seek,
3115 .splice_read = ll_file_splice_read,
/* Default (-o flock) file ops: identical to ll_file_operations but
 * wires .flock/.lock to ll_file_flock for cluster-wide locking. */
3120 struct file_operations ll_file_operations_flock = {
3121 .read_iter = ll_file_read_iter,
3122 .write_iter = ll_file_write_iter,
3123 .unlocked_ioctl = ll_file_ioctl,
3124 .open = ll_file_open,
3125 .release = ll_file_release,
3126 .mmap = ll_file_mmap,
3127 .llseek = ll_file_seek,
3128 .splice_read = ll_file_splice_read,
3131 .flock = ll_file_flock,
3132 .lock = ll_file_flock
3135 /* These are for -o noflock - to return ENOSYS on flock calls */
3136 struct file_operations ll_file_operations_noflock = {
3137 .read_iter = ll_file_read_iter,
3138 .write_iter = ll_file_write_iter,
3139 .unlocked_ioctl = ll_file_ioctl,
3140 .open = ll_file_open,
3141 .release = ll_file_release,
3142 .mmap = ll_file_mmap,
3143 .llseek = ll_file_seek,
3144 .splice_read = ll_file_splice_read,
3147 .flock = ll_file_noflock,
3148 .lock = ll_file_noflock
/* Inode operations shared by all regular-file variants above. */
3151 struct inode_operations ll_file_inode_operations = {
3152 .setattr = ll_setattr,
3153 .getattr = ll_getattr,
3154 .permission = ll_inode_permission,
3155 .setxattr = ll_setxattr,
3156 .getxattr = ll_getxattr,
3157 .listxattr = ll_listxattr,
3158 .removexattr = ll_removexattr,
3159 .fiemap = ll_fiemap,
3160 .get_acl = ll_get_acl,
3163 /* dynamic ioctl number support routines */
/* Global registry of dynamically registered ioctl handlers; the list
 * is protected by ioc_sem (readers iterate, writers add/remove). */
3164 static struct llioc_ctl_data {
3165 struct rw_semaphore ioc_sem;
3166 struct list_head ioc_head;
3168 __RWSEM_INITIALIZER(llioc.ioc_sem),
3169 LIST_HEAD_INIT(llioc.ioc_head)
/* One registration record: callback plus the ioctl numbers it claims.
 * iocd_cmd is a trailing variable-length array sized by iocd_count. */
3174 struct list_head iocd_list;
3175 unsigned int iocd_size;
3176 llioc_callback_t iocd_cb;
3177 unsigned int iocd_count;
3178 unsigned int iocd_cmd[0];
/*
 * Register a dynamic ioctl handler: callback \a cb claims the \a count
 * ioctl numbers in \a cmd.  Returns an opaque magic cookie (used by
 * ll_iocontrol_unregister()) or NULL on bad arguments / allocation
 * failure.  The record is added to the global llioc list under the
 * write side of llioc.ioc_sem.
 */
3181 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3184 struct llioc_data *in_data = NULL;
3186 if (cb == NULL || cmd == NULL ||
3187 count > LLIOC_MAX_CMD || count < 0)
/* Allocate header plus the trailing iocd_cmd[] array in one block. */
3190 size = sizeof(*in_data) + count * sizeof(unsigned int);
3191 in_data = kzalloc(size, GFP_NOFS);
/* kzalloc() already zero-fills the allocation; the redundant
 * memset(in_data, 0, sizeof(*in_data)) that used to follow here
 * has been removed. */
3196 in_data->iocd_size = size;
3197 in_data->iocd_cb = cb;
3198 in_data->iocd_count = count;
3199 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3201 down_write(&llioc.ioc_sem);
3202 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3203 up_write(&llioc.ioc_sem);
3207 EXPORT_SYMBOL(ll_iocontrol_register);
/*
 * Unregister a handler previously returned by ll_iocontrol_register():
 * find the record matching \a magic under the write semaphore, unlink
 * it (and, in lines outside this chunk, free it); warn if not found.
 */
3209 void ll_iocontrol_unregister(void *magic)
3211 struct llioc_data *tmp;
3216 down_write(&llioc.ioc_sem);
3217 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3219 list_del(&tmp->iocd_list);
3220 up_write(&llioc.ioc_sem);
3226 up_write(&llioc.ioc_sem);
3228 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3230 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * Dispatch an unrecognized ioctl to the registered dynamic handlers:
 * walk the registry under the read semaphore and invoke the callback
 * of every record that claims \a cmd, stopping once one returns
 * LLIOC_STOP.  The callback's status is passed back through *rcp.
 */
3232 static enum llioc_iter
3233 ll_iocontrol_call(struct inode *inode, struct file *file,
3234 unsigned int cmd, unsigned long arg, int *rcp)
3236 enum llioc_iter ret = LLIOC_CONT;
3237 struct llioc_data *data;
3238 int rc = -EINVAL, i;
3240 down_read(&llioc.ioc_sem);
3241 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3242 for (i = 0; i < data->iocd_count; i++) {
3243 if (cmd != data->iocd_cmd[i])
3246 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3250 if (ret == LLIOC_STOP)
3253 up_read(&llioc.ioc_sem);
/*
 * Push a layout configuration into the inode's cl_object via
 * cl_conf_set().  For OBJECT_CONF_SET the associated layout DLM lock
 * is only made matchable *after* the layout has been applied, so other
 * threads can never observe a stale layout through a matched lock.
 */
3260 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3262 struct ll_inode_info *lli = ll_i2info(inode);
3263 struct cl_env_nest nest;
/* No cl_object yet: nothing to configure. */
3267 if (lli->lli_clob == NULL)
3270 env = cl_env_nested_get(&nest);
3272 return PTR_ERR(env);
3274 result = cl_conf_set(env, lli->lli_clob, conf);
3275 cl_env_nested_put(&nest, env);
3277 if (conf->coc_opc == OBJECT_CONF_SET) {
3278 struct ldlm_lock *lock = conf->coc_lock;
3280 LASSERT(lock != NULL);
3281 LASSERT(ldlm_has_layout(lock));
3283 /* it can only be allowed to match after layout is
3284 * applied to inode otherwise false layout would be
3285 * seen. Applying layout should happen before dropping
3286 * the intent lock. */
3287 ldlm_lock_allow_match(lock);
3293 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3294 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3297 struct ll_sb_info *sbi = ll_i2sbi(inode);
3298 struct ptlrpc_request *req;
3299 struct mdt_body *body;
3305 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3306 PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3307 lock->l_lvb_data, lock->l_lvb_len);
/* LVB already populated and ready: nothing to fetch. */
3309 if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3312 /* if layout lock was granted right away, the layout is returned
3313 * within DLM_LVB of dlm reply; otherwise if the lock was ever
3314 * blocked and then granted via completion ast, we have to fetch
3315 * layout here. Please note that we can't use the LVB buffer in
3316 * completion AST because it doesn't have a large enough buffer */
3317 rc = ll_get_default_mdsize(sbi, &lmmsize);
/* Retrieve the LOV EA (the layout) for this FID from the MDS. */
3319 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3320 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3325 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3331 lmmsize = body->eadatasize;
3332 if (lmmsize == 0) /* empty layout */ {
3337 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
/* Copy the EA into a fresh buffer and install it as the lock's LVB,
 * replacing any previous one, under the resource lock. */
3343 lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
3344 if (lvbdata == NULL) {
3349 memcpy(lvbdata, lmm, lmmsize);
3350 lock_res_and_lock(lock);
3351 if (lock->l_lvb_data != NULL)
3352 kvfree(lock->l_lvb_data);
3354 lock->l_lvb_data = lvbdata;
3355 lock->l_lvb_len = lmmsize;
3356 unlock_res_and_lock(lock);
3359 ptlrpc_req_finished(req);
3364 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Unpack the layout carried in the lock's LVB, configure the inode's
 * cl_object with it, publish the layout generation through *gen, and
 * drop the lock reference.  If the object is still busy (-EBUSY), wait
 * for in-flight IO via an OBJECT_CONF_WAIT round.
 */
3367 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3368 struct inode *inode, __u32 *gen, bool reconf)
3370 struct ll_inode_info *lli = ll_i2info(inode);
3371 struct ll_sb_info *sbi = ll_i2sbi(inode);
3372 struct ldlm_lock *lock;
3373 struct lustre_md md = { NULL };
3374 struct cl_object_conf conf;
3377 bool wait_layout = false;
3379 LASSERT(lustre_handle_is_used(lockh));
3381 lock = ldlm_handle2lock(lockh);
3382 LASSERT(lock != NULL);
3383 LASSERT(ldlm_has_layout(lock));
3385 LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3386 inode, PFID(&lli->lli_fid), reconf);
3388 /* in case this is a caching lock and reinstate with new inode */
3389 md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3391 lock_res_and_lock(lock);
3392 lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3393 unlock_res_and_lock(lock);
3394 /* checking lvb_ready is racy but this is okay. The worst case is
3395 * that multi processes may configure the file on the same time. */
3396 if (lvb_ready || !reconf) {
3399 /* layout_gen must be valid if layout lock is not
3400 * cancelled and stripe has already set */
3401 *gen = ll_layout_version_get(lli);
/* Make sure the LVB actually holds the layout (fetch it via
 * getxattr if the lock was granted through a completion AST). */
3407 rc = ll_layout_fetch(inode, lock);
3411 /* for layout lock, lmm is returned in lock's lvb.
3412 * lvb_data is immutable if the lock is held so it's safe to access it
3413 * without res lock. See the description in ldlm_lock_decref_internal()
3414 * for the condition to free lvb_data of layout lock */
3415 if (lock->l_lvb_data != NULL) {
3416 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3417 lock->l_lvb_data, lock->l_lvb_len);
/* rc distinguishes an empty layout from a real one; publish the
 * generation accordingly (error path logs below). */
3419 *gen = LL_LAYOUT_GEN_EMPTY;
3421 *gen = md.lsm->lsm_layout_gen;
3424 CERROR("%s: file "DFID" unpackmd error: %d\n",
3425 ll_get_fsname(inode->i_sb, NULL, 0),
3426 PFID(&lli->lli_fid), rc);
3432 /* set layout to file. Unlikely this will fail as old layout was
3433 * surely eliminated */
3434 memset(&conf, 0, sizeof(conf));
3435 conf.coc_opc = OBJECT_CONF_SET;
3436 conf.coc_inode = inode;
3437 conf.coc_lock = lock;
3438 conf.u.coc_md = &md;
3439 rc = ll_layout_conf(inode, &conf);
3442 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3444 /* refresh layout failed, need to wait */
3445 wait_layout = rc == -EBUSY;
3448 LDLM_LOCK_PUT(lock);
3449 ldlm_lock_decref(lockh, mode);
3451 /* wait for IO to complete if it's still being used. */
3453 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3454 ll_get_fsname(inode->i_sb, NULL, 0),
3455 inode, PFID(&lli->lli_fid));
3457 memset(&conf, 0, sizeof(conf));
3458 conf.coc_opc = OBJECT_CONF_WAIT;
3459 conf.coc_inode = inode;
3460 rc = ll_layout_conf(inode, &conf);
3464 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3465 PFID(&lli->lli_fid), rc);
3471 * This function checks if there exists a LAYOUT lock on the client side,
3472 * or enqueues it if it doesn't have one in cache.
3474 * This function will not hold layout lock so it may be revoked any time after
3475 * this function returns. Any operations depend on layout should be redone
3478 * This function should be called before lov_io_init() to get an uptodate
3479 * layout version, the caller should save the version number and after IO
3480 * is finished, this function should be called again to verify that layout
3481 * is not changed during IO time.
3483 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3485 struct ll_inode_info *lli = ll_i2info(inode);
3486 struct ll_sb_info *sbi = ll_i2sbi(inode);
3487 struct md_op_data *op_data;
3488 struct lookup_intent it;
3489 struct lustre_handle lockh;
3491 struct ldlm_enqueue_info einfo = {
3492 .ei_type = LDLM_IBITS,
3494 .ei_cb_bl = ll_md_blocking_ast,
3495 .ei_cb_cp = ldlm_completion_ast,
/* Fast path: layout generation already known (or server does not
 * support layout locks) — nothing to refresh. */
3499 *gen = ll_layout_version_get(lli);
3500 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3504 LASSERT(fid_is_sane(ll_inode2fid(inode)));
3505 LASSERT(S_ISREG(inode->i_mode));
3507 /* take layout lock mutex to enqueue layout lock exclusively. */
3508 mutex_lock(&lli->lli_layout_mutex);
3511 /* mostly layout lock is caching on the local side, so try to match
3512 * it before grabbing layout lock mutex. */
3513 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3514 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3515 if (mode != 0) { /* hit cached lock */
3516 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3520 mutex_unlock(&lli->lli_layout_mutex);
3524 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3525 0, 0, LUSTRE_OPC_ANY, NULL);
3526 if (IS_ERR(op_data)) {
3527 mutex_unlock(&lli->lli_layout_mutex);
3528 return PTR_ERR(op_data);
3531 /* have to enqueue one */
3532 memset(&it, 0, sizeof(it));
3533 it.it_op = IT_LAYOUT;
3534 lockh.cookie = 0ULL;
3536 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3537 ll_get_fsname(inode->i_sb, NULL, 0), inode,
3538 PFID(&lli->lli_fid));
3540 rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
/* The intent's reply request is not needed once the lock handle is
 * in hand; release it and strip the lock from the intent so that
 * ll_intent_drop_lock() does not drop our reference. */
3542 if (it.d.lustre.it_data != NULL)
3543 ptlrpc_req_finished(it.d.lustre.it_data);
3544 it.d.lustre.it_data = NULL;
3546 ll_finish_md_op_data(op_data);
3548 mode = it.d.lustre.it_lock_mode;
3549 it.d.lustre.it_lock_mode = 0;
3550 ll_intent_drop_lock(&it);
3553 /* set lock data in case this is a new lock */
3554 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3555 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3559 mutex_unlock(&lli->lli_layout_mutex);
3565 * This function send a restore request to the MDT
3567 int ll_layout_restore(struct inode *inode)
3569 struct hsm_user_request *hur;
3572 len = sizeof(struct hsm_user_request) +
3573 sizeof(struct hsm_user_item);
3574 hur = kzalloc(len, GFP_NOFS);
3578 hur->hur_request.hr_action = HUA_RESTORE;
3579 hur->hur_request.hr_archive_id = 0;
3580 hur->hur_request.hr_flags = 0;
3581 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3582 sizeof(hur->hur_user_item[0].hui_fid));
3583 hur->hur_user_item[0].hui_extent.length = -1;
3584 hur->hur_request.hr_itemcount = 1;
3585 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,