/*
 * Copyright 2016 Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include <folly/MemoryMapping.h>
23 #include <folly/Format.h>
24 #include <folly/portability/GFlags.h>
25 #include <folly/portability/SysMman.h>
28 #include <folly/experimental/io/HugePages.h>
32 #include <sys/types.h>
33 #include <system_error>
// Default chunk size for splitting mlock/munlock/munmap calls.
// NOTE(review): the initializer (and, presumably, the per-platform #ifdef
// that selects it) is missing from this excerpt — confirm against the full
// file before relying on this declaration.
35 static constexpr ssize_t kDefaultMlockChunkSize =
37 // Linux implementations of unmap/mlock/munlock take a kernel
38 // semaphore and block other threads from doing other memory
39 // operations. Split the operations in chunks.
42 // MSVC doesn't have this problem, and calling munmap many times
43 // with the same address is a bad idea with the windows implementation.
// Runtime override for the chunk size; rounded up to a page-size multiple
// by memOpChunkSize() below, and non-positive values disable chunking.
48 DEFINE_int64(mlock_chunk_size, kDefaultMlockChunkSize,
49 "Maximum bytes to mlock/munlock/munmap at once "
50 "(will be rounded up to PAGESIZE). Ignored if negative.");
// Fallback for platforms whose <sys/mman.h> lacks MAP_POPULATE (prefault
// pages at mmap time); defining it to 0 turns the flag into a no-op there.
53 #define MAP_POPULATE 0
// Move constructor: takes over `other`'s mapping state.
// NOTE(review): the constructor body is missing from this excerpt —
// presumably it delegates to swap(other); confirm against the full file.
58 MemoryMapping::MemoryMapping(MemoryMapping&& other) noexcept {
// Map a region of an already-open File. Takes ownership of `file`.
// NOTE(review): the tail of the parameter list and the body (which
// presumably calls init(offset, length)) are missing from this excerpt.
62 MemoryMapping::MemoryMapping(File file, off_t offset, off_t length,
64 : file_(std::move(file)),
65 options_(std::move(options)) {
// Map a region of the file at path `name`, opening it read-write when
// options.writable is set, otherwise read-only; delegates to the File
// constructor. NOTE(review): the remaining delegated arguments are missing
// from this excerpt.
70 MemoryMapping::MemoryMapping(const char* name, off_t offset, off_t length,
72 : MemoryMapping(File(name, options.writable ? O_RDWR : O_RDONLY),
// Map a region of an existing file descriptor: wraps `fd` in a File
// (taking ownership of it) and delegates to the File constructor.
// NOTE(review): the final `Options options` parameter line appears to be
// missing from this excerpt.
77 MemoryMapping::MemoryMapping(int fd, off_t offset, off_t length,
79 : MemoryMapping(File(fd), offset, length, options) { }
// Create an anonymous (file-less) mapping of `length` bytes; init() treats
// a default-constructed file_ as the anonymous case. NOTE(review): the body
// is missing from this excerpt — presumably it calls init(0, length).
81 MemoryMapping::MemoryMapping(AnonymousType, off_t length, Options options)
82 : options_(std::move(options)) {
// Query huge-page parameters for the filesystem backing `device` (Linux
// build: uses folly's HugePages support). On a hugetlbfs device this
// presumably sets pageSize/autoExtend from the result — the rest of the
// body is missing from this excerpt; confirm against the full file.
89 void getDeviceOptions(dev_t device, off_t& pageSize, bool& autoExtend) {
90 auto ps = getHugePageSizeForDevice(device);
// Overload/stub of getDeviceOptions — presumably the no-op version for
// builds without huge-page support (the body and the #else context are
// missing from this excerpt; confirm against the full file).
97 inline void getDeviceOptions(dev_t device, off_t& pageSize,
// Core mapping setup shared by all constructors: validates options, picks
// the page size, rounds [offset, offset+length) out to page boundaries,
// optionally grows the backing file, mmap()s the region, and points data_
// at the caller-requested sub-range.
// NOTE(review): several interior lines (branch closers, struct stat
// declaration, else-arms) are missing from this excerpt — the comments
// below describe only what the visible lines establish.
103 void MemoryMapping::init(off_t offset, off_t length) {
104 const bool grow = options_.grow;
// No backing file means this is an anonymous mapping; growing an
// anonymous mapping makes no sense, hence the CHECK.
105 const bool anon = !file_;
106 CHECK(!(grow && anon));
108 off_t& pageSize = options_.pageSize;
112 // On Linux, hugetlbfs file systems don't require ftruncate() to grow the
113 // file, and (on kernels before 2.6.24) don't even allow it. Also, the file
114 // size is always a multiple of the page size.
115 bool autoExtend = false;
// File-backed path: stat the fd so we know its device (for huge-page
// detection) and size.
119 CHECK_ERR(fstat(file_.fd(), &st));
122 getDeviceOptions(st.st_dev, pageSize, autoExtend);
// Anonymous path (presumably): offset must be 0 and an explicit pageSize
// is rejected — TODO confirm the enclosing branch, its opener is missing.
126 DCHECK_EQ(offset, 0);
127 CHECK_EQ(pageSize, 0);
// Default to the system page size when none was forced via options.
132 pageSize = off_t(sysconf(_SC_PAGESIZE));
135 CHECK_GT(pageSize, 0);
136 CHECK_EQ(pageSize & (pageSize - 1), 0); // power of two
139 // Round down the start of the mapped region
140 off_t skipStart = offset % pageSize;
// mapLength_ == -1 means "map to end of file"; otherwise widen the
// requested length to cover the rounded-down start...
144 if (mapLength_ != -1) {
145 mapLength_ += skipStart;
147 // Round up the end of the mapped region
148 mapLength_ = (mapLength_ + pageSize - 1) / pageSize * pageSize;
// Bytes available from `offset` to the end of the file (or the requested
// length for anonymous mappings).
151 off_t remaining = anon ? length : st.st_size - offset;
153 if (mapLength_ == -1) {
154 length = mapLength_ = remaining;
// Caller asked for more than the file holds: with grow, extend the file.
156 if (length > remaining) {
// autoExtend (hugetlbfs) skips ftruncate — see the comment above; the
// plain-file arm uses ftruncate to grow the file. Branch openers are
// missing from this excerpt.
159 PCHECK(0 == ftruncate(file_.fd(), offset + length))
160 << "ftruncate() failed, couldn't grow file to "
164 // Extend mapping to multiple of page size, don't use ftruncate
165 remaining = mapLength_;
// Without grow, clamp the mapping to what actually exists.
171 if (mapLength_ > remaining) {
172 mapLength_ = remaining;
// Translate options into mmap(2) flags and protection bits.
180 int flags = options_.shared ? MAP_SHARED : MAP_PRIVATE;
181 if (anon) flags |= MAP_ANONYMOUS;
182 if (options_.prefault) flags |= MAP_POPULATE;
184 // The standard doesn't actually require PROT_NONE to be zero...
185 int prot = PROT_NONE;
186 if (options_.readable || options_.writable) {
187 prot = ((options_.readable ? PROT_READ : 0) |
188 (options_.writable ? PROT_WRITE : 0));
191 unsigned char* start = static_cast<unsigned char*>(
192 mmap(options_.address, mapLength_, prot, flags, file_.fd(), offset));
193 PCHECK(start != MAP_FAILED)
194 << " offset=" << offset
195 << " length=" << mapLength_;
// data_ exposes only the caller's requested window, skipping the
// page-alignment padding at the front.
197 data_.reset(start + skipStart, length);
// Compute the per-call chunk size for chunked mlock/munlock/munmap:
// FLAGS_mlock_chunk_size rounded up to a multiple of pageSize; a
// non-positive flag value presumably yields `length` (single operation).
// NOTE(review): the return statements and some lines are missing from
// this excerpt.
203 off_t memOpChunkSize(off_t length, off_t pageSize) {
204 off_t chunkSize = length;
205 if (FLAGS_mlock_chunk_size <= 0) {
209 chunkSize = off_t(FLAGS_mlock_chunk_size);
210 off_t r = chunkSize % pageSize;
// Round the configured size up to the next page boundary (only when
// r != 0 — the guarding condition is missing from this excerpt).
212 chunkSize += (pageSize - r);
218 * Run @op in chunks over the buffer @mem of @bufSize length.
221 * - success: true + amountSucceeded == bufSize (op success on whole buffer)
222 * - failure: false + amountSucceeded == nr bytes on which op succeeded.
// Applies `op` (mlock/munlock/munmap-shaped: int(void*, size_t)) to the
// buffer one chunk at a time so the kernel's mmap semaphore is released
// between chunks. NOTE(review): the `return false` / `return true`
// statements are missing from this excerpt.
224 bool memOpInChunks(std::function<int(void*, size_t)> op,
225 void* mem, size_t bufSize, off_t pageSize,
226 size_t& amountSucceeded)
227 // Linux' unmap/mlock/munlock take a kernel semaphore and block other threads
228 // from doing other memory operations. If the size of the buffer is big the
229 // semaphore can be down for seconds (for benchmarks see
230 // http://kostja-osipov.livejournal.com/42963.html). Doing the operations in
231 // chunks breaks the locking into intervals and lets other threads do memory
232 // operations of their own.
234 size_t chunkSize = memOpChunkSize(off_t(bufSize), pageSize);
236 char* addr = static_cast<char*>(mem);
// Walk the buffer; the final chunk may be smaller than chunkSize.
239 while (amountSucceeded < bufSize) {
240 size_t size = std::min(chunkSize, bufSize - amountSucceeded);
// On the first failing chunk, stop: amountSucceeded reports how far we got.
241 if (op(addr + amountSucceeded, size) != 0) {
244 amountSucceeded += size;
250 } // anonymous namespace
// Pin the whole mapping into RAM via mlock(2), in chunks. Sets locked_ to
// the outcome. Under LockMode::TRY_LOCK, EPERM/ENOMEM (e.g. exceeding
// RLIMIT_MEMLOCK) are downgraded to warnings; on partial success the
// already-locked prefix is munlock()ed again. NOTE(review): several lines
// (the trailing memOpInChunks arguments, the MUST_LOCK arm, the return)
// are missing from this excerpt.
252 bool MemoryMapping::mlock(LockMode lock) {
253 size_t amountSucceeded = 0;
254 locked_ = memOpInChunks(::mlock, mapStart_, mapLength_, options_.pageSize,
// Error message built once and reused by the branches below.
261 folly::format("mlock({}) failed at {}", mapLength_, amountSucceeded);
262 if (lock == LockMode::TRY_LOCK && errno == EPERM) {
263 PLOG(WARNING) << msg;
264 } else if (lock == LockMode::TRY_LOCK && errno == ENOMEM) {
270 // only part of the buffer was mlocked, unlock it back
271 if (!memOpInChunks(::munlock, mapStart_, amountSucceeded, options_.pageSize,
273 PLOG(WARNING) << "munlock()";
// Undo mlock(): no-op unless locked_; munlock(2)s the mapping in chunks
// and, when `dontneed` is set, additionally tells the kernel it may drop
// the pages (madvise MADV_DONTNEED). Failures are logged, not fatal.
// NOTE(review): trailing call arguments, the `locked_ = false` reset and
// closing braces appear to be missing from this excerpt.
279 void MemoryMapping::munlock(bool dontneed) {
280 if (!locked_) return;
282 size_t amountSucceeded = 0;
283 if (!memOpInChunks(::munlock, mapStart_, mapLength_, options_.pageSize,
285 PLOG(WARNING) << "munlock()";
287 if (mapLength_ && dontneed &&
288 ::madvise(mapStart_, mapLength_, MADV_DONTNEED)) {
289 PLOG(WARNING) << "madvise()";
294 void MemoryMapping::hintLinearScan() {
295 advise(MADV_SEQUENTIAL);
// Destructor: munmap(2)s the region in chunks (RAII release of the
// mapping). Failure to unmap is unrecoverable, hence PLOG(FATAL).
// NOTE(review): the guard around the unmap (presumably `if (mapLength_)`)
// and the closing braces are missing from this excerpt.
298 MemoryMapping::~MemoryMapping() {
300 size_t amountSucceeded = 0;
301 if (!memOpInChunks(::munmap, mapStart_, mapLength_, options_.pageSize,
303 PLOG(FATAL) << folly::format("munmap({}) failed at {}",
304 mapLength_, amountSucceeded);
309 void MemoryMapping::advise(int advice) const { advise(advice, 0, mapLength_); }
// Apply madvise(2) `advice` to the sub-range [offset, offset+length) of
// the mapping, widening/narrowing it to page boundaries as madvise
// requires. Errors are logged as warnings, not fatal.
// NOTE(review): some interior lines (e.g. a zero-length early return and
// the closing brace) appear to be missing from this excerpt.
311 void MemoryMapping::advise(int advice, size_t offset, size_t length) const {
312 CHECK_LE(offset + length, size_t(mapLength_))
313 << " offset: " << offset
314 << " length: " << length
315 << " mapLength_: " << mapLength_;
317 // Include the entire start page: round down to page boundary.
318 const auto offMisalign = offset % options_.pageSize;
319 offset -= offMisalign;
320 length += offMisalign;
322 // Round the last page down to page boundary.
// Only trim when the range does not already end exactly at the end of
// the mapping (where a partial last page is acceptable).
323 if (offset + length != size_t(mapLength_)) {
324 length -= length % options_.pageSize;
331 char* mapStart = static_cast<char*>(mapStart_) + offset;
332 PLOG_IF(WARNING, ::madvise(mapStart, length, advice)) << "madvise";
// Copy-and-swap (unifying) assignment: takes `other` by value and
// presumably swaps with it so the old mapping is released by `other`'s
// destructor. NOTE(review): the body is missing from this excerpt.
335 MemoryMapping& MemoryMapping::operator=(MemoryMapping other) {
// Member-wise noexcept swap of all mapping state with `other`; the basis
// of move construction/assignment.
// NOTE(review): the `using std::swap;` declaration these unqualified
// calls rely on (and the closing brace) appear to be missing from this
// excerpt — confirm against the full file.
340 void MemoryMapping::swap(MemoryMapping& other) noexcept {
342 swap(this->file_, other.file_);
343 swap(this->mapStart_, other.mapStart_);
344 swap(this->mapLength_, other.mapLength_);
345 swap(this->options_, other.options_);
346 swap(this->locked_, other.locked_);
347 swap(this->data_, other.data_);
350 void swap(MemoryMapping& a, MemoryMapping& b) noexcept { a.swap(b); }
// Forward (low-to-high address) memcpy for word-aligned src/dst: copies
// whole unsigned longs first, then presumably the remaining tail bytes.
// Both pointers must be aligned to alignof(unsigned long) (asserted).
// NOTE(review): the word-copy loop body, the tail byte copy and the
// closing brace are missing from this excerpt.
352 void alignedForwardMemcpy(void* dst, const void* src, size_t size) {
353 assert(reinterpret_cast<uintptr_t>(src) % alignof(unsigned long) == 0);
354 assert(reinterpret_cast<uintptr_t>(dst) % alignof(unsigned long) == 0);
356 auto srcl = static_cast<const unsigned long*>(src);
357 auto dstl = static_cast<unsigned long*>(dst);
// Bulk phase: copy a machine word at a time while at least one word
// remains.
359 while (size >= sizeof(unsigned long)) {
361 size -= sizeof(unsigned long);
// Tail phase: reinterpret the cursors as byte pointers for the remainder.
364 auto srcc = reinterpret_cast<const unsigned char*>(srcl);
365 auto dstc = reinterpret_cast<unsigned char*>(dstl);
// Copy file `src` to `dest` (created/truncated with `mode`) by mmap()ing
// both and doing an aligned forward memcpy: the source is mapped read-only
// with a sequential-scan hint, the destination writable and grown to the
// source's size. NOTE(review): one constructor-argument line and the
// closing brace appear to be missing from this excerpt.
373 void mmapFileCopy(const char* src, const char* dest, mode_t mode) {
374 MemoryMapping srcMap(src);
375 srcMap.hintLinearScan();
377 MemoryMapping destMap(
378 File(dest, O_RDWR | O_CREAT | O_TRUNC, mode),
380 off_t(srcMap.range().size()),
381 MemoryMapping::writable());
383 alignedForwardMemcpy(destMap.writableRange().data(),
384 srcMap.range().data(),
385 srcMap.range().size());