2 * Copyright 2016 Facebook, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include <sys/types.h>
20 #include <sys/socket.h>
21 #include <folly/SocketAddress.h>
22 #include <folly/io/ShutdownSocketSet.h>
23 #include <folly/io/IOBuf.h>
24 #include <folly/io/async/AsyncTimeout.h>
25 #include <folly/io/async/AsyncSocketException.h>
26 #include <folly/io/async/AsyncTransport.h>
27 #include <folly/io/async/EventHandler.h>
28 #include <folly/io/async/DelayedDestruction.h>
37 * A class for performing asynchronous I/O on a socket.
39 * AsyncSocket allows users to asynchronously wait for data on a socket, and
40 * to asynchronously send data.
42 * The APIs for reading and writing are intentionally asymmetric. Waiting for
43 * data to read is a persistent API: a callback is installed, and is notified
44 * whenever new data is available. It continues to be notified of new events
45 * until it is uninstalled.
47 * AsyncSocket does not provide read timeout functionality, because it
48 * typically cannot determine when the timeout should be active. Generally, a
49 * timeout should only be enabled when processing is blocked waiting on data
50 * from the remote endpoint. For server sockets, the timeout should not be
51 * active if the server is currently processing one or more outstanding
52 * requests for this socket. For client sockets, the timeout should not be
53 * active if there are no requests pending on the socket. Additionally, if a
54 * client has multiple pending requests, it will usually want a separate
55 * timeout for each request, rather than a single read timeout.
57 * The write API is fairly intuitive: a user can request to send a block of
58 * data, and a callback will be informed once the entire block has been
59 * transferred to the kernel, or on error. AsyncSocket does provide a send
60 * timeout, since most callers want to give up if the remote end stops
61 * responding and no further progress can be made sending the data.
64 class AsyncSocket : virtual public AsyncTransportWrapper {
66 typedef std::unique_ptr<AsyncSocket, Destructor> UniquePtr;
68 class ConnectCallback {
70 virtual ~ConnectCallback() = default;
73 * connectSuccess() will be invoked when the connection has been
74 * successfully established.
76 virtual void connectSuccess() noexcept = 0;
79 * connectErr() will be invoked if the connection attempt fails.
81 * @param ex An exception describing the error that occurred.
83 virtual void connectErr(const AsyncSocketException& ex)
87 explicit AsyncSocket();
89 * Create a new unconnected AsyncSocket.
91 * connect() must later be called on this socket to establish a connection.
93 explicit AsyncSocket(EventBase* evb);
95 void setShutdownSocketSet(ShutdownSocketSet* ss);
98 * Create a new AsyncSocket and begin the connection process.
100 * @param evb EventBase that will manage this socket.
101 * @param address The address to connect to.
102 * @param connectTimeout Optional timeout in milliseconds for the connection
105 AsyncSocket(EventBase* evb,
106 const folly::SocketAddress& address,
107 uint32_t connectTimeout = 0);
110 * Create a new AsyncSocket and begin the connection process.
112 * @param evb EventBase that will manage this socket.
113 * @param ip IP address to connect to (dotted-quad).
114 * @param port Destination port in host byte order.
115 * @param connectTimeout Optional timeout in milliseconds for the connection
118 AsyncSocket(EventBase* evb,
119 const std::string& ip,
121 uint32_t connectTimeout = 0);
124 * Create an AsyncSocket from an already connected socket file descriptor.
126 * Note that while AsyncSocket enables TCP_NODELAY for sockets it creates
127 * when connecting, it does not change the socket options when given an
128 * existing file descriptor. If callers want TCP_NODELAY enabled when using
129 * this version of the constructor, they need to explicitly call
130 * setNoDelay(true) after the constructor returns.
132 * @param evb EventBase that will manage this socket.
133 * @param fd File descriptor to take over (should be a connected socket).
135 AsyncSocket(EventBase* evb, int fd);
138 * Helper function to create a shared_ptr<AsyncSocket>.
140 * This passes in the correct destructor object, since AsyncSocket's
141 * destructor is protected and cannot be invoked directly.
143 static std::shared_ptr<AsyncSocket> newSocket(EventBase* evb) {
144 return std::shared_ptr<AsyncSocket>(new AsyncSocket(evb),
149 * Helper function to create a shared_ptr<AsyncSocket>.
151 static std::shared_ptr<AsyncSocket> newSocket(
153 const folly::SocketAddress& address,
154 uint32_t connectTimeout = 0) {
155 return std::shared_ptr<AsyncSocket>(
156 new AsyncSocket(evb, address, connectTimeout),
161 * Helper function to create a shared_ptr<AsyncSocket>.
163 static std::shared_ptr<AsyncSocket> newSocket(
165 const std::string& ip,
167 uint32_t connectTimeout = 0) {
168 return std::shared_ptr<AsyncSocket>(
169 new AsyncSocket(evb, ip, port, connectTimeout),
174 * Helper function to create a shared_ptr<AsyncSocket>.
176 static std::shared_ptr<AsyncSocket> newSocket(EventBase* evb, int fd) {
177 return std::shared_ptr<AsyncSocket>(new AsyncSocket(evb, fd),
182 * Destroy the socket.
184 * AsyncSocket::destroy() must be called to destroy the socket.
185 * The normal destructor is protected, and should not be invoked directly.
186 * This prevents callers from deleting an AsyncSocket while it is invoking a
189 virtual void destroy() override;
192 * Get the EventBase used by this socket.
194 EventBase* getEventBase() const override {
199 * Get the file descriptor used by the AsyncSocket.
201 virtual int getFd() const {
206 * Extract the file descriptor from the AsyncSocket.
208 * This will immediately cause any installed callbacks to be invoked with an
209 * error. The AsyncSocket may no longer be used after the file descriptor
210 * has been extracted.
212 * Returns the file descriptor. The caller assumes ownership of the
213 * descriptor, and it will not be closed when the AsyncSocket is destroyed.
215 virtual int detachFd();
218 * Uniquely identifies a handle to a socket option value. Each
219 * combination of level and option name corresponds to one socket
224 bool operator<(const OptionKey& other) const {
225 if (level == other.level) {
226 return optname < other.optname;
228 return level < other.level;
230 int apply(int fd, int val) const {
231 return setsockopt(fd, level, optname, &val, sizeof(val));
237 // Maps from a socket option key to its value
238 typedef std::map<OptionKey, int> OptionMap;
240 static const OptionMap emptyOptionMap;
241 static const folly::SocketAddress& anyAddress();
244 * Initiate a connection.
246 * @param callback The callback to inform when the connection attempt
248 * @param address The address to connect to.
249 * @param timeout A timeout value, in milliseconds. If the connection
250 * does not succeed within this period,
251 * callback->connectErr() will be invoked.
253 virtual void connect(ConnectCallback* callback,
254 const folly::SocketAddress& address,
256 const OptionMap &options = emptyOptionMap,
257 const folly::SocketAddress& bindAddr = anyAddress()
259 void connect(ConnectCallback* callback, const std::string& ip, uint16_t port,
261 const OptionMap &options = emptyOptionMap) noexcept;
264 * If a connect request is in-flight, cancels it and closes the socket
265 * immediately. Otherwise, this is a no-op.
267 * This does not invoke any connection related callbacks. Call this to
268 * prevent any connect callback while cleaning up, etc.
270 void cancelConnect();
273 * Set the send timeout.
275 * If write requests do not make any progress for more than the specified
276 * number of milliseconds, fail all pending writes and close the socket.
278 * If write requests are currently pending when setSendTimeout() is called,
279 * the timeout interval is immediately restarted using the new value.
281 * (See the comments for AsyncSocket for an explanation of why AsyncSocket
282 * provides setSendTimeout() but not setRecvTimeout().)
284 * @param milliseconds The timeout duration, in milliseconds. If 0, no
285 * timeout will be used.
287 void setSendTimeout(uint32_t milliseconds) override;
290 * Get the send timeout.
292 * @return Returns the current send timeout, in milliseconds. A return value
293 * of 0 indicates that no timeout is set.
295 uint32_t getSendTimeout() const override {
300 * Set the maximum number of reads to execute from the underlying
301 * socket each time the EventBase detects that new ingress data is
302 * available. The default is unlimited, but callers can use this method
303 * to limit the amount of data read from the socket per event loop
306 * @param maxReads Maximum number of reads per data-available event;
307 * a value of zero means unlimited.
309 void setMaxReadsPerEvent(uint16_t maxReads) {
310 maxReadsPerEvent_ = maxReads;
314 * Get the maximum number of reads this object will execute from
315 * the underlying socket each time the EventBase detects that new
316 * ingress data is available.
318 * @returns Maximum number of reads per data-available event; a value
319 * of zero means unlimited.
321 uint16_t getMaxReadsPerEvent() const {
322 return maxReadsPerEvent_;
325 // Read and write methods
326 void setReadCB(ReadCallback* callback) override;
327 ReadCallback* getReadCallback() const override;
329 void write(WriteCallback* callback, const void* buf, size_t bytes,
330 WriteFlags flags = WriteFlags::NONE) override;
331 void writev(WriteCallback* callback, const iovec* vec, size_t count,
332 WriteFlags flags = WriteFlags::NONE) override;
333 void writeChain(WriteCallback* callback,
334 std::unique_ptr<folly::IOBuf>&& buf,
335 WriteFlags flags = WriteFlags::NONE) override;
338 virtual void writeRequest(WriteRequest* req);
339 void writeRequestReady() {
343 // Methods inherited from AsyncTransport
344 void close() override;
345 void closeNow() override;
346 void closeWithReset() override;
347 void shutdownWrite() override;
348 void shutdownWriteNow() override;
350 bool readable() const override;
351 bool isPending() const override;
352 virtual bool hangup() const;
353 bool good() const override;
354 bool error() const override;
355 void attachEventBase(EventBase* eventBase) override;
356 void detachEventBase() override;
357 bool isDetachable() const override;
359 void getLocalAddress(
360 folly::SocketAddress* address) const override;
362 folly::SocketAddress* address) const override;
364 bool isEorTrackingEnabled() const override { return false; }
366 void setEorTracking(bool /*track*/) override {}
368 bool connecting() const override {
369 return (state_ == StateEnum::CONNECTING);
372 virtual bool isClosedByPeer() const {
373 return (state_ == StateEnum::CLOSED &&
374 (readErr_ == READ_EOF || readErr_ == READ_ERROR));
377 virtual bool isClosedBySelf() const {
378 return (state_ == StateEnum::CLOSED &&
379 (readErr_ != READ_EOF && readErr_ != READ_ERROR));
382 size_t getAppBytesWritten() const override {
383 return appBytesWritten_;
386 size_t getRawBytesWritten() const override {
387 return getAppBytesWritten();
390 size_t getAppBytesReceived() const override {
391 return appBytesReceived_;
394 size_t getRawBytesReceived() const override {
395 return getAppBytesReceived();
398 std::chrono::nanoseconds getConnectTime() const {
399 return connectEndTime_ - connectStartTime_;
402 // Methods controlling socket options
405 * Force writes to be transmitted immediately.
407 * This controls the TCP_NODELAY socket option. When enabled, TCP segments
408 * are sent as soon as possible, even if it is not a full frame of data.
409 * When disabled, the data may be buffered briefly to try and wait for a full
412 * By default, TCP_NODELAY is enabled for AsyncSocket objects.
414 * This method will fail if the socket is not currently open.
416 * @return Returns 0 if the TCP_NODELAY flag was successfully updated,
417 * or a non-zero errno value on error.
419 int setNoDelay(bool noDelay);
423 * Set the FD_CLOEXEC flag so that the socket will be closed if the program
424 * later forks and execs.
426 void setCloseOnExec();
429 * Set the Flavor of Congestion Control to be used for this Socket
430 * Please check '/lib/modules/<kernel>/kernel/net/ipv4' for tcp_*.ko
431 * first to make sure the module is available for plugging in
432 * Alternatively you can choose from net.ipv4.tcp_allowed_congestion_control
434 int setCongestionFlavor(const std::string &cname);
437 * Forces ACKs to be sent immediately
439 * @return Returns 0 if the TCP_QUICKACK flag was successfully updated,
440 * or a non-zero errno value on error.
442 int setQuickAck(bool quickack);
445 * Set the send bufsize
447 int setSendBufSize(size_t bufsize);
450 * Set the recv bufsize
452 int setRecvBufSize(size_t bufsize);
455 * Sets a specific tcp personality
456 * Available only on kernels 3.2 and greater
458 #define SO_SET_NAMESPACE 41
459 int setTCPProfile(int profd);
462 * Set TCP_CORK on the socket, and turn on/off the persistentCork_ flag
464 * When persistentCork_ is true, CorkGuard in AsyncSSLSocket will not be
465 * able to toggle TCP_CORK
468 void setPersistentCork(bool cork);
471 * Generic API for reading a socket option.
473 * @param level same as the "level" parameter in getsockopt().
474 * @param optname same as the "optname" parameter in getsockopt().
475 * @param optval pointer to the variable in which the option value should
477 * @param optlen value-result argument, initially containing the size of
478 * the buffer pointed to by optval, and modified on return
479 * to indicate the actual size of the value returned.
480 * @return same as the return value of getsockopt().
482 template <typename T>
483 int getSockOpt(int level, int optname, T* optval, socklen_t* optlen) {
484 return getsockopt(fd_, level, optname, (void*) optval, optlen);
488 * Generic API for setting a socket option.
490 * @param level same as the "level" parameter in setsockopt().
491 * @param optname same as the "optname" parameter in setsockopt().
492 * @param optval the option value to set.
493 * @return same as the return value of setsockopt().
495 template <typename T>
496 int setSockOpt(int level, int optname, const T *optval) {
497 return setsockopt(fd_, level, optname, optval, sizeof(T));
500 virtual void setPeek(bool peek) {
504 enum class StateEnum : uint8_t {
512 void setBufferCallback(BufferCallback* cb);
515 * A WriteRequest object tracks information about a pending write operation.
519 WriteRequest(AsyncSocket* socket, WriteCallback* callback) :
520 socket_(socket), callback_(callback) {}
522 virtual void start() {};
524 virtual void destroy() = 0;
526 virtual bool performWrite() = 0;
528 virtual void consume() = 0;
530 virtual bool isComplete() = 0;
532 WriteRequest* getNext() const {
536 WriteCallback* getCallback() const {
540 uint32_t getTotalBytesWritten() const {
541 return totalBytesWritten_;
544 void append(WriteRequest* next) {
545 assert(next_ == nullptr);
549 void fail(const char* fn, const AsyncSocketException& ex) {
550 socket_->failWrite(fn, ex);
553 void bytesWritten(size_t count) {
554 totalBytesWritten_ += count;
555 socket_->appBytesWritten_ += count;
559 // protected destructor, to ensure callers use destroy()
560 virtual ~WriteRequest() {}
562 AsyncSocket* socket_; ///< parent socket
563 WriteRequest* next_{nullptr}; ///< pointer to next WriteRequest
564 WriteCallback* callback_; ///< completion callback
565 uint32_t totalBytesWritten_{0}; ///< total bytes written
569 enum ReadResultEnum {
577 * Protected destructor.
579 * Users of AsyncSocket must never delete it directly. Instead, invoke
580 * destroy(). (See the documentation in DelayedDestruction.h for
585 friend std::ostream& operator << (std::ostream& os, const StateEnum& state);
588 /// shutdownWrite() called, but we are still waiting on writes to drain
589 SHUT_WRITE_PENDING = 0x01,
590 /// writes have been completely shut down
593 * Reads have been shutdown.
595 * At the moment we don't distinguish between remote read shutdown
596 * (received EOF from the remote end) and local read shutdown. We can
597 * only receive EOF when a read callback is set, and we immediately inform
598 * it of the EOF. Therefore there doesn't seem to be any reason to have a
599 * separate state of "received EOF but the local side may still want to
602 * We also don't currently provide any API for only shutting down the read
603 * side of a socket. (This is a no-op as far as TCP is concerned, anyway.)
608 class BytesWriteRequest;
610 class WriteTimeout : public AsyncTimeout {
612 WriteTimeout(AsyncSocket* socket, EventBase* eventBase)
613 : AsyncTimeout(eventBase)
616 virtual void timeoutExpired() noexcept {
617 socket_->timeoutExpired();
621 AsyncSocket* socket_;
624 class IoHandler : public EventHandler {
626 IoHandler(AsyncSocket* socket, EventBase* eventBase)
627 : EventHandler(eventBase, -1)
629 IoHandler(AsyncSocket* socket, EventBase* eventBase, int fd)
630 : EventHandler(eventBase, fd)
633 virtual void handlerReady(uint16_t events) noexcept {
634 socket_->ioReady(events);
638 AsyncSocket* socket_;
643 class ImmediateReadCB : public folly::EventBase::LoopCallback {
645 explicit ImmediateReadCB(AsyncSocket* socket) : socket_(socket) {}
646 void runLoopCallback() noexcept override {
647 DestructorGuard dg(socket_);
648 socket_->checkForImmediateRead();
651 AsyncSocket* socket_;
655 * Schedule checkForImmediateRead to be executed in the next loop
658 void scheduleImmediateRead() noexcept {
660 eventBase_->runInLoop(&immediateReadHandler_);
664 // event notification methods
665 void ioReady(uint16_t events) noexcept;
666 virtual void checkForImmediateRead() noexcept;
667 virtual void handleInitialReadWrite() noexcept;
668 virtual void prepareReadBuffer(void** buf, size_t* buflen) noexcept;
669 virtual void handleRead() noexcept;
670 virtual void handleWrite() noexcept;
671 virtual void handleConnect() noexcept;
672 void timeoutExpired() noexcept;
675 * Attempt to read from the socket.
677 * @param buf The buffer to read data into.
678 * @param buflen The length of the buffer.
680 * @return Returns the number of bytes read, or READ_EOF on EOF, or
681 * READ_ERROR on error, or READ_BLOCKING if the operation will
684 virtual ssize_t performRead(void** buf, size_t* buflen, size_t* offset);
687 * Populate an iovec array from an IOBuf and attempt to write it.
689 * @param callback Write completion/error callback.
690 * @param vec Target iovec array; caller retains ownership.
691 * @param count Number of IOBufs to write, beginning at start of buf.
692 * @param buf Chain of IOBufs.
693 * @param flags set of flags for the underlying write calls, like cork
695 void writeChainImpl(WriteCallback* callback, iovec* vec,
696 size_t count, std::unique_ptr<folly::IOBuf>&& buf,
700 * Write as much data as possible to the socket without blocking,
701 * and queue up any leftover data to send when the socket can
702 * handle writes again.
704 * @param callback The callback to invoke when the write is completed.
705 * @param vec Array of buffers to write; this method will make a
706 * copy of the vector (but not the buffers themselves)
707 * if the write has to be completed asynchronously.
708 * @param count Number of elements in vec.
709 * @param buf The IOBuf that manages the buffers referenced by
710 * vec, or a pointer to nullptr if the buffers are not
711 * associated with an IOBuf. Note that ownership of
712 * the IOBuf is transferred here; upon completion of
713 * the write, the AsyncSocket deletes the IOBuf.
714 * @param flags Set of write flags.
716 void writeImpl(WriteCallback* callback, const iovec* vec, size_t count,
717 std::unique_ptr<folly::IOBuf>&& buf,
718 WriteFlags flags = WriteFlags::NONE);
721 * Attempt to write to the socket.
723 * @param vec The iovec array pointing to the buffers to write.
724 * @param count The length of the iovec array.
725 * @param flags Set of write flags.
726 * @param countWritten On return, the value pointed to by this parameter
727 * will contain the number of iovec entries that were
729 * @param partialWritten On return, the value pointed to by this parameter
730 * will contain the number of bytes written in the
731 * partially written iovec entry.
733 * @return Returns the total number of bytes written, or -1 on error. If no
734 * data can be written immediately, 0 is returned.
736 virtual ssize_t performWrite(const iovec* vec, uint32_t count,
737 WriteFlags flags, uint32_t* countWritten,
738 uint32_t* partialWritten);
740 bool updateEventRegistration();
743 * Update event registration.
745 * @param enable Flags of events to enable. Set it to 0 if no events
746 * need to be enabled in this call.
747 * @param disable Flags of events
748 * to disable. Set it to 0 if no events need to be disabled in this
751 * @return true iff the update is successful.
753 bool updateEventRegistration(uint16_t enable, uint16_t disable);
755 // Actually close the file descriptor and set it to -1 so we don't
756 // accidentally close it again.
759 // error handling methods
762 void fail(const char* fn, const AsyncSocketException& ex);
763 void failConnect(const char* fn, const AsyncSocketException& ex);
764 void failRead(const char* fn, const AsyncSocketException& ex);
765 void failWrite(const char* fn, WriteCallback* callback, size_t bytesWritten,
766 const AsyncSocketException& ex);
767 void failWrite(const char* fn, const AsyncSocketException& ex);
768 void failAllWrites(const AsyncSocketException& ex);
769 void invokeConnectErr(const AsyncSocketException& ex);
770 void invokeConnectSuccess();
771 void invalidState(ConnectCallback* callback);
772 void invalidState(ReadCallback* callback);
773 void invalidState(WriteCallback* callback);
775 std::string withAddr(const std::string& s);
778 * Set TCP_CORK on this socket
780 * @return 0 if Cork is turned on, or non-zero errno on error
782 int setCork(bool cork);
784 StateEnum state_; ///< StateEnum describing current state
785 uint8_t shutdownFlags_; ///< Shutdown state (ShutdownFlags)
786 uint16_t eventFlags_; ///< EventBase::HandlerFlags settings
787 int fd_; ///< The socket file descriptor
788 mutable folly::SocketAddress addr_; ///< The address we tried to connect to
789 mutable folly::SocketAddress localAddr_;
790 ///< The address we are connecting from
791 uint32_t sendTimeout_; ///< The send timeout, in milliseconds
792 uint16_t maxReadsPerEvent_; ///< Max reads per event loop iteration
793 EventBase* eventBase_; ///< The EventBase
794 WriteTimeout writeTimeout_; ///< A timeout for connect and write
795 IoHandler ioHandler_; ///< An EventHandler to monitor the fd
796 ImmediateReadCB immediateReadHandler_; ///< LoopCallback for checking read
798 ConnectCallback* connectCallback_; ///< ConnectCallback
799 ReadCallback* readCallback_; ///< ReadCallback
800 WriteRequest* writeReqHead_; ///< Chain of WriteRequests
801 WriteRequest* writeReqTail_; ///< End of WriteRequest chain
802 ShutdownSocketSet* shutdownSocketSet_;
803 size_t appBytesReceived_; ///< Num of bytes received from socket
804 size_t appBytesWritten_; ///< Num of bytes written to socket
805 bool isBufferMovable_{false};
807 bool peek_{false}; // Peek bytes.
809 int8_t readErr_{READ_NO_ERROR}; ///< The read error encountered, if any.
811 std::chrono::steady_clock::time_point connectStartTime_;
812 std::chrono::steady_clock::time_point connectEndTime_;
814 // Whether this connection is persistently corked
815 bool persistentCork_{false};
816 // Whether we've applied the TCP_CORK option to the socket
819 BufferCallback* bufferCallback_{nullptr};