net: support marking accepting TCP sockets
author Lorenzo Colitti <lorenzo@google.com>
Wed, 26 Mar 2014 04:03:12 +0000 (13:03 +0900)
committer JP Abgrall <jpa@google.com>
Fri, 16 May 2014 20:58:31 +0000 (20:58 +0000)
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.

This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.

This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established.  If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.

Black-box tested using user-mode linux:

- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
  mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
  incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.

Change-Id: I26bc1eceefd2c588d73b921865ab70e4645ade57
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Documentation/networking/ip-sysctl.txt
include/net/inet_sock.h
include/net/netns/ipv4.h
net/ipv4/inet_connection_sock.c
net/ipv4/syncookies.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp_ipv4.c
net/ipv6/inet6_connection_sock.c
net/ipv6/syncookies.c
net/ipv6/tcp_ipv6.c

index ecf5abd57c017e015cd709de6214dcf2a1986579..6e5c7c7333bdb3aa73a84a2509670f289abc958b 100644 (file)
@@ -477,6 +477,16 @@ tcp_fastopen - INTEGER
 
        See include/net/tcp.h and the code for more details.
 
+tcp_fwmark_accept - BOOLEAN
+       If set, incoming connections to listening sockets that do not have a
+       socket mark will set the mark of the accepting socket to the fwmark of
+       the incoming SYN packet. This will cause all packets on that connection
+       (starting from the first SYNACK) to be sent with that fwmark. The
+       listening socket's mark is unchanged. Listening sockets that already
+       have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+       unaffected.
+       Default: 0
+
 tcp_syn_retries - INTEGER
        Number of times initial SYNs for an active TCP connection attempt
        will be retransmitted. Should not be higher than 255. Default value
index 7235ae73a1e8d07a905d6803017d341d951e8f56..9528e10fa0b491f6b65c00e4b9ca88483b73e8b2 100644 (file)
@@ -88,6 +88,7 @@ struct inet_request_sock {
                                acked      : 1,
                                no_srccheck: 1;
        kmemcheck_bitfield_end(flags);
+       u32                     ir_mark;
        struct ip_options_rcu   *opt;
 };
 
@@ -96,6 +97,14 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
        return (struct inet_request_sock *)sk;
 }
 
+static inline u32 inet_request_mark(struct sock *sk, struct sk_buff *skb)
+{
+       if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)
+               return skb->mark;
+
+       return sk->sk_mark;
+}
+
 struct inet_cork {
        unsigned int            flags;
        __be32                  addr;
index 222461a7cc5de64ae9645b74a0c5fdb6a01d624f..0dd6f0b3eadb96119658697e949e50334dcd9434 100644 (file)
@@ -65,6 +65,7 @@ struct netns_ipv4 {
        int sysctl_tcp_ecn;
 
        int sysctl_fwmark_reflect;
+       int sysctl_tcp_fwmark_accept;
 
        kgid_t sysctl_ping_group_range[2];
        long sysctl_tcp_mem[3];
index 6acb541c90910204f02449e7500138362da6998a..442087d371f69e637a85682378381fbd0927e812 100644 (file)
@@ -417,7 +417,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
        struct net *net = sock_net(sk);
        int flags = inet_sk_flowi_flags(sk);
 
-       flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+       flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           sk->sk_protocol,
                           flags,
@@ -454,7 +454,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
 
        rcu_read_lock();
        opt = rcu_dereference(newinet->inet_opt);
-       flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
+       flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           sk->sk_protocol, inet_sk_flowi_flags(sk),
                           (opt && opt->opt.srr) ? opt->opt.faddr : ireq->rmt_addr,
@@ -688,6 +688,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
                inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
                newsk->sk_write_space = sk_stream_write_space;
 
+               newsk->sk_mark = inet_rsk(req)->ir_mark;
+
                newicsk->icsk_retransmits = 0;
                newicsk->icsk_backoff     = 0;
                newicsk->icsk_probes_out  = 0;
index b05c96e7af8b810a62bb07d95436eea07c651008..5abb45e281bea0ffdcfd14d9e017742d7b19c71a 100644 (file)
@@ -312,6 +312,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
        ireq->rmt_port          = th->source;
        ireq->loc_addr          = ip_hdr(skb)->daddr;
        ireq->rmt_addr          = ip_hdr(skb)->saddr;
+       ireq->ir_mark           = inet_request_mark(sk, skb);
        ireq->ecn_ok            = ecn_ok;
        ireq->snd_wscale        = tcp_opt.snd_wscale;
        ireq->sack_ok           = tcp_opt.sack_ok;
@@ -348,7 +349,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
         * hasn't changed since we received the original syn, but I see
         * no easy way to do this.
         */
-       flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
+       flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
                           inet_sk_flowi_flags(sk),
                           (opt && opt->srr) ? opt->faddr : ireq->rmt_addr,
index e6cdcb32b3395a1d7b3ced052f97f2a3e0eee247..cc5fa7da12e55ac83bcef34388434f65364ab993 100644 (file)
@@ -866,6 +866,13 @@ static struct ctl_table ipv4_net_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "tcp_fwmark_accept",
+               .data           = &init_net.ipv4.sysctl_tcp_fwmark_accept,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        { }
 };
 
index 7999fc55c83ba74abeffbd1a9d53bec25985e163..40ec14507f731156464ee6d5b18dfe9c6a48d86a 100644 (file)
@@ -1527,6 +1527,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        ireq->rmt_addr = saddr;
        ireq->no_srccheck = inet_sk(sk)->transparent;
        ireq->opt = tcp_v4_save_options(skb);
+       ireq->ir_mark = inet_request_mark(sk, skb);
 
        if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;
index e4311cbc8b4ecbf70ea1fb2e2f2415342be382cc..f1493138d21e237de7fdeb150b3053962690f71a 100644 (file)
@@ -81,7 +81,7 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
        final_p = fl6_update_dst(fl6, np->opt, &final);
        fl6->saddr = treq->loc_addr;
        fl6->flowi6_oif = treq->iif;
-       fl6->flowi6_mark = sk->sk_mark;
+       fl6->flowi6_mark = inet_rsk(req)->ir_mark;
        fl6->fl6_dport = inet_rsk(req)->rmt_port;
        fl6->fl6_sport = inet_rsk(req)->loc_port;
        security_req_classify_flow(req, flowi6_to_flowi(fl6));
index d5dda20bd717404a07d4dd45567b5bba23997250..1efbc6f44a6a73d727ffdb0c1a2e1d71051bcaf5 100644 (file)
@@ -212,6 +212,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
            ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
                ireq6->iif = inet6_iif(skb);
 
+       ireq->ir_mark = inet_request_mark(sk, skb);
+
        req->expires = 0UL;
        req->num_retrans = 0;
        ireq->ecn_ok            = ecn_ok;
@@ -238,7 +240,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
                final_p = fl6_update_dst(&fl6, np->opt, &final);
                fl6.saddr = ireq6->loc_addr;
                fl6.flowi6_oif = sk->sk_bound_dev_if;
-               fl6.flowi6_mark = sk->sk_mark;
+               fl6.flowi6_mark = ireq->ir_mark;
                fl6.fl6_dport = inet_rsk(req)->rmt_port;
                fl6.fl6_sport = inet_sk(sk)->inet_sport;
                security_req_classify_flow(req, flowi6_to_flowi(&fl6));
index 71545cb17ab6552d410aa0e04e3bfb8c41d56140..6e882dadb4f8b553549942c7c9b8525183ba10e1 100644 (file)
@@ -1000,6 +1000,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
                TCP_ECN_create_request(req, skb, sock_net(sk));
 
        treq->iif = sk->sk_bound_dev_if;
+       inet_rsk(req)->ir_mark = inet_request_mark(sk, skb);
 
        /* So that link locals have meaning */
        if (!sk->sk_bound_dev_if &&