TCP: experimental path MTU probing

Adds three integer sysctls (registered in net/ipv4/sysctl_net_ipv4.c,
defined and exported in net/ipv4/tcp_output.c):

  tcp_mtu_probing  (default 0)    non-zero enables probing
  tcp_initial_mss  (default 1024) mss_cache a connection starts with when
                                  probing is enabled (see tcp_init_mtup())
  tcp_base_mss     (default 512)  MSS from which the initial search_low MTU
                                  is derived (see tcp_init_mtup())

The MSS computation of tcp_sync_mss() is factored out into
tcp_mtu_to_mss(), with tcp_mss_to_mtu() as its inverse.  With probing
enabled, tcp_sync_mss() only ever lowers the cached MSS.

Probe state machine (tp->mtup.probe_state):

  NONE   -> OUT     tcp_mtu_probe() coalesces queued data into one segment
                    of twice the current MSS and transmits it.
  OUT    -> VERIFY  the probe's end sequence is acked
                    (tcp_mtup_probe_success()): the MSS is raised to the
                    probed size and cwnd is scaled down proportionally.
  VERIFY -> NONE    TCP_MTUP_MIN_VERIFY probe-sized segments were acked
                    (tcp_mtup_verify()): search_low = probe_size.
  OUT    -> NONE    loss detected at the start of the probe
                    (tcp_mtup_probe_failed()): search_high = probe_size - 1;
                    or a retransmission occurs while the probe is
                    outstanding (inconclusive, search bounds untouched).
  VERIFY -> NONE    a retransmission occurs during verification
                    (tcp_mtup_verify_failed()): the previous MSS is
                    restored and search_high = probe_size.

All printk() calls are debugging traces and are emitted at KERN_DEBUG.

NOTE: in this copy the include/linux/tcp.h hunk is truncated after its
first context line and runs straight into what appears to be the tail of
another hunk (ending in "ack.rcv_mss = hint;").  The definitions of
tp->mtup and of the TCP_MTUP_* constants used below are therefore not
visible here.

diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h
--- a/include/linux/sysctl.h	2005-03-07 12:18:30 -05:00
+++ b/include/linux/sysctl.h	2005-03-07 12:18:30 -05:00
@@ -345,6 +345,9 @@
 	NET_TCP_MODERATE_RCVBUF=106,
 	NET_TCP_TSO_WIN_DIVISOR=107,
 	NET_TCP_BIC_BETA=108,
+	NET_TCP_MTU_PROBING=109,
+	NET_TCP_INITIAL_MSS=110,
+	NET_TCP_BASE_MSS=111,
 };
 
 enum {
diff -Nru a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h	2005-03-07 12:18:30 -05:00
+++ b/include/linux/tcp.h	2005-03-07 12:18:30 -05:00
@@ -147,6 +147,15 @@
 #define TCPF_CA_Loss (1<ack.rcv_mss = hint;
+}
+
+/* Does not account for SACKs here. */
+static inline int tcp_mtu_to_mss(struct tcp_sock *tp, int pmtu)
+{
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= tp->ext_header_len + tp->ext2_header_len;
+
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	return mss_now;
+}
+
+/* Inverse of above */
+static inline int tcp_mss_to_mtu(struct tcp_sock *tp, int mss)
+{
+	int mtu;
+
+	mtu = mss +
+	      tp->tcp_header_len +
+	      tp->ext_header_len + tp->ext2_header_len +
+	      tp->af_specific->net_header_len;
+
+	return mtu;
+}
+
+static inline void tcp_init_mtup(struct tcp_sock *tp)
+{
+	tp->mtup.search_high = 65536;
+	tp->mtup.search_low = tcp_mss_to_mtu(tp, sysctl_tcp_base_mss);
+	tp->mtup.probe_state = TCP_MTUP_PROBE_NONE;
+	if (sysctl_tcp_mtu_probing)
+		tp->mss_cache = tp->mss_cache_std = sysctl_tcp_initial_mss;
 }
 
 static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
diff -Nru a/net/ipv4/route.c b/net/ipv4/route.c
--- a/net/ipv4/route.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv4/route.c	2005-03-07 12:18:30 -05:00
@@ -1275,7 +1275,7 @@
 				    rth->fl.iif == 0 &&
 				    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
 					unsigned short mtu = new_mtu;
-
+
 					if (new_mtu < 68 || new_mtu >= old_mtu) {
 
 						/* BSD 4.2 compatibility hack :-( */
diff -Nru a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
--- a/net/ipv4/sysctl_net_ipv4.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv4/sysctl_net_ipv4.c	2005-03-07 12:18:30 -05:00
@@ -690,6 +690,30 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_MTU_PROBING,
+		.procname	= "tcp_mtu_probing",
+		.data		= &sysctl_tcp_mtu_probing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_INITIAL_MSS,
+		.procname	= "tcp_initial_mss",
+		.data		= &sysctl_tcp_initial_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BASE_MSS,
+		.procname	= "tcp_base_mss",
+		.data		= &sysctl_tcp_base_mss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv4/tcp_input.c	2005-03-07 12:18:30 -05:00
@@ -1796,6 +1796,67 @@
 	}
 }
 
+static void tcp_mtup_probe_failed(struct tcp_sock *tp)
+{
+	printk(KERN_DEBUG "mtup: probe failed\n");
+	tp->mtup.search_high = tp->mtup.probe_size - 1;
+	tp->mtup.probe_state = TCP_MTUP_PROBE_NONE;
+}
+
+void tcp_mtup_verify_failed(struct tcp_sock *tp)
+{
+	printk(KERN_DEBUG "mtup: verify failed\n");
+	tp->mss_cache = tp->mss_cache_std = tp->mtup.prior_mss;
+	tcp_undo_cwr(tp, 1);
+	/* TODO: more sophisticated heuristic here */
+	tp->mtup.search_high = tp->mtup.probe_size;
+	tp->mtup.probe_state = TCP_MTUP_PROBE_NONE;
+}
+
+/* Do MTU verification on this acknowledged segment */
+static void tcp_mtup_verify(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	int count;
+	int len;
+
+	count = tcp_skb_pcount(skb);
+	len = count > 1 ? tcp_skb_mss(skb) : skb->len;
+
+	/* FIXME?: will not work if ip options change */
+	if (tcp_mss_to_mtu(tp, len) >= tp->mtup.probe_size) {
+		tp->mtup.need_verified -= count;
+		printk(KERN_DEBUG "mtup: need_verified = %d\n", tp->mtup.need_verified);
+		if (tp->mtup.need_verified <= 0) {
+			tp->mtup.probe_state = TCP_MTUP_PROBE_NONE;
+			tp->mtup.search_low = tp->mtup.probe_size;
+			printk(KERN_DEBUG "mtup: verified finished\n");
+		}
+	}
+}
+
+/* Successful probe */
+static void tcp_mtup_probe_success(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	tp->mtup.prior_mss = tp->mss_cache_std;
+	tp->prior_ssthresh = tcp_current_ssthresh(tp);
+
+	/* FIXME: breaks with very large cwnd */
+	printk(KERN_DEBUG "cwnd(1) = %u\n", tp->snd_cwnd);
+	tp->snd_cwnd = tp->snd_cwnd *
+		       tcp_mss_to_mtu(tp, tp->mss_cache_std) /
+		       tp->mtup.probe_size;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_ssthresh = tcp_current_ssthresh(tp);
+	printk(KERN_DEBUG "cwnd(2) = %u\n", tp->snd_cwnd);
+	tp->mss_cache = tp->mss_cache_std = tcp_mtu_to_mss(tp, tp->mtup.probe_size);
+
+	tp->mtup.probe_state = TCP_MTUP_PROBE_VERIFY;
+	tp->mtup.need_verified = TCP_MTUP_MIN_VERIFY;
+	printk(KERN_DEBUG "mtup: probe success: %d\n", tp->mtup.probe_size);
+}
+
+
 /* Process an event, which can update packets-in-flight not trivially.
  * Main goal of this function is to calculate new estimate for left_out,
  * taking into account both packets sitting in receiver's buffer and
@@ -1927,6 +1988,17 @@
 			return;
 		}
 
+		/* MTU probe failure: don't reduce cwnd */
+		if (tp->ca_state < TCP_CA_CWR &&
+		    tp->mtup.probe_state == TCP_MTUP_PROBE_OUT &&
+		    tp->snd_una == tp->mtup.probe_seq_start) {
+			tcp_mtup_probe_failed(tp);
+			/* Restores the reduction we did in tcp_mtup_probe() */
+			tp->snd_cwnd++;
+			tcp_simple_retransmit(sk);
+			return;
+		}
+
 		/* Otherwise enter Recovery state */
 
 		if (IsReno(tp))
@@ -1938,7 +2010,7 @@
 		tp->prior_ssthresh = 0;
 		tp->undo_marker = tp->snd_una;
 		tp->undo_retrans = tp->retrans_out;
-
+
 		if (tp->ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
 				tp->prior_ssthresh = tcp_current_ssthresh(tp);
@@ -2409,7 +2481,6 @@
 	return acked;
 }
 
-
 /* Remove acknowledged frames from the retransmission queue. */
 static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
@@ -2448,6 +2519,15 @@
 			acked |= FLAG_SYN_ACKED;
 			tp->retrans_stamp = 0;
 		}
+
+		/* MTU probing checks */
+		if (tp->mtup.probe_state == TCP_MTUP_PROBE_OUT) {
+			if (!after(tp->mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+				tcp_mtup_probe_success(tp, skb);
+			}
+		} else if (tp->mtup.probe_state == TCP_MTUP_PROBE_VERIFY) {
+			tcp_mtup_verify(tp, skb);
+		}
 
 		if (sacked) {
 			if (sacked & TCPCB_RETRANS) {
@@ -4542,6 +4622,7 @@
 		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
 			tp->rx_opt.sack_ok |= 2;
 
+		tcp_init_mtup(tp);
 		tcp_sync_mss(sk, tp->pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
@@ -4647,6 +4728,7 @@
 		if (tp->ecn_flags&TCP_ECN_OK)
 			sk->sk_no_largesend = 1;
 
+		tcp_init_mtup(tp);
 		tcp_sync_mss(sk, tp->pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 
diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv4/tcp_ipv4.c	2005-03-07 12:18:30 -05:00
@@ -1581,6 +1581,7 @@
 	newtp->ext2_header_len = dst->header_len;
 	newinet->id = newtp->write_seq ^ jiffies;
 
+	tcp_init_mtup(newtp);
 	tcp_sync_mss(newsk, dst_pmtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);
diff -Nru a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv4/tcp_output.c	2005-03-07 12:18:30 -05:00
@@ -51,6 +51,14 @@
  */
 int sysctl_tcp_tso_win_divisor = 8;
 
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_initial_mss = 1024;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_initial_mss);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
 static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
 				    struct sk_buff *skb)
 {
@@ -633,28 +641,19 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
 	int mss_now;
+
+	if (dst) {
+		int pmtu_dst = dst_pmtu(dst);
+		if (tp->mtup.search_high > pmtu_dst) {
+			tp->mtup.search_high = pmtu_dst;
+			printk(KERN_DEBUG "tcp_sync_mss: search_high down: %d\n", pmtu_dst);
+		}
+
+		if (dst->ops->get_mss)
+			pmtu = dst->ops->get_mss(dst, pmtu);
+	}
 
-	if (dst && dst->ops->get_mss)
-		pmtu = dst->ops->get_mss(dst, pmtu);
-
-	/* Calculate base mss without TCP options:
-	   It is MMS_S - sizeof(tcphdr) of rfc1122
-	 */
-	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
-
-	/* Clamp it (mss_clamp does not include tcp options) */
-	if (mss_now > tp->rx_opt.mss_clamp)
-		mss_now = tp->rx_opt.mss_clamp;
-
-	/* Now subtract optional transport overhead */
-	mss_now -= tp->ext_header_len + tp->ext2_header_len;
-
-	/* Then reserve room for full set of TCP options and 8 bytes of data */
-	if (mss_now < 48)
-		mss_now = 48;
-
-	/* Now subtract TCP options size, not including SACKs */
-	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+	mss_now = tcp_mtu_to_mss(tp, pmtu);
 
 	/* Bound mss with half of window */
 	if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -662,8 +661,18 @@
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = tp->mss_cache_std = mss_now;
+	if (sysctl_tcp_mtu_probing) {
+		if (tp->mss_cache_std > mss_now) {
+			tp->mss_cache = tp->mss_cache_std = mss_now;
+			printk(KERN_DEBUG "tcp_sync_mss: mss_cache down: %d\n", mss_now);
+		}
+	} else {
+		tp->mss_cache = tp->mss_cache_std = mss_now;
+	}
+//	printk("tcp_sync_mss: mss_cache = %d, mss_cache_std = %d, mss_now = %d\n",
+//		tp->mss_cache, tp->mss_cache_std, mss_now);
+
 
 	return mss_now;
 }
 
@@ -729,6 +738,139 @@
 	return mss_now;
 }
 
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	int len;
+	int probe_size;
+	unsigned int pif;
+	int copy;
+	int mss_now;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off) */
+	if (!sysctl_tcp_mtu_probing ||
+	    tp->mtup.probe_state != TCP_MTUP_PROBE_NONE ||
+	    tp->ca_state != TCP_CA_Open ||
+	    tp->snd_cwnd < 11 ||
+	    tp->rx_opt.eff_sacks)
+		return -1;
+
+//	printk("tcp_mtu_probe: 2\n");
+	/* Very simple search strategy: just double the MSS. */
+	mss_now = tcp_current_mss(sk, 0);
+	probe_size = 2*tp->mss_cache_std;
+	if (probe_size > tcp_mtu_to_mss(tp, tp->mtup.search_high)) {
+		/* TODO: set timer for probe_converge_event */
+		return -1;
+	}
+
+//	printk("tcp_mtu_probe: 3\n");
+	/* Have enough data in the send queue to probe? */
+	len = 0;
+	if ((skb = sk->sk_send_head) == NULL)
+		return -1;
+	while ((len += skb->len) < probe_size && skb != skb->list->prev)
+		skb = skb->next;
+	if (len < probe_size)
+		return -1;
+	BUG_TRAP(skb->len == TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
+
+	printk(KERN_DEBUG "tcp_mtu_probe: 4\n");
+	/* Receive window check. */
+	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+		if (tp->snd_wnd < probe_size)
+			return -1;
+		else
+			return 0;
+	}
+
+	printk(KERN_DEBUG "tcp_mtu_probe: 5\n");
+	/* Do we need to wait to drain cwnd? */
+	pif = tcp_packets_in_flight(tp);
+	if (pif + 2 > tp->snd_cwnd) {
+		/* With no packets in flight, don't stall. */
+		if (pif == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	printk(KERN_DEBUG "tcp_mtu_probe: 6\n");
+	/* We're allowed to probe. Build it now. */
+	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+		return -1;
+	sk_charge_skb(sk, nskb);
+
+	/* Account for SACKs */
+	probe_size -= tp->mss_cache_std - mss_now;
+
+	printk(KERN_DEBUG "tcp_mtu_probe: 7\n");
+	skb = sk->sk_send_head;
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	nskb->csum = 0;
+	len = 0;
+	while (len < probe_size) {
+		next = skb->next;
+
+		copy = min_t(int, skb->len, probe_size - len);
+		printk(KERN_DEBUG "skb->len = %d\n", skb->len);
+		printk(KERN_DEBUG "tcp_mtu_probe: copy = %d\n", copy);
+		nskb->csum = skb_copy_and_csum_bits(skb, 0,
+				skb_put(nskb, copy), copy, nskb->csum);
+		skb_pull(skb, copy);
+		if (skb->len <= 0) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			__skb_unlink(skb, skb->list);
+			sk_stream_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(skb)->seq += copy;
+			tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+		}
+
+		len += copy;
+		skb = next;
+	}
+
+	__skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+	sk->sk_send_head = nskb;
+	tcp_set_skb_tso_segs(nskb, tp->mss_cache_std);
+
+	printk(KERN_DEBUG "tcp_mtu_probe: 8\n");
+	/* We're ready to send. If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+	tp->mtup.probe_state = TCP_MTUP_PROBE_OUT;
+	tp->mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+	tp->mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+	tp->mtup.probe_size = tcp_mss_to_mtu(tp, nskb->len);
+	printk(KERN_DEBUG "len = %d, probe_size = %d\n", nskb->len, tp->mtup.probe_size);
+
+	if (!tcp_transmit_skb(sk, skb_clone(nskb, GFP_ATOMIC))) {
+		printk(KERN_DEBUG "tcp_mtu_probe: 9\n");
+		/* Decrement cwnd here because we are sending
+		 * effectively two packets. */
+		tp->snd_cwnd--;
+		update_send_head(sk, tp, nskb);
+
+		return 1;
+	}
+
+	return -1;
+}
+
+extern void tcp_mtup_verify_failed(struct tcp_sock *tp);
+
+
 /* This routine writes packets to the network. It advances the
  * send_head. This happens as incoming acks open up the remote
  * window for us.
@@ -740,6 +882,7 @@
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int mss_now;
+	int result;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -748,6 +891,15 @@
 	if (sk->sk_state != TCP_CLOSE) {
 		struct sk_buff *skb;
 		int sent_pkts = 0;
+
+		/* Do MTU probing. */
+		if ((result = tcp_mtu_probe(sk)) == 0) {
+			printk(KERN_DEBUG "tcp_write_xmit: waiting\n");
+			return 0;
+		} else if (result > 0) {
+			sent_pkts = 1;
+			printk(KERN_DEBUG "tcp_write_xmit: sent probe\n");
+		}
 
 		/* Account for SACKS, we may need to fragment due to this.
 		 * It is just like the real MSS changing on us midstream.
@@ -1031,6 +1183,16 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss = tcp_current_mss(sk, 0);
 	int err;
+
+	/* Inconclusive MTU probe */
+	if (tp->mtup.probe_state == TCP_MTUP_PROBE_OUT) {
+		printk(KERN_DEBUG "mtup: inconclusive probe\n");
+		tp->mtup.probe_state = TCP_MTUP_PROBE_NONE;
+	}
+
+	/* MTU verification failure */
+	if (tp->mtup.probe_state == TCP_MTUP_PROBE_VERIFY)
+		tcp_mtup_verify_failed(tp);
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: frgagmentation, tunneling, mangling etc.
@@ -1442,6 +1604,7 @@
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 	tp->max_window = 0;
+	tcp_init_mtup(tp);
 	tcp_sync_mss(sk, dst_pmtu(dst));
 
 	if (!tp->window_clamp)
diff -Nru a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
--- a/net/ipv6/tcp_ipv6.c	2005-03-07 12:18:30 -05:00
+++ b/net/ipv6/tcp_ipv6.c	2005-03-07 12:18:30 -05:00
@@ -1445,6 +1445,7 @@
 					newnp->opt->opt_flen;
 	newtp->ext2_header_len = dst->header_len;
 
+	tcp_init_mtup(newtp);
 	tcp_sync_mss(newsk, dst_pmtu(dst));
 	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(newsk);