@@ -126,6 +126,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
return (struct tcp_request_sock *)req;
}
+#ifdef CONFIG_TCP_ESTATS
+struct tcp_estats;
+#endif
+
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
@@ -309,6 +313,10 @@ struct tcp_sock {
struct tcp_md5sig_info __rcu *md5sig_info;
#endif
+#ifdef CONFIG_TCP_ESTATS
+ struct tcp_estats *tcp_stats;
+#endif
+
/* TCP fastopen related information */
struct tcp_fastopen_request *fastopen_req;
/* fastopen_rsk points to request_sock that resulted in this big
new file mode 100644
@@ -0,0 +1,376 @@
+/*
+ * include/net/tcp_estats.h
+ *
+ * Implementation of TCP Extended Statistics MIB (RFC 4898)
+ *
+ * Authors:
+ * John Estabrook <jsestabrook@gmail.com>
+ * Andrew K. Adams <akadams@psc.edu>
+ * Kevin Hogan <kwabena@google.com>
+ * Dominic Hamon <dma@stripysock.com>
+ * John Heffner <johnwheffner@gmail.com>
+ *
+ * The Web10G project. See http://www.web10g.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _TCP_ESTATS_H
+#define _TCP_ESTATS_H
+
+#include <net/sock.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/jump_label.h>
+#include <linux/spinlock.h>
+#include <linux/tcp.h>
+#include <linux/workqueue.h>
+
+/* defines number of seconds that stats persist after connection ends */
+#define TCP_ESTATS_PERSIST_DELAY_SECS 5
+
/* Reasons the sender can be rate-limited at any instant; used as indices
 * into the snd_lim_time[]/snd_lim_trans[] arrays of the perf table. */
enum tcp_estats_sndlim_states {
	TCP_ESTATS_SNDLIM_NONE = -1,	/* sentinel: no state recorded yet */
	TCP_ESTATS_SNDLIM_SENDER,	/* limited by sending application */
	TCP_ESTATS_SNDLIM_CWND,		/* limited by congestion window */
	TCP_ESTATS_SNDLIM_RWIN,		/* limited by receiver window */
	TCP_ESTATS_SNDLIM_STARTUP,	/* initial state before established */
	TCP_ESTATS_SNDLIM_TSODEFER,	/* send deferred for TSO */
	TCP_ESTATS_SNDLIM_PACE,		/* limited by pacing */
	TCP_ESTATS_SNDLIM_NSTATES /* Keep at end */
};

/* Address family of the connection (RFC 4898 InetAddressType values). */
enum tcp_estats_addrtype {
	TCP_ESTATS_ADDRTYPE_IPV4 = 1,
	TCP_ESTATS_ADDRTYPE_IPV6 = 2
};

/* Values stored in stack_table->SoftErrorReason (RFC 4898 SoftErrorReason). */
enum tcp_estats_softerror_reason {
	TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW = 1,
	TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW = 2,
	TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW = 3,
	TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW = 4,
	TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW = 5,
	TCP_ESTATS_SOFTERROR_ABOVE_TS_WINDOW = 6,
	TCP_ESTATS_SOFTERROR_DATA_CHECKSUM = 7,
	TCP_ESTATS_SOFTERROR_OTHER = 8,
};

#define TCP_ESTATS_INACTIVE 2
#define TCP_ESTATS_ACTIVE 1

/* Bitmask values for the tcp_estats sysctl: each bit enables allocation of
 * one optional table per connection.  NOTE(review): 0x20 is skipped —
 * presumably reserved for the tuning table, which currently needs no
 * backing store (see comment near tcp_estats_extras_table); confirm. */
#define TCP_ESTATS_TABLEMASK_INACTIVE	0x00
#define TCP_ESTATS_TABLEMASK_ACTIVE	0x01
#define TCP_ESTATS_TABLEMASK_PERF	0x02
#define TCP_ESTATS_TABLEMASK_PATH	0x04
#define TCP_ESTATS_TABLEMASK_STACK	0x08
#define TCP_ESTATS_TABLEMASK_APP	0x10
#define TCP_ESTATS_TABLEMASK_EXTRAS	0x40
+
+#ifdef CONFIG_TCP_ESTATS
+
+extern struct static_key tcp_estats_enabled;
+
/* Evaluate @expr only when estats is globally enabled (static key), this
 * socket has a stats block, and the named optional table was allocated.
 * The static key keeps the cost to a single patched branch when off. */
#define TCP_ESTATS_CHECK(tp, table, expr) \
	do { \
		if (static_key_false(&tcp_estats_enabled)) { \
			if (likely((tp)->tcp_stats) && \
			    likely((tp)->tcp_stats->tables.table)) { \
				(expr); \
			} \
		} \
	} while (0)

/* Guarded read-modify-write helpers for individual table variables. */
#define TCP_ESTATS_VAR_INC(tp, table, var) \
	TCP_ESTATS_CHECK(tp, table, ++((tp)->tcp_stats->tables.table->var))
#define TCP_ESTATS_VAR_DEC(tp, table, var) \
	TCP_ESTATS_CHECK(tp, table, --((tp)->tcp_stats->tables.table->var))
#define TCP_ESTATS_VAR_ADD(tp, table, var, val) \
	TCP_ESTATS_CHECK(tp, table, \
			 ((tp)->tcp_stats->tables.table->var) += (val))
#define TCP_ESTATS_VAR_SET(tp, table, var, val) \
	TCP_ESTATS_CHECK(tp, table, \
			 ((tp)->tcp_stats->tables.table->var) = (val))

/* Call a tcp_estats_update_* function only when enabled and a stats block
 * exists; per-table NULL checks are done inside the update functions. */
#define TCP_ESTATS_UPDATE(tp, func) \
	do { \
		if (static_key_false(&tcp_estats_enabled)) { \
			if (likely((tp)->tcp_stats)) { \
				(func); \
			} \
		} \
	} while (0)
+
+/*
+ * Variables that can be read and written directly.
+ *
+ * Contains all variables from RFC 4898. Commented fields are
+ * either not implemented (only StartTimeStamp
+ * remains unimplemented in this release) or have
+ * handlers and do not need struct storage.
+ */
/* Connection identity (RFC 4898 connection table): address family,
 * endpoint addresses and ports.  Filled in at ESTABLISHED time. */
struct tcp_estats_connection_table {
	u32			AddressType;	/* enum tcp_estats_addrtype */
	union { struct in_addr addr; struct in6_addr addr6; }	LocalAddress;
	union { struct in_addr addr; struct in6_addr addr6; }	RemAddress;
	u16			LocalPort;	/* host byte order */
	u16			RemPort;	/* host byte order */
};
+
/* RFC 4898 performance table: segment/octet counters in each direction,
 * congestion and window extremes.  Commented names are served by handlers
 * or not implemented and need no storage here. */
struct tcp_estats_perf_table {
	u32		SegsOut;
	u32		DataSegsOut;
	u64		DataOctetsOut;
	u32		SegsRetrans;
	u32		OctetsRetrans;
	u32		SegsIn;
	u32		DataSegsIn;
	u64		DataOctetsIn;
	/* ElapsedSecs */
	/* ElapsedMicroSecs */
	/* StartTimeStamp */
	/* CurMSS */
	/* PipeSize */
	u32		MaxPipeSize;
	/* SmoothedRTT */
	/* CurRTO */
	u32		CongSignals;
	/* CurCwnd */
	/* CurSsthresh */
	u32		Timeouts;
	/* CurRwinSent */
	u32		MaxRwinSent;
	u32		ZeroRwinSent;
	/* CurRwinRcvd */
	u32		MaxRwinRcvd;
	u32		ZeroRwinRcvd;
	/* SndLimTransRwin */
	/* SndLimTransCwnd */
	/* SndLimTransSnd */
	/* SndLimTimeRwin */
	/* SndLimTimeCwnd */
	/* SndLimTimeSnd */
	/* Per-sndlim-state transition counts and accumulated microseconds;
	 * indexed by enum tcp_estats_sndlim_states. */
	u32		snd_lim_trans[TCP_ESTATS_SNDLIM_NSTATES];
	u32		snd_lim_time[TCP_ESTATS_SNDLIM_NSTATES];
};
+
/* RFC 4898 path table: RTT/RTO statistics, reordering, ECN and IP-header
 * observations.  RTT fields are kept in milliseconds (see
 * tcp_estats_update_rtt). */
struct tcp_estats_path_table {
	/* RetranThresh */
	u32		NonRecovDAEpisodes;
	u32		SumOctetsReordered;
	u32		NonRecovDA;
	u32		SampleRTT;	/* most recent RTT sample, msec */
	/* RTTVar */
	u32		MaxRTT;
	u32		MinRTT;
	u64		SumRTT;		/* SumRTT/CountRTT gives mean RTT */
	u32		CountRTT;
	u32		MaxRTO;
	u32		MinRTO;
	u8		IpTtl;		/* TTL of last received segment */
	u8		IpTosIn;	/* TOS of last received segment */
	/* IpTosOut */
	u32		PreCongSumCwnd;
	u32		PreCongSumRTT;
	u32		PostCongSumRTT;
	u32		PostCongCountRTT;
	u32		ECNsignals;
	u32		DupAckEpisodes;
	/* RcvRTT */
	u32		DupAcksOut;
	u32		CERcvd;
	u32		ECESent;
};
+
/* RFC 4898 stack table: TCP implementation internals — cwnd/ssthresh
 * extremes, recovery events, SACK accounting and MSS bounds. */
struct tcp_estats_stack_table {
	u32		ActiveOpen;	/* 1 if this end initiated the connection */
	/* MSSSent */
	/* MSSRcvd */
	/* WinScaleSent */
	/* WinScaleRcvd */
	/* TimeStamps */
	/* ECN */
	/* WillSendSACK */
	/* WillUseSACK */
	/* State */
	/* Nagle */
	u32		MaxSsCwnd;	/* max cwnd while in slow start, bytes */
	u32		MaxCaCwnd;	/* max cwnd in congestion avoidance, bytes */
	u32		MaxSsthresh;
	u32		MinSsthresh;
	/* InRecovery */
	u32		DupAcksIn;
	u32		SpuriousFrDetected;
	u32		SpuriousRtoDetected;
	u32		SoftErrors;
	u32		SoftErrorReason; /* enum tcp_estats_softerror_reason */
	u32		SlowStart;
	u32		CongAvoid;
	u32		OtherReductions;
	u32		CongOverCount;
	u32		FastRetran;
	u32		SubsequentTimeouts;
	/* CurTimeoutCount */
	u32		AbruptTimeouts;
	u32		SACKsRcvd;
	u32		SACKBlocksRcvd;
	u32		SendStall;
	u32		DSACKDups;
	u32		MaxMSS;
	u32		MinMSS;
	u32		SndInitial;	/* initial send sequence number */
	u32		RecInitial;	/* initial receive sequence number */
	/* CurRetxQueue */
	/* MaxRetxQueue */
	/* CurReasmQueue */
	u32		MaxReasmQueue;
	u32		EarlyRetrans;
	u32		EarlyRetransDelay;
};
+
/* RFC 4898 application table: application-visible throughput and the
 * high-water marks of the socket send/receive queues. */
struct tcp_estats_app_table {
	/* SndUna */
	/* SndNxt */
	u32		SndMax;		/* highest sequence number sent */
	u64		ThruOctetsAcked;
	/* RcvNxt */
	u64		ThruOctetsReceived;
	/* CurAppWQueue */
	u32		MaxAppWQueue;
	/* CurAppRQueue */
	u32		MaxAppRQueue;
};
+
+/*
+ currently, no backing store is needed for tuning elements in
+ web10g - they are all read or written to directly in other
+ data structures (such as the socket)
+*/
+
/* Non-RFC extension variables. */
struct tcp_estats_extras_table {
	/* OtherReductionsCV */
	u32		OtherReductionsCM;
	u32		Priority;
};
+
/* Pointers into one contiguous per-connection allocation (see
 * tcp_estats_create).  connection_table is always present; the others are
 * NULL unless the matching TCP_ESTATS_TABLEMASK_* bit was set in the
 * sysctl when the connection was created. */
struct tcp_estats_tables {
	struct tcp_estats_connection_table	*connection_table;
	struct tcp_estats_perf_table		*perf_table;
	struct tcp_estats_path_table		*path_table;
	struct tcp_estats_stack_table		*stack_table;
	struct tcp_estats_app_table		*app_table;
	struct tcp_estats_extras_table		*extras_table;
};
+
/* Per-connection extended-statistics state, hung off tcp_sock->tcp_stats.
 * Reference-counted via 'users' (tcp_estats_use/unuse); freed through RCU
 * once the last user drops it. */
struct tcp_estats {
	int			tcpe_cid; /* idr map id; -1 unassigned, 0 pending */

	struct sock		*sk;	/* owning socket; holds a sock_hold ref */
	kuid_t			uid;
	kgid_t			gid;
	int			ids;

	atomic_t		users;	/* refcount; 0 -> schedule RCU free */

	enum tcp_estats_sndlim_states	limstate; /* current send-limit state */
	ktime_t			limstate_ts;	/* when limstate was entered */
#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
	/* ktime-based timestamps: RFC-strict but costlier per event */
	ktime_t			start_ts;
	ktime_t			current_ts;
#else
	/* jiffies-based timestamps: cheaper, coarser */
	unsigned long		start_ts;
	unsigned long		current_ts;
#endif
	/* NOTE(review): struct timeval is not y2038-safe; newer kernels
	 * would use timespec64 here — confirm against target kernel. */
	struct timeval		start_tv;

	int			queued;	/* 1 if workqueue notifications are in use */
	struct work_struct	create_notify;
	struct work_struct	establish_notify;
	struct delayed_work	destroy_notify;

	struct tcp_estats_tables	tables;

	struct rcu_head		rcu;
};
+
+extern struct idr tcp_estats_idr;
+
+extern int tcp_estats_wq_enabled;
+extern struct workqueue_struct *tcp_estats_wq;
+extern void (*create_notify_func)(struct work_struct *work);
+extern void (*establish_notify_func)(struct work_struct *work);
+extern void (*destroy_notify_func)(struct work_struct *work);
+
+extern unsigned long persist_delay;
+extern spinlock_t tcp_estats_idr_lock;
+
+/* For the TCP code */
+extern int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype t,
+ int active);
+extern void tcp_estats_destroy(struct sock *sk);
+extern void tcp_estats_establish(struct sock *sk);
+extern void tcp_estats_free(struct rcu_head *rcu);
+
+extern void tcp_estats_update_snd_nxt(struct tcp_sock *tp);
+extern void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack);
+extern void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample);
+extern void tcp_estats_update_timeout(struct sock *sk);
+extern void tcp_estats_update_mss(struct tcp_sock *tp);
+extern void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp);
+extern void tcp_estats_update_sndlim(struct tcp_sock *tp,
+ enum tcp_estats_sndlim_states why);
+extern void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq);
+extern void tcp_estats_update_rwin_sent(struct tcp_sock *tp);
+extern void tcp_estats_update_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_post_congestion(struct tcp_sock *tp);
+extern void tcp_estats_update_segsend(struct sock *sk, int pcount,
+ u32 seq, u32 end_seq, int flags);
+extern void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb);
+extern void tcp_estats_update_finish_segrecv(struct tcp_sock *tp);
+extern void tcp_estats_update_writeq(struct sock *sk);
+extern void tcp_estats_update_recvq(struct sock *sk);
+
+extern void tcp_estats_init(void);
+
/* Take a reference on @stats. */
static inline void tcp_estats_use(struct tcp_estats *stats)
{
	atomic_inc(&stats->users);
}

/* Take a reference only if @stats is still live (refcount non-zero).
 * Returns non-zero on success. */
static inline int tcp_estats_use_if_valid(struct tcp_estats *stats)
{
	return atomic_inc_not_zero(&stats->users);
}

/* Drop a reference; on the last drop, release the socket reference taken
 * at create time and free the stats block after an RCU grace period. */
static inline void tcp_estats_unuse(struct tcp_estats *stats)
{
	if (atomic_dec_and_test(&stats->users)) {
		sock_put(stats->sk);
		stats->sk = NULL;
		call_rcu(&stats->rcu, tcp_estats_free);
	}
}
+
+#else /* !CONFIG_TCP_ESTATS */
+
+#define tcp_estats_enabled (0)
+
+#define TCP_ESTATS_VAR_INC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_DEC(tp, table, var) do {} while (0)
+#define TCP_ESTATS_VAR_ADD(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_VAR_SET(tp, table, var, val) do {} while (0)
+#define TCP_ESTATS_UPDATE(tp, func) do {} while (0)
+
+static inline void tcp_estats_init(void) { }
+static inline void tcp_estats_establish(struct sock *sk) { }
+static inline void tcp_estats_create(struct sock *sk,
+ enum tcp_estats_addrtype t,
+ int active) { }
+static inline void tcp_estats_destroy(struct sock *sk) { }
+
+#endif /* CONFIG_TCP_ESTATS */
+
+#endif /* _TCP_ESTATS_H */
@@ -680,3 +680,28 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
+
+config TCP_ESTATS
+ bool "TCP: Extended TCP statistics (RFC4898) MIB"
+ ---help---
+ RFC 4898 specifies a number of extended statistics for TCP. This
+ data can be accessed using netlink. See http://www.web10g.org for
+ more details.
+
+if TCP_ESTATS
+
+config TCP_ESTATS_STRICT_ELAPSEDTIME
+ bool "TCP: ESTATS strict ElapsedSecs/Msecs counters"
+ depends on TCP_ESTATS
+ default n
+ ---help---
+ Elapsed time since beginning of connection.
+ RFC4898 defines ElapsedSecs/Msecs as being updated via ktime_get
+ at each protocol event (sending or receiving of a segment);
+ as this can be a performance hit, leaving this config option off
+	  will update elapsed based on the jiffies counter instead.
+ Set to Y for strict conformance with the MIB.
+
+ If unsure, say N.
+
+endif
@@ -36,6 +36,7 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_TCP_ESTATS) += tcp_estats.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
@@ -42,6 +42,11 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
+/* Extended statistics (RFC4898). */
+#ifdef CONFIG_TCP_ESTATS
+int sysctl_tcp_estats __read_mostly;
+#endif /* CONFIG_TCP_ESTATS */
+
/* Update system visible IP port range */
static void set_local_port_range(struct net *net, int range[2])
{
@@ -767,6 +772,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+#ifdef CONFIG_TCP_ESTATS
+ {
+ .procname = "tcp_estats",
+ .data = &sysctl_tcp_estats,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif /* CONFIG TCP ESTATS */
{ }
};
new file mode 100644
@@ -0,0 +1,736 @@
+/*
+ * net/ipv4/tcp_estats.c
+ *
+ * Implementation of TCP ESTATS MIB (RFC 4898)
+ *
+ * Authors:
+ * John Estabrook <jsestabrook@gmail.com>
+ * Andrew K. Adams <akadams@psc.edu>
+ * Kevin Hogan <kwabena@google.com>
+ * Dominic Hamon <dma@stripysock.com>
+ * John Heffner <johnwheffner@gmail.com>
+ *
+ * The Web10G project. See http://www.web10g.org
+ *
+ * Copyright © 2011, Pittsburgh Supercomputing Center (PSC).
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/export.h>
+#ifndef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+#include <linux/jiffies.h>
+#endif
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp_estats.h>
+#include <net/tcp.h>
+#include <asm/atomic.h>
+#include <asm/byteorder.h>
+
+#define ESTATS_INF32 0xffffffff
+
+#define ESTATS_MAX_CID 5000000
+
+extern int sysctl_tcp_estats;
+
+struct idr tcp_estats_idr;
+EXPORT_SYMBOL(tcp_estats_idr);
+static int next_id = 1;
+DEFINE_SPINLOCK(tcp_estats_idr_lock);
+EXPORT_SYMBOL(tcp_estats_idr_lock);
+
+int tcp_estats_wq_enabled __read_mostly = 0;
+EXPORT_SYMBOL(tcp_estats_wq_enabled);
+struct workqueue_struct *tcp_estats_wq = NULL;
+EXPORT_SYMBOL(tcp_estats_wq);
+void (*create_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(create_notify_func);
+void (*establish_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(establish_notify_func);
+void (*destroy_notify_func)(struct work_struct *work);
+EXPORT_SYMBOL(destroy_notify_func);
+unsigned long persist_delay = 0;
+EXPORT_SYMBOL(persist_delay);
+
+struct static_key tcp_estats_enabled __read_mostly = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(tcp_estats_enabled);
+
+/* if HAVE_JUMP_LABEL is defined, then static_key_slow_inc/dec uses a
+ * mutex in its implementation, and hence can't be called if in_interrupt().
+ * if HAVE_JUMP_LABEL is NOT defined, then no mutex is used, hence no need
+ * for deferring enable/disable */
+#ifdef HAVE_JUMP_LABEL
+static atomic_t tcp_estats_enabled_deferred;
+
+static void tcp_estats_handle_deferred_enable_disable(void)
+{
+ int count = atomic_xchg(&tcp_estats_enabled_deferred, 0);
+
+ while (count > 0) {
+ static_key_slow_inc(&tcp_estats_enabled);
+ --count;
+ }
+
+ while (count < 0) {
+ static_key_slow_dec(&tcp_estats_enabled);
+ ++count;
+ }
+}
+#endif
+
/* Bump the global enable static key.  With HAVE_JUMP_LABEL,
 * static_key_slow_inc() takes a mutex and so cannot run in interrupt
 * context; defer via an atomic counter and replay later from process
 * context. */
static inline void tcp_estats_enable(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&tcp_estats_enabled_deferred);
		return;
	}
	tcp_estats_handle_deferred_enable_disable();
#endif
	static_key_slow_inc(&tcp_estats_enabled);
}

/* Mirror of tcp_estats_enable() for the disable path. */
static inline void tcp_estats_disable(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_dec(&tcp_estats_enabled_deferred);
		return;
	}
	tcp_estats_handle_deferred_enable_disable();
#endif
	static_key_slow_dec(&tcp_estats_enabled);
}
+
+/* Calculates the required amount of memory for any enabled tables. */
+int tcp_estats_get_allocation_size(int sysctl)
+{
+ int size = sizeof(struct tcp_estats) +
+ sizeof(struct tcp_estats_connection_table);
+
+ if (sysctl & TCP_ESTATS_TABLEMASK_PERF)
+ size += sizeof(struct tcp_estats_perf_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_PATH)
+ size += sizeof(struct tcp_estats_path_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_STACK)
+ size += sizeof(struct tcp_estats_stack_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_APP)
+ size += sizeof(struct tcp_estats_app_table);
+ if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS)
+ size += sizeof(struct tcp_estats_extras_table);
+ return size;
+}
+
+/* Called whenever a TCP/IPv4 sock is created.
+ * net/ipv4/tcp_ipv4.c: tcp_v4_syn_recv_sock,
+ * tcp_v4_init_sock
+ * Allocates a stats structure and initializes values.
+ */
+int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype addrtype,
+ int active)
+{
+ struct tcp_estats *stats;
+ struct tcp_estats_tables *tables;
+ struct tcp_sock *tp = tcp_sk(sk);
+ void *estats_mem;
+ int sysctl;
+ int ret;
+
+ /* Read the sysctl once before calculating memory needs and initializing
+ * tables to avoid raciness. */
+ sysctl = ACCESS_ONCE(sysctl_tcp_estats);
+ if (likely(sysctl == TCP_ESTATS_TABLEMASK_INACTIVE)) {
+ return 0;
+ }
+
+ estats_mem = kzalloc(tcp_estats_get_allocation_size(sysctl), gfp_any());
+ if (!estats_mem)
+ return -ENOMEM;
+
+ stats = estats_mem;
+ estats_mem += sizeof(struct tcp_estats);
+
+ tables = &stats->tables;
+
+ tables->connection_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_connection_table);
+
+ if (sysctl & TCP_ESTATS_TABLEMASK_PERF) {
+ tables->perf_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_perf_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_PATH) {
+ tables->path_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_path_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_STACK) {
+ tables->stack_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_stack_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_APP) {
+ tables->app_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_app_table);
+ }
+ if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS) {
+ tables->extras_table = estats_mem;
+ estats_mem += sizeof(struct tcp_estats_extras_table);
+ }
+
+ stats->tcpe_cid = -1;
+ stats->queued = 0;
+
+ tables->connection_table->AddressType = addrtype;
+
+ sock_hold(sk);
+ stats->sk = sk;
+ atomic_set(&stats->users, 0);
+
+ stats->limstate = TCP_ESTATS_SNDLIM_STARTUP;
+ stats->limstate_ts = ktime_get();
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ stats->start_ts = stats->current_ts = stats->limstate_ts;
+#else
+ stats->start_ts = stats->current_ts = jiffies;
+#endif
+ do_gettimeofday(&stats->start_tv);
+
+ /* order is important -
+ * must have stats hooked into tp and tcp_estats_enabled()
+ * in order to have the TCP_ESTATS_VAR_<> macros work */
+ tp->tcp_stats = stats;
+ tcp_estats_enable();
+
+ TCP_ESTATS_VAR_SET(tp, stack_table, ActiveOpen, active);
+
+ TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->snd_nxt);
+ TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->snd_nxt);
+
+ TCP_ESTATS_VAR_SET(tp, path_table, MinRTT, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, path_table, MinRTO, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, stack_table, MinMSS, ESTATS_INF32);
+ TCP_ESTATS_VAR_SET(tp, stack_table, MinSsthresh, ESTATS_INF32);
+
+ tcp_estats_use(stats);
+
+ if (tcp_estats_wq_enabled) {
+ tcp_estats_use(stats);
+ stats->queued = 1;
+ stats->tcpe_cid = 0;
+ INIT_WORK(&stats->create_notify, create_notify_func);
+ ret = queue_work(tcp_estats_wq, &stats->create_notify);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_estats_create);
+
/* Called when the socket goes away.  Attributes the final send-limit
 * interval, schedules the delayed destroy notification (stats persist for
 * persist_delay so readers can collect final values), and drops the
 * create-time reference. */
void tcp_estats_destroy(struct sock *sk)
{
	struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;

	if (stats == NULL)
		return;

	/* Attribute final sndlim time. */
	tcp_estats_update_sndlim(tcp_sk(stats->sk), stats->limstate);

	if (tcp_estats_wq_enabled && stats->queued) {
		INIT_DELAYED_WORK(&stats->destroy_notify,
				  destroy_notify_func);
		queue_delayed_work(tcp_estats_wq, &stats->destroy_notify,
				   persist_delay);
	}
	tcp_estats_unuse(stats);
}
+
+/* Do not call directly. Called from tcp_estats_unuse() through call_rcu. */
+void tcp_estats_free(struct rcu_head *rcu)
+{
+ struct tcp_estats *stats = container_of(rcu, struct tcp_estats, rcu);
+ tcp_estats_disable();
+ kfree(stats);
+}
+EXPORT_SYMBOL(tcp_estats_free);
+
+/* Called when a connection enters the ESTABLISHED state, and has all its
+ * state initialized.
+ * net/ipv4/tcp_input.c: tcp_rcv_state_process,
+ * tcp_rcv_synsent_state_process
+ * Here we link the statistics structure in so it is visible in the /proc
+ * fs, and do some final init.
+ */
/* Called when a connection enters the ESTABLISHED state, and has all its
 * state initialized.
 * net/ipv4/tcp_input.c: tcp_rcv_state_process,
 *                       tcp_rcv_synsent_state_process
 * Here we link the statistics structure in so it is visible in the /proc
 * fs, and do some final init.
 */
void tcp_estats_establish(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_estats *stats = tp->tcp_stats;
	struct tcp_estats_connection_table *conn_table;

	if (stats == NULL)
		return;

	conn_table = stats->tables.connection_table;

	/* Let's set these here, since they can't change once the
	 * connection is established.
	 */
	conn_table->LocalPort = inet->inet_num;
	conn_table->RemPort = ntohs(inet->inet_dport);

	if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV4) {
		memcpy(&conn_table->LocalAddress.addr, &inet->inet_rcv_saddr,
		       sizeof(struct in_addr));
		memcpy(&conn_table->RemAddress.addr, &inet->inet_daddr,
		       sizeof(struct in_addr));
	}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	else if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV6) {
		memcpy(&conn_table->LocalAddress.addr6, &(sk)->sk_v6_rcv_saddr,
		       sizeof(struct in6_addr));
		/* ipv6 daddr now uses a different struct than saddr */
		memcpy(&conn_table->RemAddress.addr6, &(sk)->sk_v6_daddr,
		       sizeof(struct in6_addr));
	}
#endif
	else {
		pr_err("TCP ESTATS: AddressType not valid.\n");
	}

	/* Seed the window/timestamp statistics from current socket state. */
	tcp_estats_update_finish_segrecv(tp);
	tcp_estats_update_rwin_rcvd(tp);
	tcp_estats_update_rwin_sent(tp);

	TCP_ESTATS_VAR_SET(tp, stack_table, RecInitial, tp->rcv_nxt);

	/* Leave STARTUP: from now on the sender is nominally app-limited. */
	tcp_estats_update_sndlim(tp, TCP_ESTATS_SNDLIM_SENDER);

	if (tcp_estats_wq_enabled && stats->queued) {
		INIT_WORK(&stats->establish_notify, establish_notify_func);
		queue_work(tcp_estats_wq, &stats->establish_notify);
	}
}
+
+/*
+ * Statistics update functions
+ */
+
+void tcp_estats_update_snd_nxt(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+
+ if (stats->tables.app_table) {
+ if (after(tp->snd_nxt, stats->tables.app_table->SndMax))
+ stats->tables.app_table->SndMax = tp->snd_nxt;
+ }
+}
+
+void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+
+ if (stats->tables.app_table)
+ stats->tables.app_table->ThruOctetsAcked += ack - tp->snd_una;
+}
+
/* Record one RTT sample.  @rtt_sample is in microseconds; the table keeps
 * milliseconds.  Also tracks the RTO extremes from the current icsk_rto. */
void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample)
{
	struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;
	struct tcp_estats_path_table *path_table = stats->tables.path_table;
	unsigned long rtt_sample_msec = rtt_sample/1000;
	u32 rto;

	if (path_table == NULL)
		return;

	path_table->SampleRTT = rtt_sample_msec;

	if (rtt_sample_msec > path_table->MaxRTT)
		path_table->MaxRTT = rtt_sample_msec;
	if (rtt_sample_msec < path_table->MinRTT)
		path_table->MinRTT = rtt_sample_msec;

	/* Sum/Count allow a mean-RTT computation by readers. */
	path_table->CountRTT++;
	path_table->SumRTT += rtt_sample_msec;

	rto = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
	if (rto > path_table->MaxRTO)
		path_table->MaxRTO = rto;
	if (rto < path_table->MinRTO)
		path_table->MinRTO = rto;
}
+
/* Classify an RTO event: a backed-off timer means a subsequent timeout;
 * a timeout while the CA state is still Open is an "abrupt" one. */
void tcp_estats_update_timeout(struct sock *sk)
{
	if (inet_csk(sk)->icsk_backoff)
		TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, SubsequentTimeouts);
	else
		TCP_ESTATS_VAR_INC(tcp_sk(sk), perf_table, Timeouts);

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open)
		TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, AbruptTimeouts);
}
+
+void tcp_estats_update_mss(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_stack_table *stack_table = stats->tables.stack_table;
+ int mss = tp->mss_cache;
+
+ if (stack_table == NULL)
+ return;
+
+ if (mss > stack_table->MaxMSS)
+ stack_table->MaxMSS = mss;
+ if (mss < stack_table->MinMSS)
+ stack_table->MinMSS = mss;
+}
+
+void tcp_estats_update_finish_segrecv(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_tables *tables = &stats->tables;
+ struct tcp_estats_perf_table *perf_table = tables->perf_table;
+ struct tcp_estats_stack_table *stack_table = tables->stack_table;
+ u32 mss = tp->mss_cache;
+ u32 cwnd;
+ u32 ssthresh;
+ u32 pipe_size;
+
+#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
+ stats->current_ts = ktime_get();
+#else
+ stats->current_ts = jiffies;
+#endif
+
+ if (stack_table != NULL) {
+ cwnd = tp->snd_cwnd * mss;
+ if (tp->snd_cwnd <= tp->snd_ssthresh) {
+ if (cwnd > stack_table->MaxSsCwnd)
+ stack_table->MaxSsCwnd = cwnd;
+ } else if (cwnd > stack_table->MaxCaCwnd) {
+ stack_table->MaxCaCwnd = cwnd;
+ }
+ }
+
+ if (perf_table != NULL) {
+ pipe_size = tcp_packets_in_flight(tp) * mss;
+ if (pipe_size > perf_table->MaxPipeSize)
+ perf_table->MaxPipeSize = pipe_size;
+ }
+
+ /* Discard initiail ssthresh set at infinity. */
+ if (tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH) {
+ return;
+ }
+
+ if (stack_table != NULL) {
+ ssthresh = tp->snd_ssthresh * tp->mss_cache;
+ if (ssthresh > stack_table->MaxSsthresh)
+ stack_table->MaxSsthresh = ssthresh;
+ if (ssthresh < stack_table->MinSsthresh)
+ stack_table->MinSsthresh = ssthresh;
+ }
+}
+EXPORT_SYMBOL(tcp_estats_update_finish_segrecv);
+
+void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ u32 win = tp->snd_wnd;
+
+ if (perf_table == NULL)
+ return;
+
+ if (win > perf_table->MaxRwinRcvd)
+ perf_table->MaxRwinRcvd = win;
+ if (win == 0)
+ perf_table->ZeroRwinRcvd++;
+}
+
+void tcp_estats_update_rwin_sent(struct tcp_sock *tp)
+{
+ struct tcp_estats *stats = tp->tcp_stats;
+ struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
+ u32 win = tp->rcv_wnd;
+
+ if (perf_table == NULL)
+ return;
+
+ if (win > perf_table->MaxRwinSent)
+ perf_table->MaxRwinSent = win;
+ if (win == 0)
+ perf_table->ZeroRwinSent++;
+}
+
/* Transition the send-limit state machine to @state.  Time spent in the
 * previous state (microseconds) is charged to snd_lim_time[]; an actual
 * state change also bumps the transition counter for the new state. */
void tcp_estats_update_sndlim(struct tcp_sock *tp,
			      enum tcp_estats_sndlim_states state)
{
	struct tcp_estats *stats = tp->tcp_stats;
	struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
	ktime_t now;

	if (state <= TCP_ESTATS_SNDLIM_NONE ||
	    state >= TCP_ESTATS_SNDLIM_NSTATES) {
		pr_err("tcp_estats_update_sndlim: BUG: state out of range %d\n",
		       state);
		return;
	}

	if (perf_table == NULL)
		return;

	now = ktime_get();
	perf_table->snd_lim_time[stats->limstate]
		+= ktime_to_us(ktime_sub(now, stats->limstate_ts));
	stats->limstate_ts = now;
	if (stats->limstate != state) {
		stats->limstate = state;
		perf_table->snd_lim_trans[state]++;
	}
}
+
/* Called on a congestion signal: count it and snapshot the pre-congestion
 * cwnd (bytes) and last RTT sample. */
void tcp_estats_update_congestion(struct tcp_sock *tp)
{
	struct tcp_estats *stats = tp->tcp_stats;
	struct tcp_estats_path_table *path_table = stats->tables.path_table;

	TCP_ESTATS_VAR_INC(tp, perf_table, CongSignals);

	if (path_table != NULL) {
		path_table->PreCongSumCwnd += tp->snd_cwnd * tp->mss_cache;
		path_table->PreCongSumRTT += path_table->SampleRTT;
	}
}

/* Called on the first RTT sample after a congestion episode; accumulates
 * the post-congestion RTT statistics. */
void tcp_estats_update_post_congestion(struct tcp_sock *tp)
{
	struct tcp_estats *stats = tp->tcp_stats;
	struct tcp_estats_path_table *path_table = stats->tables.path_table;

	if (path_table != NULL) {
		path_table->PostCongCountRTT++;
		path_table->PostCongSumRTT += path_table->SampleRTT;
	}
}
+
/* Account an outgoing transmission of @pcount segments covering
 * [@seq, @end_seq); @flags are the TCP header flags.  Detects
 * retransmissions by comparing @seq against SndMax (or icsk_retransmits
 * for SYNs). */
void tcp_estats_update_segsend(struct sock *sk, int pcount,
			       u32 seq, u32 end_seq, int flags)
{
	struct tcp_estats *stats = tcp_sk(sk)->tcp_stats;
	struct tcp_estats_perf_table *perf_table = stats->tables.perf_table;
	struct tcp_estats_app_table *app_table = stats->tables.app_table;

	int data_len = end_seq - seq;

#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME
	stats->current_ts = ktime_get();
#else
	stats->current_ts = jiffies;
#endif

	if (perf_table == NULL)
		return;

	/* We know we're sending a segment. */
	perf_table->SegsOut += pcount;

	/* A pure ACK contains no data; everything else is data. */
	if (data_len > 0) {
		perf_table->DataSegsOut += pcount;
		perf_table->DataOctetsOut += data_len;
	}

	/* Check for retransmission. */
	if (flags & TCPHDR_SYN) {
		if (inet_csk(sk)->icsk_retransmits)
			perf_table->SegsRetrans++;
	} else if (app_table != NULL &&
		   before(seq, app_table->SndMax)) {
		perf_table->SegsRetrans += pcount;
		perf_table->OctetsRetrans += data_len;
	}
}
+
/* Account an incoming segment: pure-ACK vs data classification, duplicate
 * ACK detection, and the IP header TTL/TOS of the last segment seen. */
void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb)
{
	struct tcp_estats_tables *tables = &tp->tcp_stats->tables;
	struct tcp_estats_path_table *path_table = tables->path_table;
	struct tcp_estats_perf_table *perf_table = tables->perf_table;
	struct tcp_estats_stack_table *stack_table = tables->stack_table;
	struct tcphdr *th = tcp_hdr(skb);
	struct iphdr *iph = ip_hdr(skb);

	if (perf_table != NULL)
		perf_table->SegsIn++;

	/* Header-only segment: a duplicate ACK if it acks nothing new. */
	if (skb->len == th->doff * 4) {
		if (stack_table != NULL &&
		    TCP_SKB_CB(skb)->ack_seq == tp->snd_una)
			stack_table->DupAcksIn++;
	} else {
		if (perf_table != NULL) {
			perf_table->DataSegsIn++;
			perf_table->DataOctetsIn += skb->len - th->doff * 4;
		}
	}

	if (path_table != NULL) {
		path_table->IpTtl = iph->ttl;
		path_table->IpTosIn = iph->tos;
	}
}
EXPORT_SYMBOL(tcp_estats_update_segrecv);
+
/* Credit newly received in-order octets to ThruOctetsReceived. */
void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq)
{
	/* After much debate, it was decided that "seq - rcv_nxt" is
	   indeed what we want, as opposed to what Krishnan suggested
	   to better match the RFC: "seq - tp->rcv_wup" */
	TCP_ESTATS_VAR_ADD(tp, app_table, ThruOctetsReceived,
			   seq - tp->rcv_nxt);
}
+
+void tcp_estats_update_writeq(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_estats_app_table *app_table =
+ tp->tcp_stats->tables.app_table;
+ int len;
+
+ if (app_table == NULL)
+ return;
+
+ len = tp->write_seq - app_table->SndMax;
+
+ if (len > app_table->MaxAppWQueue)
+ app_table->MaxAppWQueue = len;
+}
+
/* Span in sequence space covered by the out-of-order queue, or 0 when the
 * queue is empty (first seq to last end_seq, so holes are included). */
static inline u32 ofo_qlen(struct tcp_sock *tp)
{
	if (!skb_peek(&tp->out_of_order_queue))
		return 0;
	else
		return TCP_SKB_CB(tp->out_of_order_queue.prev)->end_seq -
			TCP_SKB_CB(tp->out_of_order_queue.next)->seq;
}
+
+void tcp_estats_update_recvq(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_estats_tables *tables = &tp->tcp_stats->tables;
+ struct tcp_estats_app_table *app_table = tables->app_table;
+ struct tcp_estats_stack_table *stack_table = tables->stack_table;
+
+ if (app_table != NULL) {
+ u32 len = tp->rcv_nxt - tp->copied_seq;
+ if (app_table->MaxAppRQueue < len)
+ app_table->MaxAppRQueue = len;
+ }
+
+ if (stack_table != NULL) {
+ u32 len = ofo_qlen(tp);
+ if (stack_table->MaxReasmQueue < len)
+ stack_table->MaxReasmQueue = len;
+ }
+}
+
+/*
+ * Manage connection ID table
+ */
+
+static int get_new_cid(struct tcp_estats *stats)
+{
+ int id_cid;
+
+again:
+ spin_lock_bh(&tcp_estats_idr_lock);
+ id_cid = idr_alloc(&tcp_estats_idr, stats, next_id, 0, GFP_KERNEL);
+ if (unlikely(id_cid == -ENOSPC)) {
+ spin_unlock_bh(&tcp_estats_idr_lock);
+ goto again;
+ }
+ if (unlikely(id_cid == -ENOMEM)) {
+ spin_unlock_bh(&tcp_estats_idr_lock);
+ return -ENOMEM;
+ }
+ next_id = (id_cid + 1) % ESTATS_MAX_CID;
+ stats->tcpe_cid = id_cid;
+ spin_unlock_bh(&tcp_estats_idr_lock);
+ return 0;
+}
+
/* Workqueue handler run when a connection is created. */
static void create_func(struct work_struct *work)
{
	/* stub for netlink notification of new connections */
	;
}

/* Workqueue handler run when a connection reaches ESTABLISHED: allocates
 * the connection id (tcpe_cid).  tcpe_cid == 0 means "queued, id pending";
 * > 0 means already established, which should not happen twice. */
static void establish_func(struct work_struct *work)
{
	struct tcp_estats *stats = container_of(work, struct tcp_estats,
						establish_notify);
	int err = 0;

	if ((stats->tcpe_cid) > 0) {
		pr_err("TCP estats container established multiple times.\n");
		return;
	}

	if ((stats->tcpe_cid) == 0) {
		err = get_new_cid(stats);
		if (err)
			pr_devel("get_new_cid error %d\n", err);
	}
}
+
/* Delayed workqueue handler run persist_delay after the socket is
 * destroyed: removes the id from the idr (if one was assigned) and drops
 * the workqueue's reference taken in tcp_estats_create(). */
static void destroy_func(struct work_struct *work)
{
	struct tcp_estats *stats = container_of(work, struct tcp_estats,
						destroy_notify.work);

	int id_cid = stats->tcpe_cid;

	if (id_cid == 0)
		pr_devel("TCP estats destroyed before being established.\n");

	if (id_cid >= 0) {
		if (id_cid) {
			spin_lock_bh(&tcp_estats_idr_lock);
			idr_remove(&tcp_estats_idr, id_cid);
			spin_unlock_bh(&tcp_estats_idr_lock);
		}
		stats->tcpe_cid = -1;

		tcp_estats_unuse(stats);
	}
}
+
+void __init tcp_estats_init()
+{
+ idr_init(&tcp_estats_idr);
+
+ create_notify_func = &create_func;
+ establish_notify_func = &establish_func;
+ destroy_notify_func = &destroy_func;
+
+ persist_delay = TCP_ESTATS_PERSIST_DELAY_SECS * HZ;
+
+ tcp_estats_wq = alloc_workqueue("tcp_estats", WQ_MEM_RECLAIM, 256);
+ if (tcp_estats_wq == NULL) {
+ pr_err("tcp_estats_init(): alloc_workqueue failed\n");
+ goto cleanup_fail;
+ }
+
+ tcp_estats_wq_enabled = 1;
+ return;
+
+cleanup_fail:
+ pr_err("TCP ESTATS: initialization failed.\n");
+}