@@ -426,13 +426,46 @@ extern void skb_dma_unmap(struct device *dev, struct sk_buff *skb,
enum dma_data_direction dir);
#endif
+/*
+ * skb might have a dst pointer attached, refcounted or not
+ * _skb_dst low order bit is set if refcount was taken
+ */
+#define SKB_DST_REFTAKEN 1UL
+#define SKB_DST_PTRMASK ~1UL
+/**
+ * skb_dst - returns skb dst
+ * @skb: buffer
+ *
+ * Returns skb dst, regardless of reference taken or not.
+ */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
- return (struct dst_entry *)skb->_skb_dst;
+ return (struct dst_entry *)(skb->_skb_dst & SKB_DST_PTRMASK);
}
+/**
+ * skb_dst_set - sets skb dst
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was taken on dst and should
+ * be released by skb_dst_drop()
+ */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
+ skb->_skb_dst = (unsigned long)dst | SKB_DST_REFTAKEN;
+}
+
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
skb->_skb_dst = (unsigned long)dst;
}
@@ -179,13 +179,18 @@ static inline void dst_hold(struct dst_entry * dst)
atomic_inc(&dst->__refcnt);
}
-static inline void dst_use(struct dst_entry *dst, unsigned long time)
+static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
- dst_hold(dst);
dst->__use++;
dst->lastuse = time;
}
+static inline void dst_use(struct dst_entry *dst, unsigned long time)
+{
+ dst_hold(dst);
+ dst_use_noref(dst, time);
+}
+
static inline
struct dst_entry * dst_clone(struct dst_entry * dst)
{
@@ -195,13 +200,45 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
}
extern void dst_release(struct dst_entry *dst);
+
+static inline void __skb_dst_drop(unsigned long _skb_dst)
+{
+ if (_skb_dst & SKB_DST_REFTAKEN)
+ dst_release((struct dst_entry *)(_skb_dst & SKB_DST_PTRMASK));
+}
+/**
+ * skb_dst_drop - drops skb dst
+ * @skb: buffer
+ *
+ * Drops dst reference count if low order bit is set.
+ */
static inline void skb_dst_drop(struct sk_buff *skb)
{
- if (skb->_skb_dst)
- dst_release(skb_dst(skb));
+ __skb_dst_drop(skb->_skb_dst);
skb->_skb_dst = 0UL;
}
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+ nskb->_skb_dst = oskb->_skb_dst;
+ if (nskb->_skb_dst & SKB_DST_REFTAKEN)
+ dst_clone(skb_dst(nskb));
+}
+
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+ if (!(skb->_skb_dst & SKB_DST_REFTAKEN)) {
+ dst_clone(skb_dst(skb));
+ skb->_skb_dst |= SKB_DST_REFTAKEN;
+ }
+}
+
/* Children define the path of the packet through the
* Linux networking. Thus, destinations are stackable.
*/
@@ -378,6 +378,7 @@ extern int ip_options_rcv_srr(struct sk_buff *skb);
* Functions provided by ip_sockglue.c
*/
+extern int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
extern void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb);
extern int ip_cmsg_send(struct net *net,
struct msghdr *msg, struct ipcm_cookie *ipc);
@@ -115,7 +115,8 @@ extern void rt_cache_flush(struct net *net, int how);
extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
extern int ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
-extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
+extern int ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos,
+ struct net_device *devin, int noref);
extern unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
extern void ip_rt_send_redirect(struct sk_buff *skb);
@@ -349,7 +349,7 @@ static int br_nf_pre_routing_finish(struct sk_buff *skb)
}
nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
if (dnat_took_place(skb)) {
- if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+ if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev, 1))) {
struct flowi fl = {
.nl_u = {
.ip4_u = {
@@ -1700,9 +1700,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
* If device doesnt need skb->dst, release it right now while
* its hot in this cpu cache
*/
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(skb);
-
+ if (skb_dst(skb)) {
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ /*
+ * make sure dst was refcounted by caller
+ */
+ WARN_ON(!(skb->_skb_dst & SKB_DST_REFTAKEN));
+ }
rc = ops->ndo_start_xmit(skb, dev);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
@@ -1861,6 +1867,12 @@ gso:
if (q->enqueue) {
spinlock_t *root_lock = qdisc_lock(q);
+ /*
+ * before queueing skb, we must enforce a refcounted dst
+ */
+ if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+ skb_dst_force(skb);
+
spin_lock(root_lock);
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
@@ -530,7 +530,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->transport_header = old->transport_header;
new->network_header = old->network_header;
new->mac_header = old->mac_header;
- skb_dst_set(new, dst_clone(skb_dst(old)));
+ skb_dst_copy(new, old);
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
@@ -304,6 +304,10 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* from the queue.
*/
skb_len = skb->len;
+ /*
+ * If we still have a dst entry linked, make sure we refcounted it
+ */
+ skb_dst_force(skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
@@ -812,7 +812,7 @@ static int arp_process(struct sk_buff *skb)
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
- ip_route_input(skb, tip, sip, 0, dev) == 0) {
+ ip_route_input(skb, tip, sip, 0, dev, 0) == 0) {
rt = skb_rtable(skb);
addr_type = rt->rt_type;
@@ -584,20 +584,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
err = __ip_route_output_key(net, &rt2, &fl);
else {
struct flowi fl2 = {};
- struct dst_entry *odst;
+ unsigned long odst;
fl2.fl4_dst = fl.fl4_src;
if (ip_route_output_key(net, &rt2, &fl2))
goto relookup_failed;
/* Ugh! */
- odst = skb_dst(skb_in);
+ odst = skb_in->_skb_dst; /* save old dst */
err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
- RT_TOS(tos), rt2->u.dst.dev);
+ RT_TOS(tos), rt2->u.dst.dev, 0);
dst_release(&rt2->u.dst);
rt2 = skb_rtable(skb_in);
- skb_dst_set(skb_in, odst);
+ skb_in->_skb_dst = odst; /* restore old dst */
}
if (err)
@@ -331,7 +331,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev);
+ skb->dev, 1);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
@@ -600,6 +600,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
unsigned char *optptr = skb_network_header(skb) + opt->srr;
struct rtable *rt = skb_rtable(skb);
struct rtable *rt2;
+ unsigned long odst;
int err;
if (!opt->srr)
@@ -623,16 +624,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- rt = skb_rtable(skb);
+ odst = skb->_skb_dst;
skb_dst_set(skb, NULL);
- err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev, 0);
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
- ip_rt_put(rt2);
- skb_dst_set(skb, &rt->u.dst);
+ skb_dst_drop(skb);
+ skb->_skb_dst = odst;
return -EINVAL;
}
- ip_rt_put(rt);
+ __skb_dst_drop(odst);
if (rt2->rt_type != RTN_LOCAL)
break;
/* Superfast 8) loopback forward */
@@ -946,6 +946,22 @@ e_inval:
return -EINVAL;
}
+/**
+ * ip_queue_rcv_skb - Queue an skb into sock receive queue
+ * @sk: socket
+ * @skb: buffer
+ *
+ * Queues an skb into socket receive queue. If IP_CMSG_PKTINFO option
+ * is not set, we drop skb dst entry now.
+ */
+int ip_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (!(inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO))
+ skb_dst_drop(skb);
+ return sock_queue_rcv_skb(sk, skb);
+}
+EXPORT_SYMBOL(ip_queue_rcv_skb);
+
int ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, int optlen)
{
@@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi fl = {};
- struct dst_entry *odst;
+ unsigned long odst;
unsigned int hh_len;
unsigned int type;
@@ -50,14 +50,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
if (ip_route_output_key(net, &rt, &fl) != 0)
return -1;
- odst = skb_dst(skb);
+ odst = skb->_skb_dst;
if (ip_route_input(skb, iph->daddr, iph->saddr,
- RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+ RT_TOS(iph->tos), rt->u.dst.dev, 0) != 0) {
dst_release(&rt->u.dst);
return -1;
}
dst_release(&rt->u.dst);
- dst_release(odst);
+ __skb_dst_drop(odst);
}
if (skb_dst(skb)->error)
@@ -291,7 +291,7 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
{
/* Charge it to the socket. */
- if (sock_queue_rcv_skb(sk, skb) < 0) {
+ if (ip_queue_rcv_skb(sk, skb) < 0) {
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return NET_RX_DROP;
@@ -2255,7 +2255,7 @@ martian_source:
}
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
+ u8 tos, struct net_device *dev, int noref)
{
struct rtable * rth;
unsigned hash;
@@ -2281,10 +2281,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->fl.mark == skb->mark &&
net_eq(dev_net(rth->u.dst.dev), net) &&
!rt_is_expired(rth)) {
- dst_use(&rth->u.dst, jiffies);
+ if (noref) {
+ dst_use_noref(&rth->u.dst, jiffies);
+ skb_dst_set_noref(skb, &rth->u.dst);
+ } else {
+ dst_use(&rth->u.dst, jiffies);
+ skb_dst_set(skb, &rth->u.dst);
+ }
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
- skb_dst_set(skb, &rth->u.dst);
return 0;
}
RT_CACHE_STAT_INC(in_hlist_search);
@@ -2944,7 +2949,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
skb->protocol = htons(ETH_P_IP);
skb->dev = dev;
local_bh_disable();
- err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+ err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev, 0);
local_bh_enable();
rt = skb_rtable(skb);
@@ -3008,7 +3013,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (rt_is_expired(rt))
continue;
- skb_dst_set(skb, dst_clone(&rt->u.dst));
+ skb_dst_set_noref(skb, &rt->u.dst);
if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq, RTM_NEWROUTE,
1, NLM_F_MULTI) <= 0) {
@@ -4297,6 +4297,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
goto drop;
+ skb_dst_drop(skb);
__skb_pull(skb, th->doff * 4);
TCP_ECN_accept_cwr(tp, skb);
@@ -1024,7 +1024,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
int is_udplite = IS_UDPLITE(sk);
int rc;
- if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
+ if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
@@ -27,7 +27,7 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
- skb->dev))
+ skb->dev, 1))
goto drop;
}
return dst_input(skb);
@@ -564,7 +564,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
} else {
ip_rt_put(rt);
if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
- skb2->dev) ||
+ skb2->dev, 0) ||
skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
goto out;
}
@@ -385,7 +385,7 @@ int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
goto drop;
}
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
+ if ((rc = ip_queue_rcv_skb(sk, skb)) < 0) {
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM) {
UDP6_INC_STATS_BH(sock_net(sk),
@@ -169,6 +169,7 @@ static int __nf_queue(struct sk_buff *skb,
dev_hold(physoutdev);
}
#endif
+ skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);