net: better IFF_XMIT_DST_RELEASE support
While testing xmit_more support with netperf and connected UDP sockets,
I found strange false sharing on the dst refcount.
The current handling of IFF_XMIT_DST_RELEASE is not optimal.
Dropping the dst in validate_xmit_skb() is certainly too late when the
packet was queued by cpu X but dequeued by cpu Y.
The logical point to take care of the drop/force is in __dev_queue_xmit(),
before even taking the qdisc lock.
As Julian Anastasov pointed out, the need for skb_dst() might also come
from some packet schedulers or classifiers.
This patch adds a new helper to cleanly express the needs of various
drivers or qdiscs/classifiers.
Drivers that need skb_dst() in their ndo_start_xmit() should call the
following helper in their setup, instead of the previous open-coded form:
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
->
netif_keep_dst(dev);
Instead of using a single bit, we now use two bits. The first one,
IFF_XMIT_DST_RELEASE, may be rebuilt by the bonding/team drivers.
The second one, IFF_XMIT_DST_RELEASE_PERM, is permanent: once cleared,
it blocks IFF_XMIT_DST_RELEASE from being rebuilt in bonding/team.
We could add something smarter later.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 13e6e04..58b5aa3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1364,7 +1364,7 @@
dev->tx_queue_len = ipoib_sendq_size * 2;
dev->features = (NETIF_F_VLAN_CHALLENGED |
NETIF_F_HIGHDMA);
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c
index 10d0dba..e90c6a7 100644
--- a/drivers/net/appletalk/ipddp.c
+++ b/drivers/net/appletalk/ipddp.c
@@ -74,7 +74,7 @@
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
strcpy(dev->name, "ipddp%d");
if (version_printed++ == 0)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3ad5413..c9ac06c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1002,7 +1002,8 @@
static void bond_compute_features(struct bonding *bond)
{
- unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+ IFF_XMIT_DST_RELEASE_PERM;
netdev_features_t vlan_features = BOND_VLAN_FEATURES;
netdev_features_t enc_features = BOND_ENC_FEATURES;
struct net_device *bond_dev = bond->dev;
@@ -1038,8 +1039,10 @@
bond_dev->gso_max_segs = gso_max_segs;
netif_set_gso_max_size(bond_dev, gso_max_size);
- flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
- bond_dev->priv_flags = flags | dst_release_flag;
+ bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
netdev_change_features(bond_dev);
}
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index 957e5c0..a10ad74 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -199,7 +199,7 @@
dev->type = ARPHRD_SLIP;
dev->tx_queue_len = 5; /* Hands them off fast */
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
static int eql_open(struct net_device *dev)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index d2d4a3d..34f846b 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -185,7 +185,8 @@
dev->flags |= IFF_NOARP;
dev->flags &= ~IFF_MULTICAST;
- dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netif_keep_dst(dev);
eth_hw_addr_random(dev);
}
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 8f22625..c76283c 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -169,7 +169,7 @@
dev->type = ARPHRD_LOOPBACK; /* 0x0001*/
dev->flags = IFF_LOOPBACK;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
| NETIF_F_ALL_TSO
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index e8a453f..38b4fae 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1025,7 +1025,8 @@
{
ether_setup(dev);
- dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netif_keep_dst(dev);
dev->priv_flags |= IFF_UNICAST_FLT;
dev->netdev_ops = &macvlan_netdev_ops;
dev->destructor = free_netdev;
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index fa0d717..80e6f34 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1103,7 +1103,7 @@
dev->type = ARPHRD_PPP;
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
dev->features |= NETIF_F_NETNS_LOCAL;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
/*
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 2277c36..a94a9df 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -970,7 +970,8 @@
struct team_port *port;
u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
unsigned short max_hard_header_len = ETH_HLEN;
- unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
+ IFF_XMIT_DST_RELEASE_PERM;
list_for_each_entry(port, &team->port_list, list) {
vlan_features = netdev_increment_features(vlan_features,
@@ -985,8 +986,13 @@
team->dev->vlan_features = vlan_features;
team->dev->hard_header_len = max_hard_header_len;
- flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
- team->dev->priv_flags = flags | dst_release_flag;
+ /* Only re-enable IFF_XMIT_DST_RELEASE if the team device itself still
+ * has IFF_XMIT_DST_RELEASE_PERM, i.e. netif_keep_dst() was not called
+ * on it. This is the same check bond_compute_features() performs.
+ */
+ team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((team->dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ team->dev->priv_flags |= IFF_XMIT_DST_RELEASE;
netdev_change_features(team->dev);
}
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 2af795d..2a51e6e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2193,7 +2193,7 @@
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
INIT_LIST_HEAD(&vxlan->next);
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index e5c7e61..3ebed1c 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1047,7 +1047,7 @@
dev->flags = IFF_POINTOPOINT;
dev->hard_header_len = 10;
dev->addr_len = 2;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
static const struct net_device_ops pvc_ops = {
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index f8427a2..afebb97 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -3306,7 +3306,7 @@
card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_CTAG_FILTER;
- card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(card->dev);
card->dev->gso_max_size = 15 * PAGE_SIZE;
SET_NETDEV_DEV(card->dev, &card->gdev->dev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2df86f5..3a4315b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1206,6 +1206,7 @@
IFF_SUPP_NOFCS = 1<<19,
IFF_LIVE_ADDR_CHANGE = 1<<20,
IFF_MACVLAN = 1<<21,
+ IFF_XMIT_DST_RELEASE_PERM = 1<<22,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@@ -1230,6 +1231,7 @@
#define IFF_SUPP_NOFCS IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN IFF_MACVLAN
+#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM
/**
* struct net_device - The DEVICE structure.
@@ -3588,6 +3590,12 @@
return dev->priv_flags & IFF_SUPP_NOFCS;
}
+/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit().
+ * Clears both IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM in
+ * dev->priv_flags.
+ */
+static inline void netif_keep_dst(struct net_device *dev)
+{
+ dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
+}
+
extern struct pernet_operations __net_initdata loopback_net_ops;
/* Logging, debugging and troubleshooting/diagnostic helpers. */
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 35a6b6b..0d441ec 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -799,7 +799,8 @@
ether_setup(dev);
dev->priv_flags |= IFF_802_1Q_VLAN;
- dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ netif_keep_dst(dev);
dev->tx_queue_len = 0;
dev->netdev_ops = &vlan_netdev_ops;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 1d9eaa4..17e55df 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -501,7 +501,7 @@
/* without any more elaborate queuing. 100 is a reasonable */
/* compromise between decent burst-tolerance and protection */
/* against memory hogs. */
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
static int clip_create(int number)
diff --git a/net/core/dev.c b/net/core/dev.c
index a63b8c4..3c5bdaa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2665,12 +2665,6 @@
if (skb->next)
return skb;
- /* If device doesn't need skb->dst, release it right now while
- * its hot in this cpu cache
- */
- if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
- skb_dst_drop(skb);
-
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -2811,8 +2805,6 @@
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
- if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
- skb_dst_force(skb);
qdisc_bstats_update(q, skb);
@@ -2827,7 +2819,6 @@
rc = NET_XMIT_SUCCESS;
} else {
- skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
@@ -2924,6 +2915,14 @@
skb_update_prio(skb);
+ /* If device/qdisc don't need skb->dst, release it right now while
+ * it's hot in this cpu cache; otherwise force it via skb_dst_force().
+ */
+ if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
+ skb_dst_drop(skb);
+ else
+ skb_dst_force(skb);
+
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@@ -6674,7 +6673,7 @@
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
- dev->priv_flags = IFF_XMIT_DST_RELEASE;
+ dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
dev->num_tx_queues = txqs;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0485ef1..12055fd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -510,7 +510,7 @@
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->addr_len = 4;
if (iph->daddr) {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index e453cb7..3e86101 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -364,7 +364,7 @@
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
return ip_tunnel_init(dev);
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ea88ab3..37096d6 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -289,7 +289,7 @@
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 74b6779..de3b1c8 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1242,7 +1242,7 @@
dev->flags |= IFF_NOARP;
dev->iflink = 0;
dev->addr_len = sizeof(struct in6_addr);
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
static int ip6gre_tunnel_init(struct net_device *dev)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d3e8888..9409887 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1493,7 +1493,7 @@
dev->mtu -= 8;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
/* This perm addr will be used as interface identifier by IPv6 */
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->perm_addr);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 5833a22..d440bb5 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -807,7 +807,7 @@
dev->mtu = ETH_DATA_LEN;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
/**
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 0d4e274..6eab37c 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1364,7 +1364,7 @@
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
dev->flags = IFF_NOARP;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index a5d2b20..4ac515f 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -493,6 +493,8 @@
tcf_exts_change(tp, &fnew->exts, &e);
tcf_em_tree_change(tp, &fnew->ematches, &t);
+ netif_keep_dst(qdisc_dev(tp->q));
+
if (tb[TCA_FLOW_KEYS]) {
fnew->keymask = keymask;
fnew->nkeys = nkeys;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 6f22baa..109a329 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -524,6 +524,7 @@
if (f->handle < f1->handle)
break;
+ netif_keep_dst(qdisc_dev(tp->q));
rcu_assign_pointer(f->next, f1);
rcu_assign_pointer(*fp, f);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2b349a4..38d58e6 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -47,7 +47,6 @@
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
- skb_dst_force(skb);
q->gso_skb = skb;
q->qstats.requeues++;
q->q.qlen++; /* it's still part of the queue */
@@ -218,8 +217,6 @@
if (unlikely(!skb))
return 0;
- WARN_ON_ONCE(skb_dst_is_noref(skb));
-
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 5cd291b..6ada423 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -470,7 +470,7 @@
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
- dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ netif_keep_dst(dev);
}
static LIST_HEAD(master_dev_list);