Merge branch 'net-nexthop-increase-weight-to-u16'

Petr Machata says:

====================
net: nexthop: Increase weight to u16

In CLOS networks, as link failures occur at various points in the network,
ECMP weights of the involved nodes are adjusted to compensate. With high
fan-out of the involved nodes, and overall high number of nodes,
a (non-)ECMP weight ratio that we would like to configure does not fit into
8 bits. Instead of, say, 255:254, we might like to configure something like
1000:999. For these deployments, the 8-bit weight may not be enough.

To that end, in this patchset increase the next hop weight from u8 to u16.

Patch #1 adds a flag that indicates whether the reserved fields are zeroed.
This is a follow-up to a new fix merged in commit 6d745cd0e9 ("net:
nexthop: Initialize all fields in dumped nexthops"). The theory behind this
patch is that there is a strict ordering between the fields actually being
zeroed, the kernel declaring that they are, and the kernel repurposing the
fields. Thus clients can use the flag to tell if it is safe to interpret
the reserved fields in any way.

Patch #2 contains the substantial code and the commit message covers the
details of the changes.

Patches #3 to #6 add selftests.
====================

Link: https://patch.msgid.link/cover.1723036486.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2024-08-12 17:50:36 -07:00
commit e96f6fd30e
9 changed files with 201 additions and 37 deletions

View File

@ -105,7 +105,7 @@ struct nh_grp_entry_stats {
struct nh_grp_entry {
struct nexthop *nh;
struct nh_grp_entry_stats __percpu *stats;
u8 weight;
u16 weight;
union {
struct {
@ -192,7 +192,7 @@ struct nh_notifier_single_info {
};
struct nh_notifier_grp_entry_info {
u8 weight;
u16 weight;
struct nh_notifier_single_info nh;
};

View File

@ -16,10 +16,15 @@ struct nhmsg {
struct nexthop_grp {
__u32 id; /* nexthop id - must exist */
__u8 weight; /* weight of this nexthop */
__u8 resvd1;
__u8 weight_high; /* high order bits of weight */
__u16 resvd2;
};
static inline __u16 nexthop_grp_weight(const struct nexthop_grp *entry)
{
return ((entry->weight_high << 8) | entry->weight) + 1;
}
enum {
NEXTHOP_GRP_TYPE_MPATH, /* hash-threshold nexthop group
* default type if not specified
@ -33,6 +38,9 @@ enum {
#define NHA_OP_FLAG_DUMP_STATS BIT(0)
#define NHA_OP_FLAG_DUMP_HW_STATS BIT(1)
/* Response OP_FLAGS. */
#define NHA_OP_FLAG_RESP_GRP_RESVD_0 BIT(31) /* Dump clears resvd fields. */
enum {
NHA_UNSPEC,
NHA_ID, /* u32; id for nexthop. id == 0 means auto-assign */

View File

@ -865,15 +865,18 @@ static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
}
static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
u32 op_flags)
u32 op_flags, u32 *resp_op_flags)
{
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
struct nexthop_grp *p;
size_t len = nhg->num_nh * sizeof(*p);
struct nlattr *nla;
u16 group_type = 0;
u16 weight;
int i;
*resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;
if (nhg->hash_threshold)
group_type = NEXTHOP_GRP_TYPE_MPATH;
else if (nhg->resilient)
@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
p = nla_data(nla);
for (i = 0; i < nhg->num_nh; ++i) {
weight = nhg->nh_entries[i].weight - 1;
*p++ = (struct nexthop_grp) {
.id = nhg->nh_entries[i].nh->id,
.weight = nhg->nh_entries[i].weight - 1,
.weight = weight,
.weight_high = weight >> 8,
};
}
@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
u32 resp_op_flags = 0;
if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
goto nla_put_failure;
if (nla_put_nh_group(skb, nh, op_flags))
if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
goto nla_put_failure;
goto out;
}
@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh)
sz += nla_total_size(4); /* NHA_ID */
if (nh->is_group)
sz += nh_nlmsg_size_grp(nh);
sz += nh_nlmsg_size_grp(nh) +
nla_total_size(4) + /* NHA_OP_FLAGS */
0;
else
sz += nh_nlmsg_size_single(nh);
@ -1280,11 +1290,14 @@ static int nh_check_attr_group(struct net *net,
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
if (nhg[i].resvd1 || nhg[i].resvd2) {
NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0");
if (nhg[i].resvd2) {
NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
return -EINVAL;
}
if (nhg[i].weight > 254) {
if (nexthop_grp_weight(&nhg[i]) == 0) {
/* 0xffff got passed in, representing weight of 0x10000,
* which is too heavy.
*/
NL_SET_ERR_MSG(extack, "Invalid value for weight");
return -EINVAL;
}
@ -1880,9 +1893,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
static void nh_res_group_rebalance(struct nh_group *nhg,
struct nh_res_table *res_table)
{
int prev_upper_bound = 0;
int total = 0;
int w = 0;
u16 prev_upper_bound = 0;
u32 total = 0;
u32 w = 0;
int i;
INIT_LIST_HEAD(&res_table->uw_nh_entries);
@ -1892,11 +1905,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg,
for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
int upper_bound;
u16 upper_bound;
u64 btw;
w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w,
total);
btw = ((u64)res_table->num_nh_buckets) * w;
upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total);
nhge->res.wants_buckets = upper_bound - prev_upper_bound;
prev_upper_bound = upper_bound;
@ -1962,8 +1976,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg,
static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
int total = 0;
int w = 0;
u32 total = 0;
u32 w = 0;
int i;
for (i = 0; i < nhg->num_nh; ++i)
@ -1971,7 +1985,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)
for (i = 0; i < nhg->num_nh; ++i) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
int upper_bound;
u32 upper_bound;
w += nhge->weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
@ -2713,7 +2727,8 @@ static struct nexthop *nexthop_create_group(struct net *net,
goto out_no_nh;
}
nhg->nh_entries[i].nh = nhe;
nhg->nh_entries[i].weight = entry[i].weight + 1;
nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);
list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
nhg->nh_entries[i].nh_parent = nh;
}

View File

@ -78,7 +78,12 @@ log_test()
else
ret=1
nfail=$((nfail+1))
printf "TEST: %-60s [FAIL]\n" "${msg}"
if [[ $rc -eq $ksft_skip ]]; then
printf "TEST: %-60s [SKIP]\n" "${msg}"
else
printf "TEST: %-60s [FAIL]\n" "${msg}"
fi
if [ "$VERBOSE" = "1" ]; then
echo " rc=$rc, expected $expected"
fi
@ -923,6 +928,29 @@ ipv6_grp_fcnal()
ipv6_grp_refs
log_test $? 0 "Nexthop group replace refcounts"
#
# 16-bit weights.
#
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
run_cmd "$IP nexthop add id 66 dev veth1"
run_cmd "$IP nexthop add id 103 group 62,1000"
if [[ $? == 0 ]]; then
local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535"
run_cmd "$IP nexthop replace $GRP"
check_nexthop "id 103" "$GRP"
rc=$?
else
rc=$ksft_skip
fi
$IP nexthop flush >/dev/null 2>&1
log_test $rc 0 "16-bit weights"
}
ipv6_res_grp_fcnal()
@ -987,6 +1015,31 @@ ipv6_res_grp_fcnal()
check_nexthop_bucket "list id 102" \
"id 102 index 0 nhid 63 id 102 index 1 nhid 62 id 102 index 2 nhid 62 id 102 index 3 nhid 62"
log_test $? 0 "Nexthop buckets updated after replace - nECMP"
#
# 16-bit weights.
#
run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
run_cmd "$IP nexthop add id 66 dev veth1"
run_cmd "$IP nexthop add id 103 group 62,1000 type resilient buckets 32"
if [[ $? == 0 ]]; then
local GRP="id 103 group 62,254/63,255/64,256/65,257/66,65535 $(:
)type resilient buckets 32 idle_timer 0 $(:
)unbalanced_timer 0"
run_cmd "$IP nexthop replace $GRP"
check_nexthop "id 103" "$GRP unbalanced_time 0"
rc=$?
else
rc=$ksft_skip
fi
$IP nexthop flush >/dev/null 2>&1
log_test $rc 0 "16-bit weights"
}
ipv6_fcnal_runtime()

View File

@ -509,6 +509,13 @@ xfail_on_slow()
fi
}
omit_on_slow()
{
if [[ $KSFT_MACHINE_SLOW != yes ]]; then
"$@"
fi
}
xfail_on_veth()
{
local dev=$1; shift

View File

@ -40,6 +40,7 @@ ALL_TESTS="
ping_ipv4
ping_ipv6
multipath_test
multipath16_test
ping_ipv4_blackhole
ping_ipv6_blackhole
nh_stats_test_v4
@ -226,9 +227,11 @@ routing_nh_obj()
multipath4_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift
local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13
@ -242,7 +245,8 @@ multipath4_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)
ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
@ -258,9 +262,11 @@ multipath4_test()
multipath6_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift
local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13
@ -275,7 +281,8 @@ multipath6_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)
$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
@ -313,6 +320,23 @@ multipath_test()
multipath6_test "Weighted MP 11:45" 11 45
}
multipath16_test()
{
check_nhgw16 104 || return
log_info "Running 16-bit IPv4 multipath tests"
multipath4_test "65535:65535" 65535 65535
multipath4_test "128:512" 128 512
omit_on_slow \
multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
log_info "Running 16-bit IPv6 multipath tests"
multipath6_test "65535:65535" 65535 65535
multipath6_test "128:512" 128 512
omit_on_slow \
multipath6_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
}
ping_ipv4_blackhole()
{
RET=0

View File

@ -117,3 +117,16 @@ __nh_stats_test_v6()
$MZ -6 $h1 -A 2001:db8:1::2 -B 2001:db8:2::2
sysctl_restore net.ipv6.fib_multipath_hash_policy
}
check_nhgw16()
{
local nhid=$1; shift
ip nexthop replace id 9999 group "$nhid,65535" &>/dev/null
if (( $? )); then
log_test_skip "16-bit multipath tests" \
"iproute2 or the kernel do not support 16-bit next hop weights"
return 1
fi
ip nexthop del id 9999 ||:
}

View File

@ -40,6 +40,7 @@ ALL_TESTS="
ping_ipv4
ping_ipv6
multipath_test
multipath16_test
nh_stats_test_v4
nh_stats_test_v6
"
@ -228,9 +229,11 @@ routing_nh_obj()
multipath4_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift
local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13
@ -243,7 +246,8 @@ multipath4_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)
ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
@ -258,9 +262,11 @@ multipath4_test()
multipath6_l4_test()
{
local desc="$1"
local weight_rp12=$2
local weight_rp13=$3
local desc=$1; shift
local weight_rp12=$1; shift
local weight_rp13=$1; shift
local ports=${1-sp=1024,dp=0-32768}; shift
local t0_rp12 t0_rp13 t1_rp12 t1_rp13
local packets_rp12 packets_rp13
@ -273,7 +279,8 @@ multipath6_l4_test()
t0_rp13=$(link_stats_tx_packets_get $rp13)
$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
-d $MZ_DELAY -t udp "$ports"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
@ -371,6 +378,41 @@ multipath_test()
ip nexthop replace id 106 group 104,1/105,1 type resilient
}
multipath16_test()
{
check_nhgw16 104 || return
log_info "Running 16-bit IPv4 multipath tests"
ip nexthop replace id 103 group 101/102 type resilient idle_timer 0
ip nexthop replace id 103 group 101,65535/102,65535 type resilient
multipath4_test "65535:65535" 65535 65535
ip nexthop replace id 103 group 101,128/102,512 type resilient
multipath4_test "128:512" 128 512
ip nexthop replace id 103 group 101,255/102,65535 type resilient
omit_on_slow \
multipath4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
ip nexthop replace id 103 group 101,1/102,1 type resilient
log_info "Running 16-bit IPv6 L4 hash multipath tests"
ip nexthop replace id 106 group 104/105 type resilient idle_timer 0
ip nexthop replace id 106 group 104,65535/105,65535 type resilient
multipath6_l4_test "65535:65535" 65535 65535
ip nexthop replace id 106 group 104,128/105,512 type resilient
multipath6_l4_test "128:512" 128 512
ip nexthop replace id 106 group 104,255/105,65535 type resilient
omit_on_slow \
multipath6_l4_test "255:65535" 255 65535 sp=1024-1026,dp=0-65535
ip nexthop replace id 106 group 104,1/105,1 type resilient
}
nh_stats_test_v4()
{
__nh_stats_test_v4 resilient

View File

@ -180,6 +180,7 @@ multipath4_test()
ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)
@ -217,6 +218,7 @@ multipath6_test()
$MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
-d $MZ_DELAY -t udp "sp=1024,dp=0-32768"
sleep 1
t1_rp12=$(link_stats_tx_packets_get $rp12)
t1_rp13=$(link_stats_tx_packets_get $rp13)