Re: [libvirt] [Fwd: [PATCH v9] add 802.1Qbh and 802.1Qbg handling]

On Thursday 27 May 2010, Stefan Berger wrote:
+static int +getPortProfileStatus(struct nlattr **tb, int32_t vf, uint16_t *status) +{ + int rc = 1; + const char *msg = NULL; + struct nlattr *tb2[IFLA_VF_PORT_MAX + 1], + *tb3[IFLA_PORT_MAX+1]; + + if (vf == PORT_SELF_VF) { + if (tb[IFLA_PORT_SELF]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } else { + if (tb[IFLA_VF_PORTS]) { + if (nla_parse_nested(tb2, IFLA_VF_PORT_MAX, tb[IFLA_VF_PORTS], + ifla_vf_ports_policy)) { + msg = _("error parsing nested IFLA_VF_PORTS part"); + goto err_exit; + } + if (tb2[IFLA_VF_PORT]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb2[IFLA_VF_PORT], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } + }
There may be multiple IFLA_VF_PORT attributes in the IFLA_VF_PORTS list, so you cannot do nla_parse_nested. I think this should be nla_for_each_attr instead, and compare the uuid to the one you expect.
+ memcpy(ifla_vf_mac.mac, macaddr, 6); + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VFINFO_LIST, + NULL, 0); + if (!rta || + !(vfinfolist = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_INFO, + NULL, 0); + if (!rta || + !(vfinfo = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_MAC, + &ifla_vf_mac, sizeof(ifla_vf_mac)); + if (!rta || !nlAppend(nlm, sizeof(nlmsgbuf), rtattbuf, rta->rta_len)) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_VLAN, + &ifla_vf_vlan, sizeof(ifla_vf_vlan)); + + if (!rta || !nlAppend(nlm, sizeof(nlmsgbuf), rtattbuf, rta->rta_len)) + goto buffer_too_small; + + vfinfo->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfinfo; + + vfinfolist->rta_len = (char *)nlm + nlm->nlmsg_len - + (char *)vfinfolist; + }
This part looks good now.
+ if (vf == PORT_SELF_VF) { + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_PORT_SELF, NULL, 0); + } else { + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_PORTS, NULL, 0); + if (!rta || + !(vfports = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + /* begin nesting vfports */ + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_PORT, NULL, 0); + }
But this still goes down the IFLA_PORT_SELF route because you pass PORT_SELF_VF even for nltarget_kernel==false, where it makes no sense. Maybe make the above if (vf == PORT_SELF_VF && nltarget_kernel)
+ if (vf != PORT_SELF_VF) { + /* end nesting of vfports */ + vfports->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfports; + }
Here too.
+ if (nltarget_kernel) { + if (nlComm(nlm, &recvbuf, &recvbuflen) < 0) + return -1; + } else { + if (nlCommWaitSuccess(nlm, RTMGRP_LINK, &recvbuf, &recvbuflen, + 5 * MICROSEC_PER_SEC) < 0) + return -1; + }
I don't understand this part yet. Do we need this difference?
+ while (--repeats >= 0) { + rc = link_dump(nltarget_kernel, NULL, ifindex, tb, &recvbuf); + if (rc) + goto err_exit; + rc = getPortProfileStatus(tb, vf, &status); + if (rc == 0) { + if (status == PORT_PROFILE_RESPONSE_SUCCESS || + status == PORT_VDP_RESPONSE_SUCCESS) { + break; + } else if (status == PORT_PROFILE_RESPONSE_INPROGRESS) { + // keep trying... + } else { + virReportSystemError(EINVAL, + _("error %d during port-profile setlink on ifindex %d"), + status, ifindex); + rc = 1; + break; + }
Hmm, we seem to be missing an INPROGRESS status for Qbg. Any suggestions what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
+ rc = doPortProfileOpCommon(nltarget_kernel, + physdev_ifname, physdev_ifindex, + macaddr, + vlanid, + NULL, + &portVsi, + virtPort->u.virtPort8021Qbg.instanceID, + NULL, + PORT_SELF_VF, + op);
This is where we pass PORT_SELF_VF together with nltarget_kernel=false, as mentioned above. Arnd

On Thu, 2010-05-27 at 13:55 +0200, Arnd Bergmann wrote:
On Thursday 27 May 2010, Stefan Berger wrote:
+static int +getPortProfileStatus(struct nlattr **tb, int32_t vf, uint16_t *status) +{ + int rc = 1; + const char *msg = NULL; + struct nlattr *tb2[IFLA_VF_PORT_MAX + 1], + *tb3[IFLA_PORT_MAX+1]; + + if (vf == PORT_SELF_VF) { + if (tb[IFLA_PORT_SELF]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } else { + if (tb[IFLA_VF_PORTS]) { + if (nla_parse_nested(tb2, IFLA_VF_PORT_MAX, tb[IFLA_VF_PORTS], + ifla_vf_ports_policy)) { + msg = _("error parsing nested IFLA_VF_PORTS part"); + goto err_exit; + } + if (tb2[IFLA_VF_PORT]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb2[IFLA_VF_PORT], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } + }
There may be multiple IFLA_VF_PORT attributes in the IFLA_VF_PORTS list, so you cannot do nla_parse_nested. I think this should be nla_for_each_attr instead, and compare the uuid to the one you expect.
Ok. I'll change that for v10.
+ memcpy(ifla_vf_mac.mac, macaddr, 6); + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VFINFO_LIST, + NULL, 0); + if (!rta || + !(vfinfolist = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_INFO, + NULL, 0); + if (!rta || + !(vfinfo = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_MAC, + &ifla_vf_mac, sizeof(ifla_vf_mac)); + if (!rta || !nlAppend(nlm, sizeof(nlmsgbuf), rtattbuf, rta->rta_len)) + goto buffer_too_small; + + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_VLAN, + &ifla_vf_vlan, sizeof(ifla_vf_vlan)); + + if (!rta || !nlAppend(nlm, sizeof(nlmsgbuf), rtattbuf, rta->rta_len)) + goto buffer_too_small; + + vfinfo->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfinfo; + + vfinfolist->rta_len = (char *)nlm + nlm->nlmsg_len - + (char *)vfinfolist; + }
This part looks good now.
+ if (vf == PORT_SELF_VF) { + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_PORT_SELF, NULL, 0); + } else { + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_PORTS, NULL, 0); + if (!rta || + !(vfports = nlAppend(nlm, sizeof(nlmsgbuf), + rtattbuf, rta->rta_len))) + goto buffer_too_small; + + /* begin nesting vfports */ + rta = rtattrCreate(rtattbuf, sizeof(rtattbuf), IFLA_VF_PORT, NULL, 0); + }
But this still goes down the IFLA_PORT_SELF route because you pass PORT_SELF_VF even for nltarget_kernel==false, where it makes no sense. Maybe make the above
if (vf == PORT_SELF_VF && nltarget_kernel)
So I'll pass vf = 0 and be done with it? I don't find this check for nltarget_kernel particularly intuitive. Why should the function create different messages for a kernel driver versus a user space daemon? Is this a technology specific thing that would prevent this type of message from being sent. Obviously it does work for Scott's 802.1Qbh part.
+ if (vf != PORT_SELF_VF) { + /* end nesting of vfports */ + vfports->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfports; + }
Here too.
+ if (nltarget_kernel) { + if (nlComm(nlm, &recvbuf, &recvbuflen) < 0) + return -1; + } else { + if (nlCommWaitSuccess(nlm, RTMGRP_LINK, &recvbuf, &recvbuflen, + 5 * MICROSEC_PER_SEC) < 0) + return -1; + }
I don't understand this part yet. Do we need this difference?
From my experience with experiments that I have made I can only say that we do need it. The sending part is a little different and the receiveing part needs to filter out noise and only pick the response to the request that was previously sent.
+ while (--repeats >= 0) { + rc = link_dump(nltarget_kernel, NULL, ifindex, tb, &recvbuf); + if (rc) + goto err_exit; + rc = getPortProfileStatus(tb, vf, &status); + if (rc == 0) { + if (status == PORT_PROFILE_RESPONSE_SUCCESS || + status == PORT_VDP_RESPONSE_SUCCESS) { + break; + } else if (status == PORT_PROFILE_RESPONSE_INPROGRESS) { + // keep trying... + } else { + virReportSystemError(EINVAL, + _("error %d during port-profile setlink on ifindex %d"), + status, ifindex); + rc = 1; + break; + }
Hmm, we seem to be missing an INPROGRESS status for Qbg. Any suggestions
You mean that's not defined in the (pre-)standard?
what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
We want to be able to signal failure in case the switch setup failed so that we don't have a malfunctioning network interface where the user then doesn't know what to debug. In case of failure we simply wouldn't start the VM or hotplug the interface and return an indication that something went wrong during the negotiation with the switch.
+ rc = doPortProfileOpCommon(nltarget_kernel, + physdev_ifname, physdev_ifindex, + macaddr, + vlanid, + NULL, + &portVsi, + virtPort->u.virtPort8021Qbg.instanceID, + NULL, + PORT_SELF_VF, + op);
This is where we pass PORT_SELF_VF together with nltarget_kernel=false, as mentioned above.
nltarget_kernel is in fact false here. So now if I change the PORT_SELF_VF to '0', then I suppose it will be ok? The vf in the netlink message to lldpad will then show 0 rather than PORT_SELF_VF (-1). I guess 0 wouldn't have the meaning of the 0-st virtual function, but counting would start at 1? Stefan
Arnd

On Thursday 27 May 2010, Stefan Berger wrote:
But this still goes down the IFLA_PORT_SELF route because you pass PORT_SELF_VF even for nltarget_kernel==false, where it makes no sense. Maybe make the above
if (vf == PORT_SELF_VF && nltarget_kernel)
So I'll pass vf = 0 and be done with it? I don't find this check for nltarget_kernel particularly intuitive. Why should the function create different messages for a kernel driver versus a user space daemon? Is this a technology specific thing that would prevent this type of message from being sent. Obviously it does work for Scott's 802.1Qbh part.
The difference is that enic (and likely any other driver implementing Qbg or Qbh in the firmware) has access to the underlying data channel. IFLA_PORT_SELF essentially means that we ask a logical device to associate itself with the switch, which is very unlike the case where we ask a master device to associate a slave to the switch. Neither the PORT_SELF nor the VF_PORTS list are exactly what we really want, but the VF_PORTS stuff is closer. PORT_SELF does not work to start with, because it does not allow to query information about more than one VF. VF_PORTS is a bit fishy because we don't actually have VFs but an arbitrary number of software defined ports, but I think it's close enough. We could also invent another list for software ports that are identified by uuid instead of VF, but that would require defining attributes in the kernel that are only used in user space.
+ if (vf != PORT_SELF_VF) { + /* end nesting of vfports */ + vfports->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfports; + }
Here too.
+ if (nltarget_kernel) { + if (nlComm(nlm, &recvbuf, &recvbuflen) < 0) + return -1; + } else { + if (nlCommWaitSuccess(nlm, RTMGRP_LINK, &recvbuf, &recvbuflen, + 5 * MICROSEC_PER_SEC) < 0) + return -1; + }
I don't understand this part yet. Do we need this difference?
From my experience with experiments that I have made I can only say that we do need it. The sending part is a little different and the receiveing part needs to filter out noise and only pick the response to the request that was previously sent.
I'm even more confused now. Why should the response be different from the response we get from the kernel? What's different on the sending side other than the PID? Also, what is the RTMGRP_LINK argument used for? I thought we don't need multicast any more because we don't target kernel and lldpad in the same message but only one of them.
+ while (--repeats >= 0) { + rc = link_dump(nltarget_kernel, NULL, ifindex, tb, &recvbuf); + if (rc) + goto err_exit; + rc = getPortProfileStatus(tb, vf, &status); + if (rc == 0) { + if (status == PORT_PROFILE_RESPONSE_SUCCESS || + status == PORT_VDP_RESPONSE_SUCCESS) { + break; + } else if (status == PORT_PROFILE_RESPONSE_INPROGRESS) { + // keep trying... + } else { + virReportSystemError(EINVAL, + _("error %d during port-profile setlink on ifindex %d"), + status, ifindex); + rc = 1; + break; + }
Hmm, we seem to be missing an INPROGRESS status for Qbg. Any suggestions
You mean that's not defined in the (pre-)standard?
Yes, the Qbg wire protocol has no need for this, because the messages are only sent after the state has changed, we never see a VDP message with an incomplete status in there, so there is no need to specify it in Qbg, but we need something in the netlink protocol.
what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
We want to be able to signal failure in case the switch setup failed so that we don't have a malfunctioning network interface where the user then doesn't know what to debug. In case of failure we simply wouldn't start the VM or hotplug the interface and return an indication that something went wrong during the negotiation with the switch.
So leaving out IFLA_PORT_RESPONSE would work for that, right? The cases we need to care about are: getPortProfileStatus returns without IFLA_PORT_RESPONSE -> keep trying getPortProfileStatus returns success from IFLA_PORT_RESPONSE -> done getPortProfileStatus returns failure from IFLA_PORT_RESPONSE -> give up
+ rc = doPortProfileOpCommon(nltarget_kernel, + physdev_ifname, physdev_ifindex, + macaddr, + vlanid, + NULL, + &portVsi, + virtPort->u.virtPort8021Qbg.instanceID, + NULL, + PORT_SELF_VF, + op);
This is where we pass PORT_SELF_VF together with nltarget_kernel=false, as mentioned above.
nltarget_kernel is in fact false here. So now if I change the PORT_SELF_VF to '0', then I suppose it will be ok? The vf in the netlink message to lldpad will then show 0 rather than PORT_SELF_VF (-1). I guess 0 wouldn't have the meaning of the 0-st virtual function, but counting would start at 1?
That's something we need to define now. You already put vf=0 into the IFLA_VF_MAC and IFLA_VF_VLAN attributes, and you leave out the IFLA_PORT_VF attribute for Qbg, which is probably the best approximation we can get. I think we should just mandate this for the request, and let lldpad decide on the fake vf number, which it returns to getPortProfileStatus. Arnd

On Thu, 2010-05-27 at 15:37 +0200, Arnd Bergmann wrote:
On Thursday 27 May 2010, Stefan Berger wrote:
But this still goes down the IFLA_PORT_SELF route because you pass PORT_SELF_VF even for nltarget_kernel==false, where it makes no sense. Maybe make the above
if (vf == PORT_SELF_VF && nltarget_kernel)
So I'll pass vf = 0 and be done with it? I don't find this check for nltarget_kernel particularly intuitive. Why should the function create different messages for a kernel driver versus a user space daemon? Is this a technology specific thing that would prevent this type of message from being sent. Obviously it does work for Scott's 802.1Qbh part.
The difference is that enic (and likely any other driver implementing Qbg or Qbh in the firmware) has access to the underlying data channel. IFLA_PORT_SELF essentially means that we ask a logical device to associate itself with the switch, which is very unlike the case where we ask a master device to associate a slave to the switch.
Neither the PORT_SELF nor the VF_PORTS list are exactly what we really want, but the VF_PORTS stuff is closer. PORT_SELF does not work to start with, because it does not allow to query information about more than one VF. VF_PORTS is a bit fishy because we don't actually have VFs but an arbitrary number of software defined ports, but I think it's close enough. We could also invent another list for software ports that are identified by uuid instead of VF, but that would require defining attributes in the kernel that are only used in user space.
Time is fleeting ...
+ if (vf != PORT_SELF_VF) { + /* end nesting of vfports */ + vfports->rta_len = (char *)nlm + nlm->nlmsg_len - (char *)vfports; + }
Here too.
+ if (nltarget_kernel) { + if (nlComm(nlm, &recvbuf, &recvbuflen) < 0) + return -1; + } else { + if (nlCommWaitSuccess(nlm, RTMGRP_LINK, &recvbuf, &recvbuflen, + 5 * MICROSEC_PER_SEC) < 0) + return -1; + }
I don't understand this part yet. Do we need this difference?
From my experience with experiments that I have made I can only say that we do need it. The sending part is a little different and the receiveing part needs to filter out noise and only pick the response to the request that was previously sent.
I'm even more confused now. Why should the response be different from the response we get from the kernel? What's different on the sending side other than the PID? Also, what is the RTMGRP_LINK argument used for? I thought we don't need multicast any more because we don't target kernel and lldpad in the same message but only one of them.
Fact is that if I set RTMGRP_LINK to 0 here on libvirt only side, the dummy server doesn't get a message. If I set it to 0 on both side, the dummy server also doesn't get a message. I think it's necessary for user-space-to-user-space communication, but I am also only learning about these types of sockets while I 'go'.
+ while (--repeats >= 0) { + rc = link_dump(nltarget_kernel, NULL, ifindex, tb, &recvbuf); + if (rc) + goto err_exit; + rc = getPortProfileStatus(tb, vf, &status); + if (rc == 0) { + if (status == PORT_PROFILE_RESPONSE_SUCCESS || + status == PORT_VDP_RESPONSE_SUCCESS) { + break; + } else if (status == PORT_PROFILE_RESPONSE_INPROGRESS) { + // keep trying... + } else { + virReportSystemError(EINVAL, + _("error %d during port-profile setlink on ifindex %d"), + status, ifindex); + rc = 1; + break; + }
Hmm, we seem to be missing an INPROGRESS status for Qbg. Any suggestions
You mean that's not defined in the (pre-)standard?
Yes, the Qbg wire protocol has no need for this, because the messages are only sent after the state has changed, we never see a VDP message with an incomplete status in there, so there is no need to specify it in Qbg, but we need something in the netlink protocol.
Yes, I would suggest to mimic 802.1Qbh in this case.
what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
We want to be able to signal failure in case the switch setup failed so that we don't have a malfunctioning network interface where the user then doesn't know what to debug. In case of failure we simply wouldn't start the VM or hotplug the interface and return an indication that something went wrong during the negotiation with the switch.
So leaving out IFLA_PORT_RESPONSE would work for that, right?
We need to poll for the current status. At least in 802.1Qbh case using an RTM_GETLINK type of message. Don't know how we could leave out IFLA_PORT_RESPONSE...
The cases we need to care about are:
getPortProfileStatus returns without IFLA_PORT_RESPONSE -> keep trying
Why not just mimic 802.1Qbh and return an INPROGRESS status for as long as it's not success ?
getPortProfileStatus returns success from IFLA_PORT_RESPONSE -> done getPortProfileStatus returns failure from IFLA_PORT_RESPONSE -> give up
+ rc = doPortProfileOpCommon(nltarget_kernel, + physdev_ifname, physdev_ifindex, + macaddr, + vlanid, + NULL, + &portVsi, + virtPort->u.virtPort8021Qbg.instanceID, + NULL, + PORT_SELF_VF, + op);
This is where we pass PORT_SELF_VF together with nltarget_kernel=false, as mentioned above.
nltarget_kernel is in fact false here. So now if I change the PORT_SELF_VF to '0', then I suppose it will be ok? The vf in the netlink message to lldpad will then show 0 rather than PORT_SELF_VF (-1). I guess 0 wouldn't have the meaning of the 0-st virtual function, but counting would start at 1?
Actually what I said above was quite right. PORT_SELF_VF (-1) will not be shown in the request, but in the -1 case the request simply won't have the IFLA_PORT_VF attribute.
That's something we need to define now. You already put vf=0 into the IFLA_VF_MAC and IFLA_VF_VLAN attributes, and you leave out the IFLA_PORT_VF
No, I just copy whatever vf holds in there, which also may be VF_PORT_SELF.
attribute for Qbg, which is probably the best approximation we can get. I think we should just mandate this for the request, and let lldpad decide on the fake vf number, which it returns to getPortProfileStatus.
Just be a bit more clear about the exact parameter to pass. Stefan
Arnd

On Thursday 27 May 2010, Stefan Berger wrote:
On Thu, 2010-05-27 at 15:37 +0200, Arnd Bergmann wrote:
I'm even more confused now. Why should the response be different from the response we get from the kernel? What's different on the sending side other than the PID? Also, what is the RTMGRP_LINK argument used for? I thought we don't need multicast any more because we don't target kernel and lldpad in the same message but only one of them.
Fact is that if I set RTMGRP_LINK to 0 here on libvirt only side, the dummy server doesn't get a message. If I set it to 0 on both side, the dummy server also doesn't get a message. I think it's necessary for user-space-to-user-space communication, but I am also only learning about these types of sockets while I 'go'.
AFAICT, RTMGRP_LINK makes it a multicast message, which means that anyone can receive it, which a unicast message will only be sent to the task that has a matching pid.
You mean that's not defined in the (pre-)standard?
Yes, the Qbg wire protocol has no need for this, because the messages are only sent after the state has changed, we never see a VDP message with an incomplete status in there, so there is no need to specify it in Qbg, but we need something in the netlink protocol.
Yes, I would suggest to mimic 802.1Qbh in this case.
Like what? The first 256 result numbers in the netlink protocol are meant to be the same as the ones in the wire protocol, the next 256 are for the Qbh protocol. We could of course define yet another range for the inprogress result and possibly future extensions, starting at 512 if you insist on requiring the IFLA_PORT_RESPONSE. I originally wanted to defer the response until we hear back from the switch, but that may take a really long time (over a minute with all the VDP timeouts). That would be cleaner IMHO, but you may not want to wait that long in libvirt.
what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
We want to be able to signal failure in case the switch setup failed so that we don't have a malfunctioning network interface where the user then doesn't know what to debug. In case of failure we simply wouldn't start the VM or hotplug the interface and return an indication that something went wrong during the negotiation with the switch.
So leaving out IFLA_PORT_RESPONSE would work for that, right?
We need to poll for the current status. At least in 802.1Qbh case using an RTM_GETLINK type of message. Don't know how we could leave out IFLA_PORT_RESPONSE...
The response can simply contain all attributes of the request but not contain an IFLA_PORT_RESPONSE attribute inside of the IFLA_VF_PORT.
The cases we need to care about are:
getPortProfileStatus returns without IFLA_PORT_RESPONSE -> keep trying
Why not just mimic 802.1Qbh and return an INPROGRESS status for as long as it's not success ?
because we have not defined such a value yet. We can either define one now and change the if_link.h file or define INPROGRESS to be the absence of a IFLA_PORT_RESPONSE attribute.
getPortProfileStatus returns success from IFLA_PORT_RESPONSE -> done getPortProfileStatus returns failure from IFLA_PORT_RESPONSE -> give up
+ rc = doPortProfileOpCommon(nltarget_kernel, + physdev_ifname, physdev_ifindex, + macaddr, + vlanid, + NULL, + &portVsi, + virtPort->u.virtPort8021Qbg.instanceID, + NULL, + PORT_SELF_VF, + op);
This is where we pass PORT_SELF_VF together with nltarget_kernel=false, as mentioned above.
nltarget_kernel is in fact false here. So now if I change the PORT_SELF_VF to '0', then I suppose it will be ok? The vf in the netlink message to lldpad will then show 0 rather than PORT_SELF_VF (-1). I guess 0 wouldn't have the meaning of the 0-st virtual function, but counting would start at 1?
Actually what I said above was quite right. PORT_SELF_VF (-1) will not be shown in the request, but in the -1 case the request simply won't have the IFLA_PORT_VF attribute.
Right.
That's something we need to define now. You already put vf=0 into the IFLA_VF_MAC and IFLA_VF_VLAN attributes, and you leave out the IFLA_PORT_VF
No, I just copy whatever vf holds in there, which also may be VF_PORT_SELF.
Ah, that should be fine as well.
attribute for Qbg, which is probably the best approximation we can get. I think we should just mandate this for the request, and let lldpad decide on the fake vf number, which it returns to getPortProfileStatus.
Just be a bit more clear about the exact parameter to pass.
I'd suggest doing a request with vfinfo_attr[IFLA_VF_MAC] = { .vf = -1u, .mac = OUR_MAC }; vfinfo_attr[IFLA_VF_VLAN] = { .vf = -1u, vlan = OUT_VLAN }; vf_port_attr[IFLA_PORT_VF] = NULL; /* don't send this at all */ vf_port_attr[IFLA_PORT_RESPONSE] = NULL; /* also don't send this */ vf_port_attr[IFLA_PORT_*] = WHATEVER_WE_NEED; and waiting for the receiver to choose a free VF and ack this message with vfinfo_attr[IFLA_VF_MAC] = { .vf = CHOSEN_VF, .mac = OUR_MAC }; vfinfo_attr[IFLA_VF_VLAN] = { .vf = CHOSEN_VF, vlan = OUT_VLAN }; vf_port_attr[IFLA_PORT_VF] = { CHOSEN_VF }; /* to identify the vf_port_attr */ vf_port_attr[IFLA_PORT_RESPONSE] = NULL; /* no response yet */ vf_port_attr[IFLA_PORT_RESPONSE] = PORT_RESPONSE_INPROGRESS; /* alternative, if you prefer */ vf_port_attr[IFLA_PORT_*] = WHATEVER_WE_REQUESTED; I don't care which of the two responses we put in there, we just need to make a decision here since we forgot to define one before. When waiting for the request to complete, we can then check for the specific vf returned from lldpad (or the kernel, if we ever need to go there). Arnd

On Thu, 2010-05-27 at 17:43 +0200, Arnd Bergmann wrote:
On Thursday 27 May 2010, Stefan Berger wrote:
On Thu, 2010-05-27 at 15:37 +0200, Arnd Bergmann wrote:
I'm even more confused now. Why should the response be different from the response we get from the kernel? What's different on the sending side other than the PID? Also, what is the RTMGRP_LINK argument used for? I thought we don't need multicast any more because we don't target kernel and lldpad in the same message but only one of them.
Fact is that if I set RTMGRP_LINK to 0 here on libvirt only side, the dummy server doesn't get a message. If I set it to 0 on both side, the dummy server also doesn't get a message. I think it's necessary for user-space-to-user-space communication, but I am also only learning about these types of sockets while I 'go'.
AFAICT, RTMGRP_LINK makes it a multicast message, which means that anyone can receive it, which a unicast message will only be sent to the task that has a matching pid.
Right. So, yes, if I read the pid of lldpad from /var/run/lldpad.pid then I can direct the netlink message and nlComm() becomes common code for talking to the kernel or another userspace process.
You mean that's not defined in the (pre-)standard?
Yes, the Qbg wire protocol has no need for this, because the messages are only sent after the state has changed, we never see a VDP message with an incomplete status in there, so there is no need to specify it in Qbg, but we need something in the netlink protocol.
Yes, I would suggest to mimic 802.1Qbh in this case.
Like what? The first 256 result numbers in the netlink protocol are meant to be the same as the ones in the wire protocol, the next 256 are for the Qbh protocol.
I did not look carefully and differentiate between error codes of 802.1Qbg or .1Qbh but took them as shared between both technologies.
We could of course define yet another range for the inprogress result and possibly future extensions, starting at 512 if you insist on requiring the IFLA_PORT_RESPONSE.
No, I'll fake it for now with the INPROGRESS value of 802.Qbh.
I originally wanted to defer the response until we hear back from the switch, but that may take a really long time (over a minute with all the VDP timeouts). That would be cleaner IMHO, but you may not want to wait that long in libvirt.
802.1Qbh is polling for max. 10 seconds. So we need to tune this to a even higher value for 802.1Qbg ?
what we should return there? Should we possibly just leave out IFLA_PORT_RESPONSE in order to signal INPROGRESS, as in not clear yet?
We want to be able to signal failure in case the switch setup failed so that we don't have a malfunctioning network interface where the user then doesn't know what to debug. In case of failure we simply wouldn't start the VM or hotplug the interface and return an indication that something went wrong during the negotiation with the switch.
So leaving out IFLA_PORT_RESPONSE would work for that, right?
We need to poll for the current status. At least in 802.1Qbh case using an RTM_GETLINK type of message. Don't know how we could leave out IFLA_PORT_RESPONSE...
The response can simply contain all attributes of the request but not contain an IFLA_PORT_RESPONSE attribute inside of the IFLA_VF_PORT.
Yes, understood now.
attribute for Qbg, which is probably the best approximation we can get. I think we should just mandate this for the request, and let lldpad decide on the fake vf number, which it returns to getPortProfileStatus.
Just be a bit more clear about the exact parameter to pass.
I'd suggest doing a request with
vfinfo_attr[IFLA_VF_MAC] = { .vf = -1u, .mac = OUR_MAC }; vfinfo_attr[IFLA_VF_VLAN] = { .vf = -1u, vlan = OUT_VLAN }; vf_port_attr[IFLA_PORT_VF] = NULL; /* don't send this at all */ vf_port_attr[IFLA_PORT_RESPONSE] = NULL; /* also don't send this */ vf_port_attr[IFLA_PORT_*] = WHATEVER_WE_NEED;
and waiting for the receiver to choose a free VF and ack this message with
I think the typical ACK in a netlink message on a RTM_SETLINK is just an error code indication of whether the message was successfully process or now. The subsequent RTM_GETLINK would then return what you show here.
vfinfo_attr[IFLA_VF_MAC] = { .vf = CHOSEN_VF, .mac = OUR_MAC }; vfinfo_attr[IFLA_VF_VLAN] = { .vf = CHOSEN_VF, vlan = OUT_VLAN }; vf_port_attr[IFLA_PORT_VF] = { CHOSEN_VF }; /* to identify the vf_port_attr */ vf_port_attr[IFLA_PORT_RESPONSE] = NULL; /* no response yet */ vf_port_attr[IFLA_PORT_RESPONSE] = PORT_RESPONSE_INPROGRESS; /* alternative, if you prefer */ vf_port_attr[IFLA_PORT_*] = WHATEVER_WE_REQUESTED;
I don't care which of the two responses we put in there, we just need to make a decision here since we forgot to define one before.
I don't do much with the returned vf so for now I don't read it.
When waiting for the request to complete, we can then check for the specific vf returned from lldpad (or the kernel, if we ever need to go there).
I have a v11 that does hopefully most of the above now correctly. I will send it to you and Scott privately first before generating more noise on the list. Stefan
Arnd

On Thursday 27 May 2010, Stefan Berger wrote:
On Thu, 2010-05-27 at 17:43 +0200, Arnd Bergmann wrote:
Yes, the Qbg wire protocol has no need for this, because the messages are only sent after the state has changed, we never see a VDP message with an incomplete status in there, so there is no need to specify it in Qbg, but we need something in the netlink protocol.
Yes, I would suggest to mimic 802.1Qbh in this case.
Like what? The first 256 result numbers in the netlink protocol are meant to be the same as the ones in the wire protocol, the next 256 are for the Qbh protocol.
I did not look carefully and differentiate between error codes of 802.1Qbg or .1Qbh but took them as shared between both technologies.
We discussed several options here but in the end went for separate ranges.
We could of course define yet another range for the inprogress result and possibly future extensions, starting at 512 if you insist on requiring the IFLA_PORT_RESPONSE.
No, I'll fake it for now with the INPROGRESS value of 802.Qbh.
ok. This could probably use a comment of some sorts in the code then ;-)
I originally wanted to defer the response until we hear back from the switch, but that may take a really long time (over a minute with all the VDP timeouts). That would be cleaner IMHO, but you may not want to wait that long in libvirt.
802.1Qbh is polling for max. 10 seconds. So we need to tune this to a even higher value for 802.1Qbg ?
I fear so, yes. The actual maximum timeout get negotiated between the switch and the host, but since there are retries on both the lower ECP and the upper VDP layer, it can take a long time. Maybe Jens knows a reasonable default for you.
I'd suggest doing a request with
vfinfo_attr[IFLA_VF_MAC] = { .vf = -1u, .mac = OUR_MAC }; vfinfo_attr[IFLA_VF_VLAN] = { .vf = -1u, vlan = OUT_VLAN }; vf_port_attr[IFLA_PORT_VF] = NULL; /* don't send this at all */ vf_port_attr[IFLA_PORT_RESPONSE] = NULL; /* also don't send this */ vf_port_attr[IFLA_PORT_*] = WHATEVER_WE_NEED;
and waiting for the receiver to choose a free VF and ack this message with
I think the typical ACK in a netlink message on a RTM_SETLINK is just an error code indication of whether the message was successfully process or now. The subsequent RTM_GETLINK would then return what you show here.
Ok, I didn't know that. I assumed that the response was always the full message that got sent as a request, plus anything filled in that is needed (like the VF number here). Arnd

On Thu, 2010-05-27 at 08:47 -0400, Stefan Berger wrote:
On Thu, 2010-05-27 at 13:55 +0200, Arnd Bergmann wrote:
On Thursday 27 May 2010, Stefan Berger wrote:
+static int +getPortProfileStatus(struct nlattr **tb, int32_t vf, uint16_t *status) +{ + int rc = 1; + const char *msg = NULL; + struct nlattr *tb2[IFLA_VF_PORT_MAX + 1], + *tb3[IFLA_PORT_MAX+1]; + + if (vf == PORT_SELF_VF) { + if (tb[IFLA_PORT_SELF]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } else { + if (tb[IFLA_VF_PORTS]) { + if (nla_parse_nested(tb2, IFLA_VF_PORT_MAX, tb[IFLA_VF_PORTS], + ifla_vf_ports_policy)) { + msg = _("error parsing nested IFLA_VF_PORTS part"); + goto err_exit; + } + if (tb2[IFLA_VF_PORT]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb2[IFLA_VF_PORT], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } + }
There may be multiple IFLA_VF_PORT attributes in the IFLA_VF_PORTS list, so you cannot do nla_parse_nested. I think this should be nla_for_each_attr instead, and compare the uuid to the one you expect.
Ok. I'll change that for v10.
Actually I believe calling nla_for_each_nested(attr, tb[IFLA_VF_PORTS], ...) } is even better than calling nla_for_each_attr directly. (You can check how Scott coded the kernel function do_setlink in his recent kernel patch "Add netlink support for virtual port" for a similar example) /Christian

On Thu, 2010-05-27 at 16:50 +0200, Christian Benvenuti (benve) wrote:
On Thu, 2010-05-27 at 08:47 -0400, Stefan Berger wrote:
On Thu, 2010-05-27 at 13:55 +0200, Arnd Bergmann wrote:
On Thursday 27 May 2010, Stefan Berger wrote:
+static int +getPortProfileStatus(struct nlattr **tb, int32_t vf, uint16_t *status) +{ + int rc = 1; + const char *msg = NULL; + struct nlattr *tb2[IFLA_VF_PORT_MAX + 1], + *tb3[IFLA_PORT_MAX+1]; + + if (vf == PORT_SELF_VF) { + if (tb[IFLA_PORT_SELF]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } else { + if (tb[IFLA_VF_PORTS]) { + if (nla_parse_nested(tb2, IFLA_VF_PORT_MAX, tb[IFLA_VF_PORTS], + ifla_vf_ports_policy)) { + msg = _("error parsing nested IFLA_VF_PORTS part"); + goto err_exit; + } + if (tb2[IFLA_VF_PORT]) { + if (nla_parse_nested(tb3, IFLA_PORT_MAX, tb2[IFLA_VF_PORT], + ifla_port_policy)) { + msg = _("error parsing nested IFLA_VF_PORT part"); + goto err_exit; + } + } + } + }
There may be multiple IFLA_VF_PORT attributes in the IFLA_VF_PORTS list, so you cannot do nla_parse_nested. I think this should be nla_for_each_attr instead, and compare the uuid to the one you expect.
Ok. I'll change that for v10.
Actually I believe calling
nla_for_each_nested(attr, tb[IFLA_VF_PORTS], ...) }
is even better than calling nla_for_each_attr directly. (You can check how Scott coded the kernel function do_setlink in his recent kernel patch "Add netlink support for virtual port" for a similar example)
Probably a typo from Arnd. I looked in rtnetlink.c. Thanks. Stefan
/Christian
participants (3)
-
Arnd Bergmann
-
Christian Benvenuti (benve)
-
Stefan Berger